<a href="https://colab.research.google.com/github/abhigyan2003/NLP/blob/main/feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Use the following dataset: [IMDB Dataset of 50K Movie Reviews](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)

In [5]:
# Toy corpus: five short sentences, one document per list element.
# Kept tiny on purpose so every feature matrix below can be checked by eye.
corpus = [
    "I like cats", "You like dogs",
    "Cats are nice", "Dogs are friendly",
    "I have a dog",
]


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd


In [7]:
# Default CountVectorizer settings; the vocabulary is learned from `corpus` during the fit below.
vectorizer = CountVectorizer()

# Learn the vocabulary and build the document-term count matrix in a single pass.
X = vectorizer.fit_transform(raw_documents=corpus)


In [8]:
# Heading, a blank line, then the learned term -> column-index mapping.
print("Vocabulary:\n", vectorizer.vocabulary_, sep="\n")


Vocabulary:

{'like': 6, 'cats': 1, 'you': 8, 'dogs': 3, 'are': 0, 'nice': 7, 'friendly': 4, 'have': 5, 'dog': 2}


In [9]:
# Column labels for the BoW matrix: the terms learned by the vectorizer
feature_names = vectorizer.get_feature_names_out()

# Densify the count matrix (fine for a five-sentence corpus) and label its columns
bow_df = pd.DataFrame(X.toarray(), columns=feature_names)

# Put the original sentence in the first column so each row can be read against its text
bow_df.insert(0, "Sentence", corpus)

# End the cell with the frame itself: the notebook renders it as a rich table,
# matching the later cells, instead of the plain-text dump that print() produces.
bow_df


            Sentence  are  cats  dog  dogs  friendly  have  like  nice  you
0        I like cats    0     1    0     0         0     0     1     0    0
1      You like dogs    0     0    0     1         0     0     1     0    1
2      Cats are nice    1     1    0     0         0     0     0     1    0
3  Dogs are friendly    1     0    0     1         1     0     0     0    0
4       I have a dog    0     0    1     0         0     1     0     0    0


In [10]:
# Replace the unigram vectorizer with one that emits word trigrams only.
vectorizer = CountVectorizer(
    ngram_range=(3, 3),  # (min_n, max_n): both bounds are 3
)

In [11]:
# Learn the n-gram vocabulary from the corpus and encode every sentence
# against it, producing the document-term count matrix.
X = vectorizer.fit_transform(raw_documents=corpus)


In [12]:
# Show the vocabulary learned (n-gram terms with their column index).
# The heading reads the range from the vectorizer itself, so it cannot drift
# out of sync with the ngram_range the vectorizer was actually built with.
print(f"Vocabulary (ngram_range={vectorizer.ngram_range}):\n")
print(vectorizer.vocabulary_)


Vocabulary (unigrams + bigrams):

{'you like dogs': 2, 'cats are nice': 0, 'dogs are friendly': 1}


In [13]:
from IPython.display import display

# Convert the matrix to a readable DataFrame, one column per learned n-gram
feature_names = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(X.toarray(), columns=feature_names)

# Insert the original sentence as the first column
bow_df.insert(0, "Sentence", corpus)

# Display the final DataFrame. The heading reports the vectorizer's real
# ngram_range rather than a hard-coded description that can go stale.
print(f"\nBag-of-Words (ngram_range={vectorizer.ngram_range}):\n")
display(bow_df)




Bag-of-Words (Bigrams Only):



Unnamed: 0,Sentence,cats are nice,dogs are friendly,you like dogs
0,I like cats,0,0,0
1,You like dogs,0,0,1
2,Cats are nice,1,0,0
3,Dogs are friendly,0,1,0
4,I have a dog,0,0,0


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from IPython.display import display

# Lift pandas' display limits so wide n-gram tables are shown in full:
# None means "no cap" for both the column count and the cell text width.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None


In [15]:
# Re-fit whatever vectorizer is currently bound to `vectorizer`; this section
# does not create a new one, so its ngram_range comes from an earlier cell.
X = vectorizer.fit_transform(raw_documents=corpus)


In [16]:
# The vectorizer in scope here is not necessarily a bigram one, so the heading
# reports its actual ngram_range instead of a hard-coded "Bigram" label.
print(f"N-gram vocabulary (ngram_range={vectorizer.ngram_range}):")
print(vectorizer.vocabulary_)


Bigram Vocabulary:
{'you like dogs': 2, 'cats are nice': 0, 'dogs are friendly': 1}


In [17]:
# Rebuild the labelled count table from the most recent fit.
feature_names = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=X.toarray(), columns=feature_names)

# Original sentence goes first so each row of counts can be read against its text.
bow_df.insert(loc=0, column="Sentence", value=corpus)

display(bow_df)


Unnamed: 0,Sentence,cats are nice,dogs are friendly,you like dogs
0,I like cats,0,0,0
1,You like dogs,0,0,1
2,Cats are nice,1,0,0
3,Dogs are friendly,0,1,0
4,I have a dog,0,0,0


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Switch from raw counts to TF-IDF weights, restricted to word bigrams.
vectorizer = TfidfVectorizer(
    ngram_range=(2, 2),  # (min_n, max_n): both bounds are 2
)


In [26]:
# Learn the bigram vocabulary and compute the TF-IDF document-term matrix in one step.
X_tfidf = vectorizer.fit_transform(raw_documents=corpus)


In [27]:
# Heading, a blank line, then the bigram -> column-index mapping.
print("TF-IDF Bigram Vocabulary:\n", vectorizer.vocabulary_, sep="\n")


TF-IDF Bigram Vocabulary:

{'like cats': 5, 'you like': 7, 'like dogs': 6, 'cats are': 2, 'are nice': 1, 'dogs are': 3, 'are friendly': 0, 'have dog': 4}


In [28]:
# Bigram labels that will head the TF-IDF columns
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the TF-IDF matrix wrapped in a DataFrame with labelled columns
tfidf_df = pd.DataFrame(data=X_tfidf.toarray(), columns=feature_names)

# Leading "Sentence" column ties every row of weights back to its source text
tfidf_df.insert(loc=0, column="Sentence", value=corpus)

# Render the finished table
display(tfidf_df)


Unnamed: 0,Sentence,are friendly,are nice,cats are,dogs are,have dog,like cats,like dogs,you like
0,I like cats,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,You like dogs,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.707107
2,Cats are nice,0.0,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0
3,Dogs are friendly,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0,0.0
4,I have a dog,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
