In [3]:
#TF-IDF, which stands for Term Frequency-Inverse Document Frequency
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: four short documents that share most of their words.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]



In [5]:
# Learn the vocabulary and IDF weights from the corpus and encode it as a TF-IDF matrix.
v = TfidfVectorizer()
transfered_output = v.fit_transform(corpus)
print(v.vocabulary_)

# fit() → learns the vocabulary (the unique words in your corpus) and computes the IDF values.

# transform() → creates a sparse matrix with the TF-IDF values for each document.

# ✅ v.vocabulary_
# This is a dictionary that maps each word (feature) to the column index in the TF-IDF matrix.

# 'this' appears in column 8 of the matrix

# 'document' appears in column 1 (column 0 is 'and')

# 'first' appears in column 2

{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}


In [None]:
# Exploratory: list every attribute and method name available on the vectorizer object.
dir(v)

In [8]:
all_feature_names = v.get_feature_names_out()

# Feature names are returned in column order and v.idf_ is indexed by column,
# so the two sequences can be walked side by side.
for word, idf_value in zip(all_feature_names, v.idf_):
    print(f"{word}: {idf_value}")

# How this cell works:
# 1. v.get_feature_names_out()
#    → Returns all unique words (features) learned from the corpus, in alphabetical order.
# 2. v.idf_
#    → Holds one IDF (Inverse Document Frequency) value per column of the TF-IDF matrix.
# 3. zip(all_feature_names, v.idf_)
#    → Pairs each word with the IDF value of its own column
#      (the same column index that v.vocabulary_[word] maps to).
# 4. print(f"{word}: {idf_value}")
#    → Prints each word along with its IDF value (how rare the word is across all documents).

# Purpose
# This cell shows how important or rare each word is (via its IDF score) in the document collection.
# High IDF → word is rare across documents → more useful for distinguishing documents.
# Low IDF → word is common → less useful for distinguishing documents.

and: 1.916290731874155
document: 1.2231435513142097
first: 1.5108256237659907
is: 1.0
one: 1.916290731874155
second: 1.916290731874155
the: 1.0
third: 1.916290731874155
this: 1.0


In [9]:
# First two documents, for comparison with their TF-IDF rows shown further down.
corpus[:2]

['this is the first document', 'this document is the second document']

In [10]:
# Bare expression: shows the repr of the matrix returned by fit_transform (type, dtype, shape).
transfered_output

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 21 stored elements and shape (4, 9)>

In [11]:
# Dense TF-IDF rows for the first two documents; slice first so only those rows are densified.
transfered_output[:2].toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867]])

In [20]:
import pandas as pd
# Load the e-commerce product descriptions and their category labels.
df = pd.read_csv("Ecommerce_Data.csv")

# Print the shape first: only the LAST expression of a cell is rendered, so
# df.head() must come last or its result is silently discarded.
print(df.shape)
df.head()

(24000, 2)


In [22]:
# Number of rows per category, to check how balanced the classes are.
df["label"].value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

In [25]:
# Encode each category name as an integer class id in a new `label_num` column.
df["label_num"] = df["label"].map(
    {"Household": 0, "Books": 1, "Electronics": 2, "Clothing & Accessories": 3}
)
df.head()

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3


In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify on the class id so every
# category keeps the same share in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    df.Text,
    df.label_num,
    test_size=0.2,
    random_state=2022,
    stratify=df.label_num,
)

In [29]:
# Sizes of the training and test splits, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(19200,)
(4800,)


In [30]:
# Class counts in the training labels; the split above was stratified on label_num.
y_train.value_counts()

label_num
0    4800
2    4800
3    4800
1    4800
Name: count, dtype: int64

In [33]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Two-stage pipeline: raw text → TF-IDF features → k-nearest-neighbours classifier.
clf = Pipeline(
    steps=[
        ("vectorizer_tfidf", TfidfVectorizer()),
        ("KNN", KNeighborsClassifier()),
    ]
)

# fit() returns the fitted pipeline, so training and prediction can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred))

# ✅ TfidfVectorizer() → converts your text data into numeric TF-IDF features.
# ✅ KNeighborsClassifier() → applies K-NN classification on these features.
# ✅ Pipeline → chains them together so when you call fit() or predict(), it automatically transforms text → TF-IDF → predicts.
# ✅ classification_report → gives you precision, recall, f1-score for each class

              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1200
           1       0.97      0.95      0.96      1200
           2       0.97      0.97      0.97      1200
           3       0.97      0.98      0.97      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [34]:
# First four test descriptions (original row label → text), to compare with y_pred[:4].
X_test[:4]

20706    Lal Haveli Designer Handmade Patchwork Decorat...
19166    GOTOTOP Classical Retro Cotton & PU Leather Ne...
15209    FabSeasons Camouflage Polyester Multi Function...
2462     Indian Superfoods: Change the Way You Eat Revi...
Name: Text, dtype: object

In [36]:
# Full text of the row labelled 19166 among the first four test samples.
X_test.iloc[:4].loc[19166]  # Electronics

'GOTOTOP Classical Retro Cotton & PU Leather Neck Shoulder Strap Anti-Slip for SLR DSLR Cameras (Charcoal Grey) Colour:Charcoal Grey   Specifications: Material: Cotton + PU Leather  Color: Charcoal Grey, Brown (as show in the pictures)  Weight: approx. 40g  Main Belt Length: approx. 70cm/27.55"  Width: approx. 3.5cm/1.37"  Fit For: All kind of brand SLR cameras, Part of micro single cameras  Package Included: 1 x camera shoulder neck strap (The camera is not included)'

In [35]:
# Predicted class ids for the first four test samples (same order as X_test[:4]).
y_pred[:4]

array([0, 2, 3, 1])

Use text pre-processing to remove stop words and punctuation, and to apply lemmatization

In [38]:
### utility function for pre-processing the text
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    """Return `text` with stop words and punctuation removed and the rest lemmatized.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens flagged
    as stop words or punctuation are dropped; the lemmas of the remaining tokens
    are joined with single spaces.
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(lemmas)

In [None]:

# Clean every product description with preprocess() and keep the result in a new column.
# NOTE: this calls the spaCy pipeline once per row, so it may take a while on the full frame.
df["preprocessed_txt"] = df["Text"].map(preprocess)
df.head()

In [None]:
# Original description of the row labelled 0.
df.loc[0, "Text"]

In [None]:
# Pre-processed version of the same row, for comparison with the original text.
df.loc[0, "preprocessed_txt"]

In [None]:
# Re-split using the pre-processed text as the feature column.
# This rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    df["preprocessed_txt"],
    df["label_num"],
    stratify=df["label_num"],
    test_size=0.2,  # 20% of the samples go to the test set
    random_state=2022,
)