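##### TF-IDF Vectorizer = Term Frequency - Inverse Document Frequency Vectorizer:
##### IDF = log[(Total number of docs) / (Number of docs in which term 't' appears)]
##### TF = (Number of times term 't' appears in a doc) / (Total number of terms/tokens in the doc)
##### TF-IDF = TF * IDF
##### Note: scikit-learn's `TfidfVectorizer` defaults differ slightly from the textbook formulas above: TF is the raw term count, IDF is smoothed as ln[(1 + N) / (1 + df(t))] + 1 (N = number of docs, df(t) = docs containing 't'), and each document vector is L2-normalized. This is why a term that occurs in 1 of the 8 docs below gets IDF = ln(9/2) + 1 ≈ 2.504.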

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus of eight short documents used to illustrate TF-IDF weighting.
corpus = [
    # Contains repeated terms ("eating" x2, "pizza" x3) within one document.
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    # Five documents sharing the "<company> is announcing new <product> tomorrow"
    # template, so "is", "announcing", "new" and "tomorrow" occur in many documents.
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    # Shares "eating" with the first document.
    "I am eating biryani and you are eating grapes",
    # Shortest document; shares only "is" with the others.
    "something is amazing"
]

In [14]:
# Create the vectorizer with default settings.
tf_Idf_Vec = TfidfVectorizer()

# One call does both jobs: learn the vocabulary (and IDF weights) from the
# corpus, then return the TF-IDF matrix with one row per document.
transformed_output = tf_Idf_Vec.fit_transform(corpus)

# Learned mapping of term -> column index in the TF-IDF matrix.
print(tf_Idf_Vec.vocabulary_)

{'thor': 27, 'eating': 11, 'pizza': 23, 'loki': 18, 'is': 17, 'ironman': 16, 'ate': 8, 'already': 0, 'apple': 6, 'announcing': 5, 'new': 21, 'iphone': 15, 'tomorrow': 28, 'tesla': 26, 'model': 20, 'google': 13, 'pixel': 22, 'microsoft': 19, 'surface': 25, 'amazon': 3, 'eco': 12, 'dot': 10, 'am': 1, 'biryani': 9, 'and': 4, 'you': 29, 'are': 7, 'grapes': 14, 'something': 24, 'amazing': 2}


In [15]:
# Collect every term of the fitted vocabulary; the IDF listing printed from
# this array further down comes out in alphabetical order.
all_feature_names = tf_Idf_Vec.get_feature_names_out()

In [16]:
# Print the learned IDF weight of every vocabulary term.
# `idf_` is indexed by feature column, which is the same order in which
# get_feature_names_out() lists the terms, so the two arrays can be paired
# directly instead of looking each word up again in `vocabulary_`.
for word, idf_score in zip(all_feature_names, tf_Idf_Vec.idf_):
    print(f"[{word} : {idf_score}]")

[already : 2.504077396776274]
[am : 2.504077396776274]
[amazing : 2.504077396776274]
[amazon : 2.504077396776274]
[and : 2.504077396776274]
[announcing : 1.4054651081081644]
[apple : 2.504077396776274]
[are : 2.504077396776274]
[ate : 2.504077396776274]
[biryani : 2.504077396776274]
[dot : 2.504077396776274]
[eating : 2.09861228866811]
[eco : 2.504077396776274]
[google : 2.504077396776274]
[grapes : 2.504077396776274]
[iphone : 2.504077396776274]
[ironman : 2.504077396776274]
[is : 1.1177830356563834]
[loki : 2.504077396776274]
[microsoft : 2.504077396776274]
[model : 2.504077396776274]
[new : 1.4054651081081644]
[pixel : 2.504077396776274]
[pizza : 2.504077396776274]
[something : 2.504077396776274]
[surface : 2.504077396776274]
[tesla : 2.504077396776274]
[thor : 2.504077396776274]
[tomorrow : 1.4054651081081644]
[you : 2.504077396776274]


In [17]:
# Show the first two documents so their TF-IDF vectors can be read against the text.
corpus[:2]

['Thor eating pizza, Loki is eating pizza, Ironman ate pizza already',
 'Apple is announcing new iphone tomorrow']

In [18]:
# TF-IDF vectors for the two strings shown above.
# Slice the sparse matrix first so that only these two rows are converted to
# a dense array, rather than densifying every document and discarding the rest.
transformed_output[:2].toarray()

array([[0.24247317, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.24247317, 0.        ,
        0.        , 0.40642288, 0.        , 0.        , 0.        ,
        0.        , 0.24247317, 0.10823643, 0.24247317, 0.        ,
        0.        , 0.        , 0.        , 0.7274195 , 0.        ,
        0.        , 0.        , 0.24247317, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.31652498, 0.5639436 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5639436 , 0.        , 0.25173606, 0.        , 0.        ,
        0.        , 0.31652498, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.31652498, 0.        ]])

#### Ecommerce Product Label/Category Classification Problem

In [2]:
import pandas as pd

# Load the e-commerce product descriptions and their category labels.
ECOMMERCE_CSV = "Ecommerce_data.csv"
df = pd.read_csv(ECOMMERCE_CSV)

# (rows, columns) of the loaded frame.
df.shape

(24000, 2)

In [3]:
# Preview the first rows: product text and its category label.
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [4]:
# Rows per category, to check whether the classes are imbalanced.
df["label"].value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

In [5]:
# Encode each category name as an integer class id for the classifiers.
target = {"Household": 0, "Electronics": 1, "Clothing & Accessories": 2, "Books": 3}

df["label_num"] = df["label"].map(target)

# Series.map leaves NaN for any label that is missing from `target`; fail
# loudly here instead of letting NaN class ids reach the train/test split.
unmapped_labels = set(df.loc[df["label_num"].isna(), "label"])
assert not unmapped_labels, f"labels missing from target mapping: {unmapped_labels}"

In [6]:
# Confirm the new label_num column lines up with the label column.
df.head()

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2


In [8]:
# Hold out 20% of the rows as a test set.
from sklearn.model_selection import train_test_split

# Stratifying on the class id keeps the category proportions the same in both
# splits; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["Text"],        # independent variable: raw product text
    df["label_num"],   # dependent variable: integer class id
    test_size=0.2,
    random_state=2022,
    stratify=df["label_num"],
)

In [10]:
# Sizes of the training and test splits, one per line (train first).
print(X_train.shape, X_test.shape, sep="\n")

(19200,)
(4800,)


In [11]:
# Class distribution of the training labels.
y_train.value_counts()

label_num
0    4800
2    4800
3    4800
1    4800
Name: count, dtype: int64

In [12]:
# Class distribution of the test labels.
y_test.value_counts()

label_num
0    1200
2    1200
3    1200
1    1200
Name: count, dtype: int64

In [19]:
# Model 1: TF-IDF features fed into a k-nearest-neighbours classifier.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Both steps live in one Pipeline, so the vectorizer is fitted on the
# training text only and then reused unchanged on the test text.
knn_classifier = Pipeline(steps=[
    ('vectorizer_tf_idf', TfidfVectorizer()),
    ('KNN_clf', KNeighborsClassifier()),
])

# Train on the training split, then predict class ids for the test split.
y_pred = knn_classifier.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1200
           1       0.96      0.97      0.97      1200
           2       0.97      0.98      0.98      1200
           3       0.98      0.95      0.96      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [25]:
# First five test documents.
X_test.head(5)

20706    Lal Haveli Designer Handmade Patchwork Decorat...
19008    tirupur fashion biz Girls and Kids Solid Cotto...
14810    Modern Linguistics: An Introduction About The ...
2451     AmazonBasics Apple Certified 30-Pin to USB Cab...
6296     The Marine Corps Martial Arts Program: The Com...
Name: Text, dtype: object

##### "Household" : 0, "Electronics" : 1 , "Clothing & Accessories" : 2, "Books" : 3

In [22]:
# True class ids of the first five test documents.
y_test.head(5)

20706    0
19008    2
14810    3
2451     1
6296     3
Name: label_num, dtype: int64

In [23]:
# First five predicted class ids, for comparison with the true labels in y_test.
y_pred[:5]

array([0, 2, 3, 1, 3], dtype=int64)

In [26]:
# Model 2: TF-IDF features fed into a multinomial Naive Bayes classifier.
from sklearn.naive_bayes import MultinomialNB

nb_classifier = Pipeline(steps=[
    ('vectorizer_tf_idf', TfidfVectorizer()),
    ('MNB_clf', MultinomialNB()),
])

# Fit the vectorizer and the classifier on the training split.
nb_classifier.fit(X_train, y_train)

# Predict class ids for the held-out test split.
y_pred = nb_classifier.predict(X_test)

# Per-class precision / recall / F1 on the test split.
nb_report = classification_report(y_test, y_pred)
print(nb_report)

              precision    recall  f1-score   support

           0       0.93      0.96      0.95      1200
           1       0.96      0.96      0.96      1200
           2       0.97      0.98      0.98      1200
           3       0.98      0.93      0.95      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [27]:
#Using Random Forest Classifier: (The Best)
from sklearn.ensemble import RandomForestClassifier

rf_classifier = Pipeline([
    ('vectorizer_tf_idf', TfidfVectorizer()),
    # A random forest is built with randomness (bootstrap samples and feature
    # subsets), so it is seeded here - with the same value the train/test
    # split uses - to make the reported scores repeatable across re-runs.
    ('RF_clf', RandomForestClassifier(random_state=2022))
])

#Train: fits the vectorizer and the forest on the training split only
rf_classifier.fit(X_train, y_train)

#Predict class ids for the held-out test split
y_pred = rf_classifier.predict(X_test)

#Performance Eval: per-class precision / recall / F1 on the test split
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1200
           1       0.97      0.97      0.97      1200
           2       0.98      0.99      0.98      1200
           3       0.98      0.97      0.97      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800

