In [1]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

### What is TF-IDF?
- TF stands for Term Frequency: the ratio of the number of times a particular word appears in a document to the total number of words in that document.

   Term Frequency(TF) = [number of times word appeared / total no of words in a document]
- Term Frequency values range between 0 and 1. The more often a word occurs in a document, the closer its value is to 1.

- IDF stands for Inverse Document Frequency: the log of the ratio of the total number of documents (data points) in the whole dataset to the number of documents that contain the particular word.

  Inverse Document Frequency(IDF) = [log(Total number of documents / number of documents that contains the word)]
- If a word occurs in many documents (i.e. it is common across the whole corpus), the ratio approaches 1 and its IDF value approaches 0.
- Finally:

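   TF-IDF = Term Frequency(TF) * Inverse Document Frequency(IDF)

- Note: scikit-learn's `TfidfVectorizer` (used below) applies a smoothed variant by default, IDF = ln((1 + N) / (1 + df)) + 1, and L2-normalizes each document vector, so the numbers printed below differ from the plain formula above.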

In [2]:
# Toy corpus of seven short sentences used to demonstrate TF-IDF below.
# Some words repeat across sentences ("is", "announcing", "new", "tomorrow"),
# while others appear in only one sentence.
corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]

In [3]:
# Fit a TF-IDF vectorizer with default settings on the toy corpus;
# fit_transform learns the vocabulary and returns the document-term matrix.
v = TfidfVectorizer()
# NOTE(review): "transormed_output" is a misspelling of "transformed_output";
# the name is kept because later cells reference it.
transormed_output = v.fit_transform(corpus)
# vocabulary_ maps each learned term to its column index in the matrix.
print(v.vocabulary_)

{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 17, 'is': 16, 'ironman': 15, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 20, 'iphone': 14, 'tomorrow': 26, 'tesla': 24, 'model': 19, 'google': 12, 'pixel': 21, 'microsoft': 18, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 27, 'are': 6, 'grapes': 13}


In [4]:
print(transormed_output)

  (0, 0)	0.2426654728284301
  (0, 7)	0.2426654728284301
  (0, 15)	0.2426654728284301
  (0, 16)	0.11527032701364152
  (0, 17)	0.2426654728284301
  (0, 22)	0.7279964184852903
  (0, 10)	0.40286636477562926
  (0, 25)	0.2426654728284301
  (1, 26)	0.30652086071532464
  (1, 14)	0.5680354003049032
  (1, 20)	0.30652086071532464
  (1, 4)	0.30652086071532464
  (1, 5)	0.5680354003049032
  (1, 16)	0.26982671076064085
  (2, 19)	0.5680354003049032
  (2, 24)	0.5680354003049032
  (2, 26)	0.30652086071532464
  (2, 20)	0.30652086071532464
  (2, 4)	0.30652086071532464
  (2, 16)	0.26982671076064085
  (3, 21)	0.5680354003049032
  (3, 12)	0.5680354003049032
  (3, 26)	0.30652086071532464
  (3, 20)	0.30652086071532464
  (3, 4)	0.30652086071532464
  (3, 16)	0.26982671076064085
  (4, 23)	0.5680354003049032
  (4, 18)	0.5680354003049032
  (4, 26)	0.30652086071532464
  (4, 20)	0.30652086071532464
  (4, 4)	0.30652086071532464
  (4, 16)	0.26982671076064085
  (5, 9)	0.4939131624859277
  (5, 11)	0.4939131624859277
  (5

In [5]:
# get_feature_names_out() returns the terms ordered by column index, and idf_
# is ordered the same way, so the two arrays can be paired directly instead of
# looking every term up again in v.vocabulary_.
all_features_names = v.get_feature_names_out()

for word, idf in zip(all_features_names, v.idf_):
    print(f"{word} {idf}")

already 2.386294361119891
am 2.386294361119891
amazon 2.386294361119891
and 2.386294361119891
announcing 1.2876820724517808
apple 2.386294361119891
are 2.386294361119891
ate 2.386294361119891
biryani 2.386294361119891
dot 2.386294361119891
eating 1.9808292530117262
eco 2.386294361119891
google 2.386294361119891
grapes 2.386294361119891
iphone 2.386294361119891
ironman 2.386294361119891
is 1.1335313926245225
loki 2.386294361119891
microsoft 2.386294361119891
model 2.386294361119891
new 1.2876820724517808
pixel 2.386294361119891
pizza 2.386294361119891
surface 2.386294361119891
tesla 2.386294361119891
thor 2.386294361119891
tomorrow 1.2876820724517808
you 2.386294361119891


In [6]:
corpus[:2]

['Thor eating pizza, Loki is eating pizza, Ironman ate pizza already',
 'Apple is announcing new iphone tomorrow']

In [7]:
transormed_output.toarray()[:2]

array([[0.24266547, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.24266547, 0.        , 0.        ,
        0.40286636, 0.        , 0.        , 0.        , 0.        ,
        0.24266547, 0.11527033, 0.24266547, 0.        , 0.        ,
        0.        , 0.        , 0.72799642, 0.        , 0.        ,
        0.24266547, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.30652086,
        0.5680354 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.5680354 ,
        0.        , 0.26982671, 0.        , 0.        , 0.        ,
        0.30652086, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.30652086, 0.        ]])

####  Problem Statement: Given a description of a product sold on an e-commerce website, classify it into one of 4 categories


- <h4>Text:</h4> Description of an item sold on e-commerce website
- <h4>Label:</h4> Category of that item. There are 4 categories in total: "Electronics", "Household", "Books" and "Clothing & Accessories", which together cover almost 80% of any e-commerce website.

In [8]:
# Load the product descriptions and their category labels.
# The CSV path is relative, so the file must sit in the notebook's working directory.
df = pd.read_csv("Ecommerce_data.csv")
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [9]:
df.shape

(24000, 2)

In [10]:
df.label.value_counts()  #The data is balanced

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

In [11]:
# Encode the four product categories as integers for the classifiers below.
label_to_num = {
    'Household' : 0,
    'Electronics' : 1,
    'Clothing & Accessories' : 2,
    'Books' : 3
}
df['label_num'] = df.label.map(label_to_num)

# Series.map yields NaN for any label missing from the mapping; fail fast here
# instead of letting NaN targets reach the train/test split and the models.
assert df['label_num'].notna().all(), "df.label contains a category missing from label_to_num"

In [12]:
df.head()

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2


### Train test split


In [13]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the raw descriptions; the fixed seed keeps it reproducible.
features, target = df["Text"], df["label_num"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=2022,
    stratify=target,
)

In [14]:
X_train.shape

(19200,)

In [15]:
X_test.shape

(4800,)

In [16]:
y_train.value_counts()

label_num
0    4800
2    4800
3    4800
1    4800
Name: count, dtype: int64

In [17]:
y_test.value_counts()

label_num
0    1200
2    1200
3    1200
1    1200
Name: count, dtype: int64

### Create a classification pipeline to classify the Ecommerce Data:

#### 1- Using KNN as the classifier

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# TF-IDF features feeding a k-nearest-neighbours classifier (default settings).
knn_steps = [
    ("vectorizer_tfidf", TfidfVectorizer()),
    ("KNN", KNeighborsClassifier()),
]
clf = Pipeline(knn_steps)

# Pipeline.fit returns the fitted pipeline, so training and prediction can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1200
           1       0.96      0.97      0.97      1200
           2       0.97      0.98      0.98      1200
           3       0.98      0.95      0.96      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [19]:
X_test[:5]

20706    Lal Haveli Designer Handmade Patchwork Decorat...
19008    tirupur fashion biz Girls and Kids Solid Cotto...
14810    Modern Linguistics: An Introduction About The ...
2451     AmazonBasics Apple Certified 30-Pin to USB Cab...
6296     The Marine Corps Martial Arts Program: The Com...
Name: Text, dtype: object

In [20]:
y_test[:5]

20706    0
19008    2
14810    3
2451     1
6296     3
Name: label_num, dtype: int64

In [21]:
y_pred[:5]

array([0, 2, 3, 1, 3], dtype=int64)

#### 2- Using MultinomialNB as the classifier

In [22]:
from sklearn.naive_bayes import MultinomialNB

# Same TF-IDF front end, now followed by a multinomial Naive Bayes classifier.
clf = Pipeline(steps=[
    ("vectorizer_tfidf", TfidfVectorizer()),
    ("Multi NB", MultinomialNB()),
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.96      0.95      1200
           1       0.96      0.96      0.96      1200
           2       0.97      0.98      0.98      1200
           3       0.98      0.93      0.95      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [23]:
y_pred[:5]

array([0, 2, 3, 1, 3], dtype=int64)

#### 3- Using Random Forest as the classifier

In [24]:
from sklearn.ensemble import RandomForestClassifier

# TF-IDF features feeding a random forest. Forest training is randomized, so
# the seed is pinned (same value as the train/test split) to make the report
# below reproducible on Restart & Run All.
clf = Pipeline([
    ("vectorizer_tfidf", TfidfVectorizer()),
    ("Random Forest", RandomForestClassifier(random_state=2022))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1200
           1       0.97      0.98      0.97      1200
           2       0.98      0.98      0.98      1200
           3       0.98      0.97      0.97      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [25]:
y_pred[:5]

array([0, 2, 3, 1, 3], dtype=int64)

<h3>Use text pre-processing to remove stop words and punctuation and apply lemmatization</h3>

In [29]:
nlp = spacy.load("en_core_web_sm")

# Remove stop words and punctuation, then lemmatize the remaining tokens.
def preprocess(text) :
    """Return `text` with stop words and punctuation dropped and the rest lemmatized.

    The text is run through the module-level spaCy pipeline `nlp`; the lemmas
    of the tokens that survive the filter are joined with single spaces.
    """
    kept_lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_punct or token.is_stop)
    ]
    return " ".join(kept_lemmas)

In [30]:
# Applies preprocess() to every description, i.e. one spaCy pipeline call per row;
# on the full dataset this cell can take a while.
df['preprocess_text'] = df['Text'].apply(preprocess)

In [31]:
df.head()

Unnamed: 0,Text,label,label_num,preprocess_text
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0,Urban Ladder Eisner low Study Office Computer ...
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0,contrast live Wooden Decorative Box Painted Bo...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1,IO Crest SY PCI40010 PCI raid Host Controller ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2,ISAKAA Baby Socks bear 8 Years- Pack 4 6 8 12 ...
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2,Indira Designer Women Art Mysore Silk Saree Bl...


### Build a model with pre processed text

In [32]:
# Redo the split on the preprocessed text, with the same test size, seed and
# stratification as the split on the raw text.

X_train, X_test, y_train, y_test = train_test_split(
    df.preprocess_text,
    df.label_num,
    test_size = 0.2,
    random_state = 2022,
    stratify = df.label_num
)    

In [33]:
df.Text[0]

'Urban Ladder Eisner Low Back Study-Office Computer Chair(Black) A study in simple. The Eisner study chair has a firm foam cushion, which makes long hours at your desk comfortable. The flexible meshed back is designed for air-circulation and support when you lean back. The curved arms provide ergonomic forearm support. Adjust the height using the gas lift to find that comfortable position and the nylon castors make it easy to move around your space. Chrome legs refer to the images for dimension details any assembly required will be done by the UL team at the time of delivery indoor use only.'

In [34]:
df.preprocess_text[0]

'Urban Ladder Eisner low Study Office Computer Chair(Black study simple Eisner study chair firm foam cushion make long hour desk comfortable flexible mesh design air circulation support lean curved arm provide ergonomic forearm support adjust height gas lift find comfortable position nylon castor easy space chrome leg refer image dimension detail assembly require UL team time delivery indoor use'

Let's check the scores with our best model so far (Random Forest)

In [35]:
# Random forest on the preprocessed text. Forest training is randomized, so
# the seed is pinned (same value as the train/test split) to make the report
# below reproducible on Restart & Run All.
clf = Pipeline([
    ("vectorizer_tfidf", TfidfVectorizer()),
    ("Random Forest", RandomForestClassifier(random_state=2022))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      1200
           1       0.98      0.97      0.98      1200
           2       0.98      0.99      0.99      1200
           3       0.98      0.97      0.98      1200

    accuracy                           0.98      4800
   macro avg       0.98      0.98      0.98      4800
weighted avg       0.98      0.98      0.98      4800



### Compute the confusion matrix



In [36]:
from sklearn.metrics import confusion_matrix
# Rows are the true label_num values and columns the predicted ones (scikit-learn convention).
cm = confusion_matrix(y_test, y_pred)
cm

array([[1167,   12,   13,    8],
       [  23, 1170,    1,    6],
       [   7,    4, 1185,    4],
       [  14,   12,    6, 1168]], dtype=int64)