We will use TF-IDF approach with Sklearn to automatically classify comments from an e-commerce site based on 4 categories.

Dataset: https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification

In [34]:
import pandas as pd

df = pd.read_csv("ecommerce.csv")
df.head()

Unnamed: 0,Label,Text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [35]:
# Check if there is a huge unbalance
df.Label.value_counts()

Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: Label, dtype: int64

In [86]:
# Transform those categorical label into numbers and put it into a new column
df['Label_num'] = df.Label.map({
    'Household':int(0),
    'Books ':int(1),
    'Electronics':int(2),
    'Clothing & Accessories':int(3),
})

In [87]:
df.head()

Unnamed: 0,Label,Text,Label_num
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0


In [88]:
#Search and drop null values

df['Label_num'].isnull().values.any()

False

In [89]:
df.dropna(inplace=True)

In [90]:
df['Label_num'].isnull().values.any()

False

In [91]:
df.head()

Unnamed: 0,Label,Text,Label_num
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0


## X and y

In [92]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.Text,df.Label_num, 
    test_size=0.25, 
    random_state =2000, 
    stratify = df.Label_num
)

In [93]:
X_train_shape, X_test_shape = X_train.shape,X_test.shape
X_train_shape, X_test_shape

((28953,), (9651,))

In [94]:
y_train.value_counts()

0    14485
2     7966
3     6502
Name: Label_num, dtype: int64

# Building the model

In [95]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [96]:
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('KNN', KNeighborsClassifier())         
])

# In the first stage we will have TfidfVectorizer then KNeighborsClassifier

In [97]:
clf.fit(X_train,y_train) #Train the model
y_pred = clf.predict(X_test) #Save the prediction to compare it with the truth

In [98]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      4828
           2       0.97      0.96      0.96      2655
           3       0.97      0.97      0.97      2168

    accuracy                           0.97      9651
   macro avg       0.97      0.97      0.97      9651
weighted avg       0.97      0.97      0.97      9651



In [99]:
X_test[:5]

  X_test[:5]


11576    AmazonBasics Premium Stainless Steel Knife Set...
921      Chilifry Wooden & Iron Rocking Chair (Multi-co...
43134    Sony SRS-XB10 EXTRA BASS Portable Splash-proof...
5241     Paradigm Pictures Feng Shui Items for Home Dec...
14376    Shomy 8mm to 6mm : 5Pcs Pneumatic Fittings Pus...
Name: Text, dtype: object

In [104]:
y_test[:5]

  y_test[:5]


11576    0
921      0
43134    2
5241     0
14376    0
Name: Label_num, dtype: int64

In [106]:
y_pred[:5]
# Got yhem all right

array([0, 0, 2, 0, 0], dtype=int64)

In [107]:
# Let's explore the 3rd one
X_test[:5][43134] # Electronics

  X_test[:5][43134]


'Sony SRS-XB10 EXTRA BASS Portable Splash-proof Wireless Speaker with Bluetooth and NFC (Black) Colour:Black   Wherever you go, take some big beats with you. The SRS-XB10 is easy to carry and brings out the bass in your music. Its splash-proof features let you to enjoy your music anywhere indoor/outdoor. You can match it to your style quotient by choosing from 4 attractive colour options. The 16 Hours battery time lets you enjoy your favorite music for long hours.For any query please contact_us on: [ 1800 103 7799 ]'

## Let's try another classifyer

In [108]:
from sklearn.naive_bayes import MultinomialNB

clf2 = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('NB', MultinomialNB())         
])

In [109]:
clf2.fit(X_train,y_train)

In [113]:
y_pred2 = clf2.predict(X_test)
print(classification_report(y_test,y_pred2))

#less than knn

              precision    recall  f1-score   support

           0       0.94      0.98      0.96      4828
           2       0.97      0.93      0.95      2655
           3       0.99      0.94      0.96      2168

    accuracy                           0.96      9651
   macro avg       0.97      0.95      0.96      9651
weighted avg       0.96      0.96      0.96      9651



## Let's try RFC

In [114]:
from sklearn.ensemble import RandomForestClassifier

clf3 = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('RF', RandomForestClassifier())         
])

In [115]:
clf3.fit(X_train,y_train)

In [116]:
y_pred3 = clf3.predict(X_test)
print(classification_report(y_test,y_pred3))

#The best

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      4828
           2       0.99      0.95      0.97      2655
           3       0.98      0.98      0.98      2168

    accuracy                           0.98      9651
   macro avg       0.98      0.97      0.98      9651
weighted avg       0.98      0.98      0.98      9651



# Preprocessing and fit again the model

Now we will use the same data and do preprocessing, like removing stop word.

In [118]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [122]:
# Creating a function for preprocessing

def preprocess(Text):
    # remove stop words and lemmatize the text
    doc = nlp(Text)
    filtread_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct: # you just ignore it
            continue
        filtread_tokens.append(token.lemma_) # if it's not you do lemmatization
    return" ".join(filtread_tokens)

In [123]:
df['preprocessed_txt'] = df['Text'].apply(preprocess) # You need to wait ":)"

In [124]:
df.head()
#Now we have a new column containing the filtred text

Unnamed: 0,Label,Text,Label_num,preprocessed_txt
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0,paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0,SAF Floral Framed Painting Wood 30 inch x 10 i...
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0,SAF uv texture Modern Art Print Framed Paintin...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0,SAF Flower Print Framed Painting Synthetic 13....
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0,incredible gift India Wooden Happy Birthday Un...


Let's see the difference

In [125]:
df.Text[0]

'Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and some for eternal blis

In [126]:
df.preprocessed_txt[0]

'paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints 8.7 x 8.7 inch Set 4 painting synthetic frame uv textured print give multi effect attract special series painting make wall beautiful give royal touch painting ready hang proud possess unique painting niche apart use modern efficient printing technology print ink precision epson roland hp printer innovative hd printing technique result durable spectacular look print high lifetime print solely notch 100 ink achieve brilliant true colour high level uv resistance print retain beautiful colour year add colour style live space digitally print painting pleasure eternal bliss.so bring home elegant print lushe rich color make sheer elegance friend family.it treasure forever lucky recipient liven place intriguing painting high definition hd graphic digital print home office room'

# X and y

In [127]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.preprocessed_txt,df.Label_num, 
    test_size=0.25, 
    random_state =2000, 
    stratify = df.Label_num
)

In [128]:
from sklearn.ensemble import RandomForestClassifier

clf4 = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('RF', RandomForestClassifier())         
])

clf4.fit(X_train,y_train)

In [130]:
y_pred4 = clf4.predict(X_test)
print(classification_report(y_test,y_pred4))

#98,97,99 with preprocessed txt, A little better result!

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      4828
           2       0.98      0.96      0.97      2655
           3       0.99      0.99      0.99      2168

    accuracy                           0.98      9651
   macro avg       0.98      0.98      0.98      9651
weighted avg       0.98      0.98      0.98      9651

