In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ecommerce-text-classification/ecommerceDataset.csv


This is a classification-oriented e-commerce text dataset with 4 categories - "Electronics", "Household", "Books" and "Clothing & Accessories" - which together cover almost 80% of any e-commerce website.

In [7]:
# Load the dataset. header=None tells pandas not to treat the first row as a
# header, so the two column names are supplied explicitly through `names`.
df = pd.read_csv("/kaggle/input/ecommerce-text-classification/ecommerceDataset.csv",header=None,
                 names=['label','review'])
print(df.shape)  # (number of rows, number of columns)
df.head()

(50425, 2)


Unnamed: 0,label,review
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [9]:
#searching for NAN values in our dataset
df.isna().sum()

label     0
review    1
dtype: int64

In [20]:
# Drop every record whose review text is missing.
# dropna(subset=...) removes all NaN reviews (not just the first one found) and
# is a no-op when none remain, so this cell can be re-run safely. The previous
# `.index[0]` lookup raised IndexError on a second run.
df.dropna(subset=['review'], inplace=True)
df.isna().sum()

label     0
review    0
dtype: int64

In [None]:
df['label'].value_counts()

In [21]:
#transform labels to numeric values
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['label_num'] = le.fit_transform(df['label'])
df.head()

Unnamed: 0,label,review,label_num
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,3
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",3
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,3
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",3
4,Household,Incredible Gifts India Wooden Happy Birthday U...,3


In [38]:
# Because the dataset is large, keep 3000 records per label to train and test the models.
# Note: groupby('label').head(3000) takes the first 3000 rows of each label in
# their existing order; it is not a random sample.
df_sample = df.groupby('label').head(3000)
df_sample

Unnamed: 0,label,review,label_num
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,3
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",3
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,3
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",3
4,Household,Incredible Gifts India Wooden Happy Birthday U...,3
...,...,...,...
42799,Electronics,Enter Standard USB Keyboard E-KB503U SPECIFICA...,2
42800,Electronics,The Clownfish Vegan 8 Liter Laptop Briefcase (...,2
42801,Electronics,Da Milano Leather Black Laptop Bag Since incep...,2
42802,Electronics,Killer Derby 38L Large Laptop Backpack With 2 ...,2


In [22]:
# Map each class name to its position in le.classes_, i.e. the integer code
# stored in the label_num column.
classes_dict = {name: code for code, name in enumerate(le.classes_)}
classes_dict

{'Books': 0, 'Clothing & Accessories': 1, 'Electronics': 2, 'Household': 3}

In [40]:
from sklearn.model_selection import train_test_split

# Split the sampled dataset into train and test sets (20% held out for testing);
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(df_sample['review'], df_sample['label_num'], test_size=0.2,random_state=26)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((9600,), (9600,), (2400,), (2400,))

In [31]:
import spacy
from spacy.lang.en import STOP_WORDS
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [32]:
#define a function for filter stop words and punctuations and extract lemma from the texts
nlp = spacy.load("en_core_web_sm")

def preprocesser(text_array):
    """Turn raw texts into space-joined, lower-cased lemmas.

    Every text is run through the module-level spaCy pipeline ``nlp`` with the
    parser and NER components disabled. Stop-word and punctuation tokens are
    skipped; each remaining token contributes its lemma, lower-cased and
    stripped of surrounding whitespace, unless that leaves an empty string.

    Parameters
    ----------
    text_array : iterable of str
        The texts to clean (for example a pandas Series of reviews).

    Returns
    -------
    list of str
        One cleaned string per input text, in the order the documents are
        yielded by ``nlp.pipe``.
    """
    # n_process=-1 asks spaCy to process the texts in parallel on all available
    # CPU cores. The generator is consumed lazily so that the Doc objects do not
    # all have to be held in memory at the same time.
    docs = nlp.pipe(text_array, disable=["parser", "ner"], n_process=-1)

    preprocessed_texts = []
    for doc in docs:
        lemmas = (
            token.lemma_.lower().strip()
            for token in doc
            if not (token.is_stop or token.is_punct)
        )
        # Whitespace-only tokens end up as empty lemmas; leave them out.
        preprocessed_texts.append(" ".join(lemma for lemma in lemmas if lemma))
    return preprocessed_texts


In [34]:
text_processer = FunctionTransformer(preprocesser)

I'm going to train 4 models by combining two vectorizers (TfidfVectorizer and CountVectorizer) with two classifiers (MultinomialNB and RandomForestClassifier).

In [42]:
# Model 1: text preprocessing -> TF-IDF features -> Multinomial Naive Bayes.
# text_processer is a pipeline step, so the raw texts are passed through it
# again on every fit and predict call.
cls1 = make_pipeline(
    text_processer,
    TfidfVectorizer(),
    MultinomialNB()
)

cls1.fit(X_train,y_train)

In [43]:
print(classification_report(y_test,cls1.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.92      0.95       619
           1       0.97      0.99      0.98       595
           2       0.97      0.97      0.97       600
           3       0.93      0.97      0.95       586

    accuracy                           0.96      2400
   macro avg       0.96      0.96      0.96      2400
weighted avg       0.96      0.96      0.96      2400



In [44]:
# Model 2: text preprocessing -> raw token counts -> Multinomial Naive Bayes.
# text_processer is a pipeline step, so the raw texts are passed through it
# again on every fit and predict call.
cls2 = make_pipeline(
    text_processer,
    CountVectorizer(),
    MultinomialNB()
)

cls2.fit(X_train,y_train)

In [45]:
print(classification_report(y_test,cls2.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.93      0.95       619
           1       0.96      0.99      0.98       595
           2       0.97      0.97      0.97       600
           3       0.94      0.97      0.95       586

    accuracy                           0.96      2400
   macro avg       0.96      0.96      0.96      2400
weighted avg       0.96      0.96      0.96      2400



In [46]:
# Model 3: text preprocessing -> TF-IDF features -> random forest.
# A random forest is stochastic, so random_state is fixed (to the same seed as
# the train/test split) to make the reported scores reproducible across runs.
cls3 = make_pipeline(
    text_processer,
    TfidfVectorizer(),
    RandomForestClassifier(random_state=26)
)

cls3.fit(X_train,y_train)

In [47]:
print(classification_report(y_test,cls3.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       619
           1       0.99      1.00      0.99       595
           2       0.98      0.98      0.98       600
           3       0.97      0.97      0.97       586

    accuracy                           0.98      2400
   macro avg       0.98      0.98      0.98      2400
weighted avg       0.98      0.98      0.98      2400



In [49]:
# Model 4: text preprocessing -> raw token counts -> random forest.
# A random forest is stochastic, so random_state is fixed (to the same seed as
# the train/test split) to make the reported scores reproducible across runs.
cls4 = make_pipeline(
    text_processer,
    CountVectorizer(),
    RandomForestClassifier(random_state=26)
)

cls4.fit(X_train,y_train)

In [50]:
print(classification_report(y_test,cls4.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97       619
           1       0.99      1.00      0.99       595
           2       0.98      0.98      0.98       600
           3       0.97      0.97      0.97       586

    accuracy                           0.98      2400
   macro avg       0.98      0.98      0.98      2400
weighted avg       0.98      0.98      0.98      2400



We can see that RandomForestClassifier gives better results (0.98 accuracy versus 0.96 for MultinomialNB), and there is no significant difference between CountVectorizer and TfidfVectorizer.