In [1]:
# Importing general Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string
import re

In [2]:
# nltk 
from nltk.tokenize import RegexpTokenizer
from nltk import WordNetLemmatizer, PorterStemmer

Dataset Link: [E-commerce dataset Kaggle](https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification)

In [3]:
# importing dataset
dataset = pd.read_csv('ecommerceDataset.csv')

In [4]:
# taking a look at data
dataset.head()

Unnamed: 0,Household,"Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and some for eternal bliss.so bring home this elegant print that is lushed with rich colors that makes it nothing but sheer elegance to be to your friends and family.it would be treasured forever by whoever your lucky recipient is. Liven up your place with these intriguing paintings that are high definition hd graphic digital prints for home, office or any room."
0,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
1,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
2,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
3,Household,Incredible Gifts India Wooden Happy Birthday U...
4,Household,Pitaara Box Romantic Venice Canvas Painting 6m...


In [5]:
# Setting the column headings

dataset.columns = ['label', 'text']

In [6]:
dataset.head()

Unnamed: 0,label,text
0,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
1,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
2,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
3,Household,Incredible Gifts India Wooden Happy Birthday U...
4,Household,Pitaara Box Romantic Venice Canvas Painting 6m...


In [7]:
# Getting some insights of the data
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50424 entries, 0 to 50423
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   50424 non-null  object
 1   text    50423 non-null  object
dtypes: object(2)
memory usage: 788.0+ KB


In [8]:
dataset['label'].value_counts()

Household                 19312
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: label, dtype: int64

In [9]:
# there could be a problem of class imbalance because the household products are much more compared to others.
# we will use stratified shuffle split when splitting the data.
# Also let's encode the labels.
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
dataset['label_encoded'] = labelencoder.fit_transform(dataset['label'])

In [10]:
dataset[['label','label_encoded']].value_counts()

label                   label_encoded
Household               3                19312
Books                   0                11820
Electronics             2                10621
Clothing & Accessories  1                 8671
dtype: int64

In [11]:
# storing the label and text in y and X.
X = dataset['text']
y = dataset['label_encoded']

In [12]:
# Splitting the dataset using stratified shuffle splitting
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=42)
sss.get_n_splits(X, y)

1

In [13]:
for train_index, test_index in sss.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index] 

Preprocessing functions

In [14]:
# We will write some preprocessing functions for the text column and later will apply these functions using pd.apply()

# function to convert the text to lower case

def convert_to_lowercase(text):
    return text.str.lower()

# function to remove punctuations from the text
def remove_punctuations(text):
    eng_punctuation = string.punctuation
    translator = str.maketrans('','', eng_punctuation)
    return str(text).translate(translator)

# function to remove stopwords from the text
def remove_stopwords(text):
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    return " ".join([word for word in str(text).split() if word not in stopwords])

# function to remove repeating characters
def remove_repeating_characters(text):
    return re.sub(r'(.)1+', r'1', text)

# function to remove numeric text
def remove_numeric(text):
    return re.sub('[0-9]+', '', text)

# Tokenizing the text
def tokenize_text(text):
    tokenizer = RegexpTokenizer('\w+')
    text = text.apply(tokenizer.tokenize)
    return text

# lemmatizing the text. i.e, Converting some of the words to their root form. 
def text_lematization(text):
    lm = WordNetLemmatizer()
    text = [lm.lemmatize(word) for word in text]
    return text 

In [15]:
# creating a preprocess function. this function will help in both training the data and also when we will predict a 
# single example from the trained model. i.e, inferencing.

def preprocess(text):
    text = convert_to_lowercase(text)
    text = text.apply(lambda x : remove_punctuations(x))
    text = text.apply(lambda x : remove_stopwords(x))
    text = text.apply(lambda x : remove_repeating_characters(x))
    text = text.apply(lambda x : remove_numeric(x))
    text = tokenize_text(text)
    text = text.apply(lambda x : text_lematization(x))
    text = text.apply(lambda x: " ".join(x))
    return text

In [16]:
# let's preprocess the training data
X_train = preprocess(X_train)

In [17]:
X_train

26636                         trump think like billionaire
37698    newmom maternity legging seamless tummy suppor...
22143         ugc netjrf exam solved paper psychology book
12302    masoom nx heavy base clear shot glass vodka te...
14098    nextgeek ac steam generator iron power cable p...
                               ...                        
4694     polyworm bernini wall oval mirror pwrmlargelav...
30728    feeling good new mood therapy review book read...
17953    blue heaven studio primer g makeup fixer mist ...
9608     premium quality heavy tadka panfrying panwagha...
25043    everything kid learning french book fun exerci...
Name: text, Length: 30254, dtype: object

In [18]:
# Transforming the data using tf-idf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectoriser = TfidfVectorizer(ngram_range= (1,2), max_features= 300000)
vectoriser.fit(X_train)

TfidfVectorizer(max_features=300000, ngram_range=(1, 2))

In [19]:
print(" No. of Feature words: ", len(vectoriser.get_feature_names()))

 No. of Feature words:  300000


In [20]:
# Transforming The training data using the vectoriser
X_train = vectoriser.transform(X_train)

In [21]:
# Model
# We will try twoo different models one random forest classifier and XGboost classifier

from random import Random
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

random_forest_classifier = RandomForestClassifier(n_estimators= 10, criterion= 'entropy', random_state= 42)
random_forest_classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=42)

In [22]:
# XGBoost model
xgb_boost_classifier = XGBClassifier()
xgb_boost_classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)

In [23]:
# Now we will tes the two models on the test set with accuracy_score metric.
from sklearn.metrics import accuracy_score

X_test = preprocess(X_test) # we have preprocess the test data too
X_test = vectoriser.transform(X_test) #transforming the testing data
random_forest_predictions = random_forest_classifier.predict(X_test)
xgb_predictions = xgb_boost_classifier.predict(X_test)
random_forest_score = accuracy_score(y_test, random_forest_predictions)
xgb_score = accuracy_score(y_test, xgb_predictions)

print("Random Forest Classifier Accuracy: ", random_forest_score * 100 , "%")
print("XGBoost Classifier Accuracy: ", xgb_score * 100 , "%")

Random Forest Classifier Accuracy:  95.25532969757064 %
XGBoost Classifier Accuracy:  95.5825483391175 %


In [24]:
# Both models are giving pretty good accuracies but we will choose xgboost as the final model.
# Now to make prediction of a single example we will write a function.

def predict_custom(inp):
    inp = pd.Series(inp)
    inp = preprocess(inp)
    inp = [inp[0],]
    inp = vectoriser.transform(inp)
    prediction = xgb_boost_classifier.predict(inp)
    prediction = prediction[0]

    interpretations = {
        0 : "Books",
        1 : "Clothing and Accessories",
        2 : "Electronics",
        3 : "Household"
    }

    for i in interpretations.keys():
        if i == prediction:
            return interpretations[i]

In [25]:
# lets try some random examples from the data

# book
inp1 = "A Dance with Dragons: A Song of Ice and Fire: Book Five Review “Filled with vividly rendered set pieces, unexpected turnings, assorted cliffhangers and moments of appalling cruelty, A Dance with Dragons is epic fantasy as it should be written: passionate, compelling, convincingly detailed and thoroughly imagined.”—The Washington Post   “Long live George Martin . . . a literary dervish, enthralled by complicated characters and vivid language, and bursting with the wild vision of the very best tale tellers.”—The New York Times   “One of the best series in the history of fantasy.”—Los Angeles Times  About the Author George R.R. Martin sold his first story in 1971 and has been writing professionally since then. He spent ten years in Hollywood as a writer-producer, working on The Twilight Zone, Beauty and the Beast, and various feature films and television pilots that were never made. In the mid '90s he returned to prose, his first love, and began work on his epic fantasy series, A Song of Ice and Fire. He has been in the Seven Kingdoms ever since. Whenever he's allowed to leave, he returns to Santa Fe, New Mexico, where he lives with the lovely Parris, a big white dog called Mischa, and two cats named Augustus and Caligula, who think they run the place. 				    	 					              See all Product description"
print("prediction inp1: ", predict_custom(inp1))

# Household
inp2 = "KEZRO Men's Wallet (Brown)"
print("prediction inp1: ", predict_custom(inp2))

prediction inp1:  Books
prediction inp1:  Household


In [27]:
# Clothing and accessories
inp3= "Mushkiya ABD-026 C-Cut Double Layer Abaya burkha for women girl About Mushkiya Mushkiya is an international islamic clothing brand that represents style and modest clothing since 2006. Founded in Dubai , Mushkiya works on a basis of creativity and responsibility, fused with a modesty of islamic clothing.Fashion with modesty are at the heart of the brand’s philosophy. Product Description Simple yet trendy Abaya Dress in double layer and frills on shoulders would become your favourite in no time. This whimsical 'C-Cut' Abaya Dress Is made of Nida-Matt fabric which is soft, breathable and has a great fall. Pinping on the front layer, enhances the look of the Abaya. This Abaya can also be used as a Dress or a Burqa if you prefer colorful ones."
print("prediction inp1: ", predict_custom(inp3))

# Electronics
inp4 = "Sony DVD Printable Blanks with White Surface -Jar Pack of 100 This product is a must have for many around the world. As it gives it's users the experience they truly desire for."
print("prediction inp1: ", predict_custom(inp4))

prediction inp1:  Clothing and Accessories
prediction inp1:  Electronics


In [26]:
# it looks like it is working fine.
# We can do even more with this model. Using pickle module we can dump the model and we can make a api to use in webapp.
# we can also dump the the trained vectoriser so that we don't have to wait again.