In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, classification_report

import pickle
from joblib import dump, load
from google.colab import drive
import warnings
import sys

In [2]:
#basic commands: mount google drive and prevent unnecessary printing clutter
drive.mount('/content/drive', force_remount=True) #mount the drive
warnings.filterwarnings('ignore') #so that we do not see code warnings being printed

folder_path = '/content/drive/MyDrive/MDM/'
sys.path.append(folder_path)
df = pd.read_excel(folder_path+"ProductData500Items.xlsx")

Mounted at /content/drive


In [3]:
X = df['Long Description']
y = df['Category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [4]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((375,), (125,), (375,), (125,))

**Logistic Regression**

In [5]:
model_log = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
                      ('tfidf', TfidfTransformer()),
                      ('model',LogisticRegression()),
                     ])

model_log.fit(X_train, y_train)

ytest = np.array(y_test)
pred = model_log.predict(X_test)

In [6]:
print('accuracy %s' % accuracy_score(pred, y_test))
print(classification_report(ytest, pred))

accuracy 1.0
                                         precision    recall  f1-score   support

 Apparel//Children//Fancy dress costume       1.00      1.00      1.00        18
            Apparel//Children//Uniforms       1.00      1.00      1.00         7
              Apparel//Men//Casual Wear       1.00      1.00      1.00        14
              Apparel//Men//Ethnic Wear       1.00      1.00      1.00        18
                  Apparel//Men//Trouser       1.00      1.00      1.00        12
Apparel//Men// Track Suit /Jogging Suit       1.00      1.00      1.00        16
            Apparel//Women//Formal wear       1.00      1.00      1.00        10
      Apparel//Women//Maternity clothes       1.00      1.00      1.00        11
                   Apparel//Women//Tops       1.00      1.00      1.00        11
           Apparel//Women// Sports wear       1.00      1.00      1.00         8

                               accuracy                           1.00       125
             

In [7]:
X_test[:10]

361    Tiffany & Co- Men's Regular Fit Causal Trouser...
73     Polyester & Cotton Machine Wash- Men_Navy Blue...
374    Anta- Men's Regular Fit Causal Trouser -  Thes...
155    Violet- Striped Fit and Flare Pleated MAMA Bef...
104    Green- Green Tshirt for women Brand logo print...
394    Loewe- Men's Regular Fit Causal Trouser -  The...
377    Moncler- Men's Regular Fit Causal Trouser -  T...
124    Acerbis- WOMEN SOLID SLIM-FIT TRAINING JOGGERS...
68     Polyester & Cotton Machine Wash- Men_Magenta S...
450    Acerbis sportswear- This tracksuit consists of...
Name: Long Description, dtype: object

In [8]:
X_test.head(10)

361    Tiffany & Co- Men's Regular Fit Causal Trouser...
73     Polyester & Cotton Machine Wash- Men_Navy Blue...
374    Anta- Men's Regular Fit Causal Trouser -  Thes...
155    Violet- Striped Fit and Flare Pleated MAMA Bef...
104    Green- Green Tshirt for women Brand logo print...
394    Loewe- Men's Regular Fit Causal Trouser -  The...
377    Moncler- Men's Regular Fit Causal Trouser -  T...
124    Acerbis- WOMEN SOLID SLIM-FIT TRAINING JOGGERS...
68     Polyester & Cotton Machine Wash- Men_Magenta S...
450    Acerbis sportswear- This tracksuit consists of...
Name: Long Description, dtype: object

In [9]:
y_test[:10]

361                      Apparel//Men//Trouser
73                   Apparel//Men//Ethnic Wear
374                      Apparel//Men//Trouser
155          Apparel//Women//Maternity clothes
104               Apparel//Women// Sports wear
394                      Apparel//Men//Trouser
377                      Apparel//Men//Trouser
124               Apparel//Women// Sports wear
68                   Apparel//Men//Ethnic Wear
450    Apparel//Men// Track Suit /Jogging Suit
Name: Category, dtype: object

In [10]:
prediction = model_log.predict(X_test[:10])

In [11]:
print(prediction)

['Apparel//Men//Trouser' 'Apparel//Men//Ethnic Wear'
 'Apparel//Men//Trouser' 'Apparel//Women//Maternity clothes'
 'Apparel//Women//\xa0Sports wear' 'Apparel//Men//Trouser'
 'Apparel//Men//Trouser' 'Apparel//Women//\xa0Sports wear'
 'Apparel//Men//Ethnic Wear' 'Apparel//Men//\xa0Track Suit /Jogging Suit']


In [12]:
df_prediction = pd.read_excel(folder_path+"ProductData100ItemsBlankCategory.xlsx")

In [13]:
df_prediction.head()

Unnamed: 0,Product ID,Product Name,Category,Actual Category,Long Description
0,850664,Louis Vuitton New arrivals casual wears,,Apparel//Men//Casual Wear,Louis Vuitton_Blue and red checked opaque Casu...
1,726313,Chanel New arrivals casual wears,,Apparel//Men//Casual Wear,Chanel_Blue and red checked opaque Casual shir...
2,585034,Gucci New arrivals casual wears,,Apparel//Men//Casual Wear,Gucci_Blue and red checked opaque Casual shirt...
3,585727,Adidas New arrivals casual wears,,Apparel//Men//Casual Wear,Adidas_Blue and red checked opaque Casual shir...
4,833575,Hermès New arrivals casual wears,,Apparel//Men//Casual Wear,Hermès_Blue and red checked opaque Casual shir...


In [14]:
X_prediction = df_prediction['Long Description']
y_prediction = df_prediction['Category']

X_train_prediction, X_test_prediction, y_train_prediction, y_test_prediction = train_test_split(X_prediction, y_prediction, test_size=0.01, random_state=42)

In [15]:
X_train_prediction.describe()

count                                                    99
unique                                                   99
top        These casual  are made from offer superior co...
freq                                                      1
Name: Long Description, dtype: object

In [16]:
X_train_prediction.head(5)

53     These casual  are made from offer superior co...
70    Clergerie- Holiday MEN'S PANTS AND CHINOS ON S...
45    Bosideng_Blue and red checked opaque Casual sh...
44    Loewe_Blue and red checked opaque Casual shirt...
39    Primark / Penney's_Blue and red checked opaque...
Name: Long Description, dtype: object

In [17]:
prediction = model_log.predict(X_train_prediction[:5])
print(prediction)

['Apparel//Men//Trouser' 'Apparel//Women//\xa0Sports wear'
 'Apparel//Men//Casual Wear' 'Apparel//Men//Casual Wear'
 'Apparel//Men//Casual Wear']


**RandomForest**

In [18]:
rf = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
               ('tfidf', TfidfTransformer()),
               ('rf', RandomForestClassifier(n_estimators=50)),
               ])

rf.fit(X_train, y_train)

ytest = np.array(y_test)
preds = rf.predict(X_test)

In [19]:
print('accuracy %s' % accuracy_score(preds, y_test))
print(classification_report(ytest, preds))

accuracy 1.0
                                         precision    recall  f1-score   support

 Apparel//Children//Fancy dress costume       1.00      1.00      1.00        18
            Apparel//Children//Uniforms       1.00      1.00      1.00         7
              Apparel//Men//Casual Wear       1.00      1.00      1.00        14
              Apparel//Men//Ethnic Wear       1.00      1.00      1.00        18
                  Apparel//Men//Trouser       1.00      1.00      1.00        12
Apparel//Men// Track Suit /Jogging Suit       1.00      1.00      1.00        16
            Apparel//Women//Formal wear       1.00      1.00      1.00        10
      Apparel//Women//Maternity clothes       1.00      1.00      1.00        11
                   Apparel//Women//Tops       1.00      1.00      1.00        11
           Apparel//Women// Sports wear       1.00      1.00      1.00         8

                               accuracy                           1.00       125
             

In [20]:
X_train_prediction.head(5)

53     These casual  are made from offer superior co...
70    Clergerie- Holiday MEN'S PANTS AND CHINOS ON S...
45    Bosideng_Blue and red checked opaque Casual sh...
44    Loewe_Blue and red checked opaque Casual shirt...
39    Primark / Penney's_Blue and red checked opaque...
Name: Long Description, dtype: object

In [21]:
prediction = rf.predict(X_train_prediction[:5])
print(prediction)

['Apparel//Men//Trouser' 'Apparel//Men//Ethnic Wear'
 'Apparel//Men//Casual Wear' 'Apparel//Men//Casual Wear'
 'Apparel//Men//Casual Wear']
