### Import Libraries

In [3]:
import numpy as np 
import pandas as pd
import re
import string
string.punctuation;
from nltk.stem.porter import PorterStemmer
from nltk import ngrams
from nltk.util import ngrams
import nltk

### Load and explore the data (4 marks)

In [4]:
data = pd.read_csv("product-cat-dataset.csv")
data.head()

Unnamed: 0,Description,Level_1,Level_2,Level_3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,D06E
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,98CF


In [5]:
data.columns

Index(['Description', 'Level_1', 'Level_2', 'Level_3'], dtype='object')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10649 entries, 0 to 10648
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Description  10637 non-null  object
 1   Level_1      10649 non-null  object
 2   Level_2      10649 non-null  object
 3   Level_3      10649 non-null  object
dtypes: object(4)
memory usage: 332.9+ KB


### Deal with Missing Data (4 marks)

In [7]:
# Check if data has missing values in the Description column
data.isnull().sum()

Description    12
Level_1         0
Level_2         0
Level_3         0
dtype: int64

In [8]:
# Deal with missing values
data = data.dropna()

### Drop Classes where the number of instances is < 10 (4 marks)

In [9]:
# Apply to Level_1 
pd.value_counts(data['Level_1'],ascending=True)
#data['Level_1'].value_counts().loc[lambda x : x<10]

D410C91A    504
90A8B052    506
014303D1    511
4513C920    558
4C3D8686    574
3E1E0D78    579
96F95EEC    587
69286F45    797
09BF5150    799
EFEF723B    800
2CEC27F1    859
57164AC1    877
AAC8EE56    890
35E04739    896
B092BA29    900
Name: Level_1, dtype: int64

In [10]:
# Apply to Level_2
pd.value_counts(data['Level_2'],ascending=True)
#data['Level_2'].value_counts().loc[lambda x : x<10]

A6301      1
C66C5      1
80D5B      6
0864A     16
08960     24
AF6B9     36
6C6B1     36
915D4     47
262E7     63
F824F     72
31FED     86
D5531     88
E69F5    109
5E038    115
E6162    117
223B2    128
36080    176
77F62    229
02FA0    264
7AED7    282
F4055    363
ADAD6    410
A04D3    411
7B638    420
C7E19    429
94728    440
390F1    441
914A1    443
74974    446
9B69F    447
CB803    448
BAE8A    449
B2DB4    449
375FE    450
5A8AB    450
9D9EE    462
C719A    482
ACD06    504
2D5A3    797
Name: Level_2, dtype: int64

In [11]:
# Apply to Level_3
pd.value_counts(data['Level_3'],ascending=True)
#data['Level_3'].value_counts().loc[lambda x : x<10]

CF52      1
E9F4      1
DE3D      1
DC8D      1
B4A7      6
96B8     16
1000     24
215F     33
A104     36
3AAD     36
A2FA     47
3DD3     53
29B3     63
7288     72
6253     88
DDD5    109
6BE5    115
2E14    117
F213    128
7C00    164
C563    176
1F75    199
5AE1    229
078B    264
6539    282
98CF    410
C5B4    411
0F8B    420
D06E    429
5912    439
6856    441
D97D    443
62E8    446
80C4    447
2ABA    448
627D    448
21DA    449
1F61    450
AA6B    450
05A0    462
A0E2    482
33D1    504
28A7    797
Name: Level_3, dtype: int64

We can see that there were no classes that had instances less than 10

### Now let's write a Function to Prepare Text (4 marks)
We will apply it to our DataFrame later on

* This function receives a text string and performs the following:
* Convert text to lower case
* Remove punctuation marks
* Apply stemming using the popular Snowball or Porter Stemmer (optional)
* Apply NGram Tokenisation
* Return the tokenised text as a list of strings

In [12]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import string

def process_text(text, n=2):
    """
    Takes in a string of text, then performs the following:
    1. Convert text to lower case and remove all punctuation
    2. Optionally apply stemming
    3. Apply Ngram Tokenisation
    4. Returns the tokenised text as a list
    """
    # write steps here
    text = text.lower() #description is already all lower case
    text ="".join([i for i in text if i not in string.punctuation])
    porter = PorterStemmer()
    tokenised = ngrams(text.split(), n)
    tokenised = [' '.join(tups) for tups in tokenised]
    return tokenised


In [13]:
# Here is an example function call
process_text("Here we're testing the process_text function, results are as follows:", n = 3)

['here were testing',
 'were testing the',
 'testing the processtext',
 'the processtext function',
 'processtext function results',
 'function results are',
 'results are as',
 'are as follows']

In [14]:
# Results should look like this:
['here were test',
 'were test the',
 'test the processtext',
 'the processtext function',
 'processtext function result',
 'function result are',
 'result are as',
 'are as follow']

['here were test',
 'were test the',
 'test the processtext',
 'the processtext function',
 'processtext function result',
 'function result are',
 'result are as',
 'are as follow']

### Now let's apply TF-IDF to extract features from plain text (10 marks)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
# Here you apply the process_text function to the Description column of the data
data['Description'] = data.Description.apply(process_text)
data['Description'] 

0        [gerb cap, cap help, help keep, keep littl, li...
1        [newborn inf, inf toddl, toddl boy, boy hoody,...
2        [tut ballet, ballet anym, anym leap, leap foxy...
3        [newborn inf, inf toddl, toddl boy, boy hoody,...
4        [easy keep, keep feel, feel warm, warm cozy, c...
                               ...                        
10644    [term 10, 10 issu, issu on, on year, year subs...
10645    [term 12, 12 issu, issu on, on year, year subs...
10646    [term 9, 9 issu, issu on, on year, year subscr...
10647    [term 26, 26 issu, issu on, on year, year subs...
10648    [term 12, 12 issu, issu on, on year, year subs...
Name: Description, Length: 10637, dtype: object

In [16]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object
# Then you pass the results to the bag of words tranformer
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(data['Description'].str.join(' '))
tfidf_matrix
# See here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html for reference

<10637x16495 sparse matrix of type '<class 'numpy.float64'>'
	with 298233 stored elements in Compressed Sparse Row format>

In [17]:
# looking at the matrix shape
tfidf_matrix.shape

(10637, 16495)

Now we can use .transform on our Bag-of-Words (bow) transformed object and transform the entire DataFrame of text file contents. Let's go ahead and check out how the bag-of-words counts for the entire corpus in a large, sparse matrix:

In [18]:
# The resulting matrix is in sparse format, we can transform it into dense
# Code prepared for you so you can see what results look like
tfidf_matrix = pd.DataFrame(tfidf_matrix.toarray())

In [19]:
# This is an example result, the matrix will contain lots of zero values, that is expected
tfidf_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16485,16486,16487,16488,16489,16490,16491,16492,16493,16494
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Now the Data is Ready for Classifier Usage

### Split Data into Train and Test sets (4 marks)

In [33]:
# Train/Test split
from sklearn.model_selection import train_test_split
X = tfidf_matrix
y = data[["Level_1", "Level_2", "Level_3"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


In [34]:
# You might need to reset index in each dataframe (depends on you how you do things)
# done for you to make it clearer
X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

In [35]:
# You might need to take classes as separate columns (depends on you how you do things)
class1 = y_train['Level_1'].astype(str)
class2 = y_train['Level_2'].astype(str)
class3 = y_train['Level_3'].astype(str)

## Model training for the three levels (8 marks)

In [36]:
# Create and save model for level 1
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, class1)

DecisionTreeClassifier(random_state=0)

In [37]:
import pickle
pickle.dump(clf, open("DecisionTreeClass1.sav", 'wb'))

In [38]:
## Create and save models for level 2
clf2 = DecisionTreeClassifier(random_state=0)
clf2.fit(X_train, class2)
pickle.dump(clf2, open("DecisionTreeClass2.sav", 'wb'))

In [39]:
## Create and save models for level 3
clf3 = DecisionTreeClassifier(random_state=0)
clf3.fit(X_train, class3)
pickle.dump(clf3, open("DecisionTreeClass3.sav", 'wb'))

## Predict the test set (8 marks)

In [40]:
# Creating an empty Dataframe with column names only (depends on you how you do things)
results = pd.DataFrame(columns=['Level1_Pred', 'Level2_Pred', 'Level3_Pred'])

## Here we reload the saved models and use them to predict the levels
# load model for level 1 (done for you)
with open('DecisionTreeClass1.sav', 'rb') as nb:
    model = pickle.load(nb)

## loop through the test data, predict level 1, then based on that predict level 2
## and based on level 2 predict level 3 (you need to load saved models accordingly)
print(model.predict(X_test))

with open('DecisionTreeClass2.sav', 'rb') as nb:
    model2 = pickle.load(nb)
print(model2.predict(X_test))

with open('DecisionTreeClass3.sav', 'rb') as nb:
    model3 = pickle.load(nb)
print(model3.predict(X_test))

['4C3D8686' '69286F45' 'EFEF723B' ... '35E04739' '014303D1' '4513C920']
['74974' '2D5A3' 'CB803' ... 'B2DB4' '36080' '375FE']
['62E8' '28A7' '627D' ... '21DA' 'C5B4' 'AA6B']


In [41]:
results["Level1_Pred"] = model.predict(X_test)
results["Level2_Pred"] = model2.predict(X_test)
results["Level3_Pred"] = model3.predict(X_test)

In [42]:
## After you add the predictions to the results dataframe
## they should look like this
results

Unnamed: 0,Level1_Pred,Level2_Pred,Level3_Pred
0,4C3D8686,74974,62E8
1,69286F45,2D5A3,28A7
2,EFEF723B,CB803,627D
3,69286F45,2D5A3,28A7
4,AAC8EE56,9B69F,80C4
...,...,...,...
2123,09BF5150,C7E19,AA6B
2124,EFEF723B,02FA0,078B
2125,35E04739,B2DB4,21DA
2126,014303D1,36080,C5B4


## Compute Accuracy on each level (4 marks)
Now you have the predictions for each level (in the test data), and you also have the actual levels, you can compute the accurcay

In [43]:
# Level 1 accuracy
from sklearn.metrics import accuracy_score
accuracy_score(results['Level1_Pred'],y_test['Level_1'])

0.8355263157894737

In [44]:
# Level 2 accuracy
accuracy_score(results['Level2_Pred'],y_test['Level_2'])

0.7443609022556391

In [45]:
# Level 3 accuracy
accuracy_score(results['Level3_Pred'],y_test['Level_3'])

0.7359022556390977

We can see that level 2 had less accuracy than level 1 and level 3 had less accuracy than level 2 explained with the less categs availble in the classes

## Well done!