### Import Libraries

In [159]:
import pickle
import string

import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


### Load and explore the data (4 marks)

In [160]:
df = pd.read_csv('product-cat-dataset.csv')

In [161]:
#View the first lines of dataset to check the contents

df.head()

Unnamed: 0,Description,Level_1,Level_2,Level_3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,D06E
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,98CF


In [162]:
#Obtain information about the different columns in the dataset, including their dtypes and non-null counts

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10649 entries, 0 to 10648
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Description  10637 non-null  object
 1   Level_1      10649 non-null  object
 2   Level_2      10649 non-null  object
 3   Level_3      10649 non-null  object
dtypes: object(4)
memory usage: 332.9+ KB


In [163]:
df.describe()

Unnamed: 0,Description,Level_1,Level_2,Level_3
count,10637,10649,10649,10649
unique,9677,15,39,43
top,glory gorg col fing complet outfit express moo...,B092BA29,2D5A3,28A7
freq,24,900,797,797


In [164]:
df.agg(['count', 'size', 'nunique'])

Unnamed: 0,Description,Level_1,Level_2,Level_3
count,10637,10649,10649,10649
size,10649,10649,10649,10649
nunique,9677,15,39,43


### Deal with Missing Data (4 marks)

In [165]:
# Check if data has missing values in the Description column
# Summarise, per column: its dtype, the number of null values, and the
# percentage of rows that are null.
# DataFrame.append was removed in pandas 2.0, so the three one-row frames
# are stacked with a single pd.concat instead.
Column_Info = pd.concat([
    pd.DataFrame(df.dtypes).T.rename(index={0: 'column type'}),
    pd.DataFrame(df.isnull().sum()).T.rename(index={0: 'null values (nb)'}),
    pd.DataFrame(df.isnull().sum() / df.shape[0] * 100).T.rename(index={0: 'null values (%)'}),
])
display(Column_Info)

Unnamed: 0,Description,Level_1,Level_2,Level_3
column type,object,object,object,object
null values (nb),12,0,0,0
null values (%),0.112687,0.0,0.0,0.0


In [166]:
# Deal with missing values: drop every row that contains a null in any column.
# The drop is done in place, so `df` itself is modified by this cell.
df.dropna(axis = 0, inplace = True)

# Verify, column by column, that no null values remain (False = no nulls left)
df.isnull().any()


Description    False
Level_1        False
Level_2        False
Level_3        False
dtype: bool

### Drop Classes where the number of instances is < 10 (4 marks)

In [167]:
def delete_instances(data, level, min_count=10):
    """
    Drop, in place, every row whose class in `level` occurs too rarely.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the `level` column. It is modified in place.
    level : str
        Name of the class column to filter on.
    min_count : int, default 10
        Classes with fewer than this many rows are removed.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, after the rare-class rows were dropped.

    The labels of the removed classes are printed before the rows are dropped.
    """
    # groupby sorts by class label, so the printed list is in label order
    class_sizes = data.groupby(level).size()
    instances_to_delete = class_sizes[class_sizes < min_count].index.tolist()
    print(instances_to_delete)
    rare_rows = data.index[data[level].isin(instances_to_delete)]
    data.drop(rare_rows, inplace=True)
    return data


In [168]:
# Apply to Level_1
df = delete_instances(df, 'Level_1')

[]


In [169]:
# Apply to Level_2
df = delete_instances(df, 'Level_2')

['80D5B', 'A6301', 'C66C5']


In [170]:
# Apply to Level_3
df = delete_instances(df, 'Level_3')


['CF52', 'DE3D']


### Now let's write a Function to Prepare Text (4 marks)
We will apply it to our DataFrame later on

* This function receives a text string and performs the following:
* Convert text to lower case
* Remove punctuation marks
* Apply stemming using the popular Snowball or Porter Stemmer (optional)
* Apply NGram Tokenisation
* Return the tokenised text as a list of strings

In [171]:


#Download packages for nltk
#nltk.download()


def process_text(text, n = 1):
    """
    Convert a raw string into a list of stemmed word n-grams.

    Steps:
    1. Lower-case the text and strip every character in string.punctuation
    2. Split the cleaned text into word tokens
    3. Stem each token with the English Snowball stemmer
    4. Slide a window of `n` stems over the sequence and join each window
       with single spaces

    `n` is the n-gram size (1 = single stems). Returns a list of strings.
    """
    # Lower-casing is done explicitly here rather than left to the stemmer
    punctuation_free = text.lower().translate(
        str.maketrans('', '', string.punctuation)
    )

    stemmer = SnowballStemmer(language='english')
    stems = []
    for token in word_tokenize(punctuation_free):
        stems.append(stemmer.stem(token))

    # Each n-gram comes back as a tuple of stems; flatten it into one string
    return [' '.join(gram) for gram in ngrams(stems, n)]






In [172]:
# Here is an example function call
process_text("Here we're testing the process_text function, results are as follows:", n = 3)

['here were test',
 'were test the',
 'test the processtext',
 'the processtext function',
 'processtext function result',
 'function result are',
 'result are as',
 'are as follow']

In [173]:
# Results should look like this:
['here were test',
 'were test the',
 'test the processtext',
 'the processtext function',
 'processtext function result',
 'function result are',
 'result are as',
 'are as follow']

['here were test',
 'were test the',
 'test the processtext',
 'the processtext function',
 'processtext function result',
 'function result are',
 'result are as',
 'are as follow']

### Now let's apply TF-IDF to extract features from plain text (10 marks)

In [174]:
# Might take a while...
# Build the bag-of-words counts: process_text is passed to CountVectorizer
# as its tokenizer, so it is run on every entry of the Description column
# and the tokens it returns are what gets counted.
# See here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

cv = CountVectorizer(tokenizer=process_text)
X = cv.fit_transform(df['Description'].tolist())


In [175]:
X.toarray()


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Now we can use .transform on our Bag-of-Words (bow) transformed object and transform the entire DataFrame of text file contents. Let's go ahead and check out how the bag-of-words counts for the entire corpus look in a large, sparse matrix:

In [176]:
# After that you pass the result of the previous step to sklearn's TfidfTransformer
# which will convert them into a feature matrix
# See here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html



tfid = TfidfTransformer()
text_tfidf = tfid.fit_transform(X)
    


In [177]:
# The resulting matrix is in sparse format, we can transform it into dense
# Code prepared for you so you can see what results look like
text_tfidf = pd.DataFrame(text_tfidf.toarray())

In [178]:
# This is an example result, the matrix will contain lots of zero values, that is expected
# Some values will be non-zero
#text_tfidf.head()
print(text_tfidf.shape)
print(df.shape)
print(X.shape)

(10627, 16320)
(10627, 4)
(10627, 16320)


# Now the Data is Ready for Classifier Usage

### Split Data into Train and Test sets (4 marks)

In [179]:
# Train/Test split
# Attach the three label columns to a copy of the TF-IDF features so that
# features and labels are shuffled and split together.
combined = text_tfidf.assign(
    Level_1=df['Level_1'].astype(str).values,
    Level_2=df['Level_2'].astype(str).values,
    Level_3=df['Level_3'].astype(str).values,
)

# Hold out 20% of the rows; the fixed random_state makes the split repeatable
train_data, test_data = train_test_split(combined, random_state=25, test_size=0.2)


In [180]:
print(combined.shape)
print(df.shape)

(10627, 16323)
(10627, 4)


In [181]:
train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16313,16314,16315,16316,16317,16318,16319,Level_1,Level_2,Level_3
1572,0.04961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69286F45,2D5A3,28A7
1731,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69286F45,2D5A3,28A7
1712,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69286F45,2D5A3,28A7
4554,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,EFEF723B,CB803,627D
4705,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,EFEF723B,CB803,627D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8447,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B092BA29,5A8AB,AA6B
2934,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,AAC8EE56,9B69F,80C4
10383,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4C3D8686,223B2,F213
6618,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,014303D1,77F62,5AE1


In [182]:
test_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16313,16314,16315,16316,16317,16318,16319,Level_1,Level_2,Level_3
3472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,09BF5150,F824F,7288
5681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4513C920,E69F5,DDD5
6364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57164AC1,94728,5912
9073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B092BA29,375FE,1F61
4351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,EFEF723B,CB803,627D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90A8B052,C719A,A0E2
6080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35E04739,390F1,6856
8580,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B092BA29,5A8AB,AA6B
5532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4513C920,F4055,7C00


In [183]:
# The last three columns of each split are the Level_1, Level_2 and Level_3
# labels; every column before them is a TF-IDF feature.
X_train = train_data.iloc[:,:-3]
X_test = test_data.iloc[:,:-3]
y_train = train_data.iloc[:,-3:]
y_test = test_data.iloc[:,-3:]

In [184]:
# Replace the shuffled row labels left over from the split with a fresh
# 0..n-1 index, so rows can be addressed by position in the later steps.
# Reassigning (rather than inplace=True) avoids mutating frames that were
# obtained by slicing train_data / test_data.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [185]:
y_train

Unnamed: 0,Level_1,Level_2,Level_3
0,69286F45,2D5A3,28A7
1,69286F45,2D5A3,28A7
2,69286F45,2D5A3,28A7
3,EFEF723B,CB803,627D
4,EFEF723B,CB803,627D
...,...,...,...
8496,B092BA29,5A8AB,AA6B
8497,AAC8EE56,9B69F,80C4
8498,4C3D8686,223B2,F213
8499,014303D1,77F62,5AE1


In [186]:
# Pull each hierarchy level out of the label frames as its own string Series,
# first for the training rows and then for the test rows.
class1_train, class1_test = (labels['Level_1'].astype(str) for labels in (y_train, y_test))
class2_train, class2_test = (labels['Level_2'].astype(str) for labels in (y_train, y_test))
class3_train, class3_test = (labels['Level_3'].astype(str) for labels in (y_train, y_test))


## Model training for the three levels (8 marks)

In [187]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16310,16311,16312,16313,16314,16315,16316,16317,16318,16319
0,0.04961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8496,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8497,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8498,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8499,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [188]:

# Create and save model for level 1
# A single Gaussian Naive Bayes classifier is fitted on all training rows.
model = GaussianNB()
model.fit(X_train, class1_train)
# Print the classifier's score on the held-out test rows
print(model.score(X_test, class1_test))
# Persist the fitted model to 'level1.pk' so it can be reloaded for prediction
with open('level1.pk', mode='wb') as l1:
    pickle.dump(model, l1)


0.7365945437441204


In [189]:
def process_levels(classes, level, y, suffix = ''):
    """
    Train and pickle one GaussianNB model per parent class.

    For every value in `classes`, the training rows whose `level` column in
    the notebook-level `y_train` equals that value are selected. A GaussianNB
    is fitted on the matching rows of the notebook-level `X_train` against the
    matching entries of `y`, and written to '<class>_<suffix>.pk'.

    Parameters
    ----------
    classes : iterable
        Parent class labels to build a model for.
    level : str
        Column of `y_train` holding the parent class labels.
    y : pandas.Series
        Target labels, row-aligned by position with `X_train` and `y_train`.
    suffix : str, default ''
        Text appended to each file name after the class label.
    """
    for level_class in classes:
        # Select rows with a positional boolean mask. The previous code passed
        # index *labels* to .iloc, which is only correct when the index is
        # exactly 0..n-1.
        in_class = (y_train[level] == level_class).to_numpy()
        train_X = X_train.iloc[in_class]
        train_y = y.iloc[in_class]
        m = GaussianNB()
        m.fit(train_X, train_y)
        with open(f'{level_class}_{suffix}.pk', mode='wb') as lm:
            pickle.dump(m, lm)

In [190]:
## Create and save models for level 2
process_levels(class1_train.unique(), 'Level_1', class2_train, 'level2')

In [191]:
## Create and save models for level 3
process_levels(class2_train.unique(), 'Level_2', class3_train, 'level3')

## Predict the test set (8 marks)

In [192]:
# Creating an empty Dataframe with column names only (depends on you how you do things)
results = pd.DataFrame(columns=['Level1_Pred', 'Level2_Pred', 'Level3_Pred'])

## Here we reload the saved models and use them to predict the levels
level1 = []
level2 = []
level3 = []

# Models already read from disk, keyed by file name. The cache is rebuilt
# every time this cell runs, so re-trained models are always picked up.
model_cache = {}


def load_model(path):
    """Return the model pickled at `path`, reading each file from disk only once."""
    # NOTE: pickle.load can execute arbitrary code - only load files that
    # were written by this notebook.
    if path not in model_cache:
        with open(path, 'rb') as model_file:
            model_cache[path] = pickle.load(model_file)
    return model_cache[path]


# load model for level 1
model = load_model('level1.pk')

# Walk down the hierarchy for each test row: the level-1 prediction selects
# the level-2 model, and the level-2 prediction selects the level-3 model.
for row in X_test.to_numpy():
    sample = row.reshape(1,-1)
    predicted_level1 = model.predict(sample)[0]
    level1.append(predicted_level1)

    model2 = load_model(f'{predicted_level1}_level2.pk')
    predicted_level2 = model2.predict(sample)[0]
    level2.append(predicted_level2)

    model3 = load_model(f'{predicted_level2}_level3.pk')
    predicted_level3 = model3.predict(sample)[0]
    level3.append(predicted_level3)

results['Level1_Pred'] = level1
results['Level2_Pred'] = level2
results['Level3_Pred'] = level3

## loop through the test data, predict level 1, then based on that predict level 2
## and based on level 2 predict level 3 (you need to load saved models accordingly)

    

In [193]:
## After you add the predictions to the results dataframe
## they should look like this
results

Unnamed: 0,Level1_Pred,Level2_Pred,Level3_Pred
0,57164AC1,94728,5912
1,35E04739,390F1,6856
2,57164AC1,94728,5912
3,B092BA29,375FE,1F61
4,57164AC1,94728,5912
...,...,...,...
2121,B092BA29,5A8AB,AA6B
2122,35E04739,390F1,6856
2123,B092BA29,5A8AB,AA6B
2124,2CEC27F1,ADAD6,98CF


## Compute Accuracy on each level (4 marks)
Now that you have the predictions for each level (in the test data) and the actual levels, you can compute the accuracy.

In [194]:
# Level 1 accuracy
accuracy_score(class1_test, results['Level1_Pred'])

0.7365945437441204

In [195]:
# Level 2 accuracy
accuracy_score(class2_test, results['Level2_Pred'])

0.6495766698024459

In [196]:
# Level 3 accuracy
accuracy_score(class3_test, results['Level3_Pred'])

0.6392285983066792

## Well done!