# Lazada Product Classification

### Audhi Aprilliant

## 1 Import Modules

In [74]:
import pandas as pd                   # Dataframe manipulation
import numpy as np                    # Mathematics operation
from sklearn.feature_extraction.text import CountVectorizer   # Bag of word
from sklearn.model_selection import StratifiedKFold,learning_curve,cross_val_score
from sklearn.model_selection import train_test_split          # Splitting data
from sklearn.naive_bayes import MultinomialNB                 # Modelling with Multinomial Naive Bayes
# Evaluation metrics
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.metrics import precision_score,recall_score,f1_score,classification_report

## 2 Load the Data

In [75]:
# Read the preprocessed Lazada product file into a DataFrame
lazada_csv_path = 'Datasets/interim/1 Lazada-product after Preprocessing.csv'
data_lazada_clean = pd.read_csv(lazada_csv_path)

In [119]:
# Report the frame's size, then preview its first rows as the cell output
n_rows, n_cols = data_lazada_clean.shape
print(f'Dimension of Lazada data:\n{n_rows} rows and {n_cols} columns')
data_lazada_clean.head()

Dimension of Lazada data:
6735 rows and 2 columns


Unnamed: 0,title,category
0,lazada exclusive infinix smart gb dual camera ...,handphone
1,lazada special edition infinix hot gb triple c...,handphone
2,realme c hp murah mah battery octa core hio p ...,handphone
3,vivo y hp gb gb gb all screen inch mp hio p tr...,handphone
4,realme i hp gb gb gb gb qualcomm snapdragon ai...,handphone


In [77]:
# Number of products in each category
category_counts = data_lazada_clean['category'].value_counts()
print(category_counts)

makeup       2713
pakaian      2380
handphone    1642
Name: category, dtype: int64


In [78]:
def _term_frequency(titles):
    """Count occurrences of each space-separated token across all given titles."""
    tokens = ' '.join(titles).split(' ')
    return pd.Series(tokens).value_counts()


# One sub-frame per product category
data_makeup = data_lazada_clean[data_lazada_clean['category'] == 'makeup']
data_pakaian = data_lazada_clean[data_lazada_clean['category'] == 'pakaian']
data_handphone = data_lazada_clean[data_lazada_clean['category'] == 'handphone']

# Term frequencies within each category
term_freq_makeup = _term_frequency(data_makeup['title'])
term_freq_pakaian = _term_frequency(data_pakaian['title'])
term_freq_handphone = _term_frequency(data_handphone['title'])

# Show the five most frequent terms of each category
for banner, term_freq in (('--- Category MakeUp ---\n', term_freq_makeup),
                          ('--- Category Pakaian ---\n', term_freq_pakaian),
                          ('--- Category Handphone ---\n', term_freq_handphone)):
    print(banner, term_freq.head(5), sep='')

--- Category MakeUp ---
mascara       1429
waterproof     603
maskara        464
matte          321
make           313
dtype: int64
--- Category Pakaian ---
tunik     2973
wanita    1422
baju       756
atas       652
murah      543
dtype: int64
--- Category Handphone ---
gb         1790
garansi     758
resmi       723
ram         504
hp          335
dtype: int64


## 3 Splitting the Data into Training, Testing, and Validation Data

In [79]:
# Hold out 20% of the rows as validation data; the fixed seed keeps the split repeatable
split_frames = train_test_split(data_lazada_clean, test_size=0.2, random_state=123)
train, validation = split_frames
# The "test" frame is the validation rows restricted to the title column (no labels)
test = validation[['title']]

In [80]:
# Print the row and column counts of each split
for prefix, frame in (('Dims of training data   : ', train),
                      ('Dims of testing data    : ', test),
                      ('Dims of validation data : ', validation)):
    n_rows, n_cols = frame.shape
    print(f'{prefix}{n_rows} rows and {n_cols} variables')

Dims of training data   : 5388 rows and 2 variables
Dims of testing data    : 1347 rows and 1 variables
Dims of validation data : 1347 rows and 2 variables


## 4 Creating Bag of Words

In [109]:
# Features are the product titles
X_train = train.loc[:, 'title']
# Labels get a fresh 0..n-1 index so index labels coincide with row positions
Y_train = train.loc[:, 'category'].reset_index(drop=True)

In [94]:
# Bag of word
print('Creating the bag of words...')
vectorizer = CountVectorizer(analyzer = 'word',
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = None, 
                             max_features = 5000) 
%time train_data_features = vectorizer.fit_transform(X_train)
train_data_features = train_data_features.toarray()

Creating the bag of words...
CPU times: user 224 ms, sys: 14.5 ms, total: 239 ms
Wall time: 782 ms


In [103]:
# Preview the document-term matrix with one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
pd.DataFrame(train_data_features, columns=feature_names).head()

Unnamed: 0,aa,aajsbolc,ab,abad,abang,abel,abelia,abg,abi,abis,...,zone,zoom,zouhmhgt,zoya,zpro,zs,zskl,zte,zyjdjclu,zyrex
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 5 Data Modelling using 10-fold CV

In [110]:
# Stratified 10-fold cross-validation of Multinomial Naive Bayes on the
# bag-of-words training features; prints per-fold and average accuracy.
skf = StratifiedKFold(n_splits=10)
accuracy = []
model = MultinomialNB()
for i, (train_index, test_index) in enumerate(skf.split(train_data_features, Y_train), start=1):
    X_train_split, X_test_split = train_data_features[train_index], train_data_features[test_index]
    # split() yields positional row indices, so select the labels by position
    # (.iloc) instead of relying on Y_train's index labels matching positions.
    y_train_split, y_test_split = Y_train.iloc[train_index], Y_train.iloc[test_index]
    # fit() is called on the same estimator object for every fold
    model.fit(X_train_split, y_train_split.values)
    y_pred = model.predict(X_test_split)
    val = accuracy_score(y_test_split, y_pred)
    accuracy.append(val)
    print(f'Accuracy in CV - {i}: {val}')
print('Average of accuracy: {}'.format(sum(accuracy)/len(accuracy)))

Accuracy in CV - 1: 0.9944341372912802
Accuracy in CV - 2: 0.9962894248608535
Accuracy in CV - 3: 0.9962894248608535
Accuracy in CV - 4: 0.9925788497217068
Accuracy in CV - 5: 1.0
Accuracy in CV - 6: 0.9981447124304267
Accuracy in CV - 7: 0.9925788497217068
Accuracy in CV - 8: 1.0
Accuracy in CV - 9: 0.9944237918215614
Accuracy in CV - 10: 0.9944237918215614
Average of accuracy: 0.995916298252995


In [113]:
# Data modelling with Naive Bayes
%time nb = model.fit(train_data_features,Y_train)

CPU times: user 1.19 s, sys: 0 ns, total: 1.19 s
Wall time: 1.19 s


## 6 Model Evaluation with Validation Data

In [111]:
# Validation titles and their labels (labels re-indexed from 0)
X_val = validation.loc[:, 'title']
Y_val = validation.loc[:, 'category'].reset_index(drop=True)
# Encode with the already-fitted vectorizer (transform only, no refit), then densify
val_data_features = vectorizer.transform(X_val).toarray()

In [114]:
# Predict a category for every validation title
val_pred = nb.predict(X=val_data_features)

In [118]:
# Evaluate the validation predictions: confusion matrix, per-class report,
# overall accuracy and weighted F1.
# The class order is passed explicitly so the confusion-matrix rows/columns are
# guaranteed to match the hand-written names below, rather than depending on
# confusion_matrix's implicit sorted-label order.
class_labels = ['handphone', 'makeup', 'pakaian']
acc = accuracy_score(Y_val, val_pred)
f1score = f1_score(Y_val, val_pred, average='weighted')
cm = pd.DataFrame(confusion_matrix(Y_val, val_pred, labels=class_labels),
                  columns = ['Predicted Handphone','Predicted MakeUp','Predicted Pakaian'],
                  index = ['Actual Handphone','Actual MakeUp','Actual Pakaian'])
print(cm)
print('\n',classification_report(Y_val,val_pred,digits=3))
print('Accuracy Score : ' + str(acc))
print('F1 Score       : ' + str(f1score))

                  Predicted Handphone  Predicted MakeUp  Predicted Pakaian
Actual Handphone                  341                 1                  0
Actual MakeUp                       1               535                  0
Actual Pakaian                      0                 0                469

               precision    recall  f1-score   support

   handphone      0.997     0.997     0.997       342
      makeup      0.998     0.998     0.998       536
     pakaian      1.000     1.000     1.000       469

    accuracy                          0.999      1347
   macro avg      0.998     0.998     0.998      1347
weighted avg      0.999     0.999     0.999      1347

Accuracy Score : 0.9985152190051967
F1 Score       : 0.9985152190051967
