### Backlog

\-------------

- Limpar o notebook e descrever o passo a passo;

---

## Data preparation

In [1]:
import os
import re
import time
import nltk
import spacy
import pickle
import string
import unidecode

import numpy as np
import pandas as pd

from dotenv import load_dotenv

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem import RSLPStemmer
from nltk.tokenize import word_tokenize

In [2]:
_ = load_dotenv()
_ = nltk.download(['rslp', 'punkt'])

[nltk_data] Downloading package rslp to /home/jovyan/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
!python -m spacy download pt_core_news_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting pt-core-news-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.2.0/pt_core_news_sm-3.2.0-py3-none-any.whl (22.2 MB)
[K     |████████████████████████████████| 22.2 MB 21.7 MB/s eta 0:00:01
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')


In [4]:
pt_br_stop_words = []
with open('./stop_words.txt') as f:
    pt_br_stop_words = (
        [word if len(word.split(' ')) == 1 else word.split(' ')[1] for word in f.read().split(',')]
    )

    f.close()

In [5]:
nlp = spacy.load('pt_core_news_sm')

In [6]:
DATASET_PATH = os.getenv("DATASET_PATH")
METRICS_PATH = os.getenv("METRICS_PATH")
MODEL_PATH = os.getenv("MODEL_PATH")

In [7]:
print('Dataset Path: ' + DATASET_PATH + '\n' + 'Metrics Path: ' + METRICS_PATH + '\n' + 'Model Path: ' + MODEL_PATH)

Dataset Path: /usr/src/data/sample_products.csv
Metrics Path: /usr/src/data/metrics.txt
Model Path: /usr/src/data/model.pkl


In [8]:
df1 = pd.read_csv(DATASET_PATH)

In [9]:
df1.head()

Unnamed: 0,product_id,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,11394449,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,Decoração
1,15534262,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,2018-04-04 20:55:07,77.67,8.0,1,5,124,,Papel e Cia
2,16153119,9835835,expositor de esmaltes,1,38,Organizador expositor p/ 70 esmaltes,expositor,2018-10-13 20:57:07,73.920006,2709.0,1,1,59,,Outros
3,15877252,8071206,medidas lencol para berco americano,1,6,Jogo de Lençol Berço Estampado,t jogo lencol menino lencol berco,2017-02-27 13:26:03,118.770004,0.0,1,1,180,1.0,Bebê
4,15917108,7200773,adesivo box banheiro,3,38,ADESIVO BOX DE BANHEIRO,adesivo box banheiro,2017-05-09 13:18:38,191.81,507.0,1,6,34,,Decoração


In [10]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38000 entries, 0 to 37999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   product_id         38000 non-null  int64  
 1   seller_id          38000 non-null  int64  
 2   query              38000 non-null  object 
 3   search_page        38000 non-null  int64  
 4   position           38000 non-null  int64  
 5   title              38000 non-null  object 
 6   concatenated_tags  37998 non-null  object 
 7   creation_date      38000 non-null  object 
 8   price              38000 non-null  float64
 9   weight             37942 non-null  float64
 10  express_delivery   38000 non-null  int64  
 11  minimum_quantity   38000 non-null  int64  
 12  view_counts        38000 non-null  int64  
 13  order_counts       17895 non-null  float64
 14  category           38000 non-null  object 
dtypes: float64(3), int64(7), object(5)
memory usage: 4.3+ MB


In [11]:
df1['category'].unique()

array(['Decoração', 'Papel e Cia', 'Outros', 'Bebê', 'Lembrancinhas',
       'Bijuterias e Jóias'], dtype=object)

In [12]:
df2 = df1.copy()

In [13]:
df2['seq'] = df2['query'] + ' ' + df2['title'] + ' ' + df2['concatenated_tags']
seq_column = df2.pop('seq')
df2.insert(0, 'seq', seq_column)

In [14]:
df2 = df2.drop(labels={'query', 'title', 'concatenated_tags'}, axis='columns')

In [15]:
df2.head()

Unnamed: 0,seq,product_id,seller_id,search_page,position,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,espirito santo Mandala Espírito Santo mandala mdf,11394449,8324141,2,6,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,Decoração
1,cartao de visita Cartão de Visita cartao visit...,15534262,6939286,2,0,2018-04-04 20:55:07,77.67,8.0,1,5,124,,Papel e Cia
2,expositor de esmaltes Organizador expositor p/...,16153119,9835835,1,38,2018-10-13 20:57:07,73.920006,2709.0,1,1,59,,Outros
3,medidas lencol para berco americano Jogo de Le...,15877252,8071206,1,6,2017-02-27 13:26:03,118.770004,0.0,1,1,180,1.0,Bebê
4,adesivo box banheiro ADESIVO BOX DE BANHEIRO a...,15917108,7200773,3,38,2017-05-09 13:18:38,191.81,507.0,1,6,34,,Decoração


In [16]:
df2.dropna(subset=['seq'], inplace=True)
df2.reset_index(inplace=True, drop=True)

In [17]:
def processText(df_column, stop_words=pt_br_stop_words, lemma_dict=nlp):
    # Removing stop words
    df_column = df_column.apply(
        lambda seq: ' '.join([word for word in seq.split(' ') if word not in pt_br_stop_words])
    )

    # Disable case sensitivity
    df_column = df_column.apply(
        lambda seq: ' '.join([word.lower() for word in seq.split(' ')])
    )

    # Remove numbers
    df_column = df_column.apply(
        lambda seq: ' '.join([re.sub(r'\d+', '', word) for word in seq.split(' ')])
    )

    # Remove punctuation marks
    df_column = df_column.apply(
        lambda seq: ' '.join([
            word.translate(
                str.maketrans('','', string.punctuation)) for word in seq.split(' ')
        ])
    )

    # Remove accent marks
    df_column = df_column.apply(
        lambda seq: ' '.join([unidecode.unidecode(word) for word in seq.split(' ')])
    )

    # Remove duplicates
    df_column = df_column.apply(
        lambda seq: ' '.join(list(set(seq.split(' '))))
    )

    # Lemmatization
    df_column = df_column.apply(
        lambda seq: ' '.join([
            word.lemma_ if word.pos_ == 'VERB' else str(word) for word in lemma_dict(seq) 
        ])
    )

    # Remove single char words
    df_column = df_column.apply(
        lambda seq: ' '.join([
            word for word in seq.split(' ') if len(word) > 1
        ])
    )

    # Stemming
    # stemmer = RSLPStemmer()
    # df_column = df_column.apply(
    #     lambda seq: ' '.join([stemmer.stem(token) for token in word_tokenize(seq)])
    # )

    return df_column

In [18]:
df2['seq'] = processText(df2['seq'])
df2.rename(columns={'seq': 'seq_process'}, inplace=True)

In [19]:
df2.head()

Unnamed: 0,seq_process,product_id,seller_id,search_page,position,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,santo mdf espirito mandala,11394449,8324141,2,6,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,Decoração
1,long drink panfletos tag copos cartao adesivos...,15534262,6939286,2,0,2018-04-04 20:55:07,77.67,8.0,1,5,124,,Papel e Cia
2,organizador expositor esmaltes,16153119,9835835,1,38,2018-10-13 20:57:07,73.920006,2709.0,1,1,59,,Outros
3,jogo medidas estampar berco menino americano l...,15877252,8071206,1,6,2017-02-27 13:26:03,118.770004,0.0,1,1,180,1.0,Bebê
4,box banheiro adesivo de,15917108,7200773,3,38,2017-05-09 13:18:38,191.81,507.0,1,6,34,,Decoração


In [20]:
df3 = df2.copy()

le = LabelEncoder()
df3['category'] = le.fit_transform(df3['category'])

In [21]:
df3.head()

Unnamed: 0,seq_process,product_id,seller_id,search_page,position,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,santo mdf espirito mandala,11394449,8324141,2,6,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,2
1,long drink panfletos tag copos cartao adesivos...,15534262,6939286,2,0,2018-04-04 20:55:07,77.67,8.0,1,5,124,,5
2,organizador expositor esmaltes,16153119,9835835,1,38,2018-10-13 20:57:07,73.920006,2709.0,1,1,59,,4
3,jogo medidas estampar berco menino americano l...,15877252,8071206,1,6,2017-02-27 13:26:03,118.770004,0.0,1,1,180,1.0,0
4,box banheiro adesivo de,15917108,7200773,3,38,2017-05-09 13:18:38,191.81,507.0,1,6,34,,2


In [22]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37998 entries, 0 to 37997
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   seq_process       37998 non-null  object 
 1   product_id        37998 non-null  int64  
 2   seller_id         37998 non-null  int64  
 3   search_page       37998 non-null  int64  
 4   position          37998 non-null  int64  
 5   creation_date     37998 non-null  object 
 6   price             37998 non-null  float64
 7   weight            37940 non-null  float64
 8   express_delivery  37998 non-null  int64  
 9   minimum_quantity  37998 non-null  int64  
 10  view_counts       37998 non-null  int64  
 11  order_counts      17893 non-null  float64
 12  category          37998 non-null  int64  
dtypes: float64(3), int64(8), object(2)
memory usage: 3.8+ MB


In [23]:
# Absolute class count
val_count = df3['category'].value_counts()

val_count.sort_index()

0     6930
1      940
2     8722
3    17524
4     1132
5     2750
Name: category, dtype: int64

In [24]:
# Percentage class count
sum_val_count = val_count.values.sum()

(100*val_count / sum_val_count).sort_index()

0    18.237802
1     2.473814
2    22.953840
3    46.118217
4     2.979104
5     7.237223
Name: category, dtype: float64

In [25]:
sorted_cats = df3['category'].unique()
sorted_cats.sort()
le.inverse_transform(sorted_cats)

array(['Bebê', 'Bijuterias e Jóias', 'Decoração', 'Lembrancinhas',
       'Outros', 'Papel e Cia'], dtype=object)

---

## Train, test, and validation

In [26]:
(X_train, X_test, 
 y_train, y_test) = train_test_split(df3['seq_process'], df3['category'], 
                                     test_size=.35, random_state=42)

In [27]:
print((X_train.shape, y_train.shape), (X_test.shape, y_test.shape))

((24698,), (24698,)) ((13300,), (13300,))


In [28]:
# Tf-idf transform
vectorizer = TfidfVectorizer()

tfidf_X_train = vectorizer.fit_transform(X_train)
tfidf_X_test = vectorizer.transform(X_test)

In [29]:
# Grid Search params
# lr = [float(10) ** n for n in range(-1, 1, 1)]
# estimators = [100 + 50*n for n in range(0, 3)]
# num_depth = [depth for depth in range(1, 4)]

# grid_params_GB = dict(learning_rate=lr, n_estimators=estimators, max_depth=num_depth)

In [30]:
# grid_params_GB

In [31]:
# start_time = time.time()

# clf = GradientBoostingClassifier()
# clf = GridSearchCV(clf, grid_params_XGB, cv=5, scoring='accuracy')
# _ = clf.fit(tfidf_X_train, y_train)

# clf = clf.best_estimator_

# end_time = time.time()

In [32]:
start_time = time.time()

clf = GradientBoostingClassifier(
        n_estimators=1000, learning_rate=0.1,                             
        max_depth=3, random_state=0
    ).fit(tfidf_X_train, y_train)

end_time = time.time()

print(r'Tempo de execução (s): {end_time:.2f}'.format(end_time=end_time-start_time))

Tempo de execução (s): 319.76


In [33]:
print(r'Accuracy (Training data): {perc:.2f}%'.format(perc=100*clf.score(tfidf_X_train, y_train)))

Accuracy (Training data): 97.21%


In [34]:
print(r'Accuracy (Test data): {perc:.2f}%'.format(perc=100*clf.score(tfidf_X_test, y_test)))

Accuracy (Test data): 88.39%


In [35]:
print(classification_report(y_test, clf.predict(tfidf_X_test)))

              precision    recall  f1-score   support

           0       0.90      0.83      0.87      2426
           1       0.91      0.92      0.91       331
           2       0.89      0.90      0.89      3032
           3       0.88      0.94      0.91      6164
           4       0.85      0.63      0.73       385
           5       0.82      0.69      0.75       962

    accuracy                           0.88     13300
   macro avg       0.88      0.82      0.84     13300
weighted avg       0.88      0.88      0.88     13300



In [36]:
confusion_matrix(y_test, clf.predict(tfidf_X_test))

array([[2014,    1,  126,  260,    5,   20],
       [   0,  306,    3,   20,    1,    1],
       [  67,   11, 2715,  193,   17,   29],
       [ 123,    8,  122, 5813,   13,   85],
       [   7,    4,   34,   84,  243,   13],
       [  16,    8,   51,  216,    6,  665]])

In [37]:
# Recording classification metrics (precision, recall, f1 score, accuracy)
with open(METRICS_PATH, 'w+') as f:
    f.write(classification_report(y_test, clf.predict(tfidf_X_test)))

    f.close()

In [38]:
with open(MODEL_PATH, 'wb') as f:
    pickle.dump(clf, f)

In [40]:
with open(MODEL_PATH, 'rb') as f:
    clf = pickle.load(f)

---