# Product classifier based on its characteristics

Importing the necessary packages and dataset

In [1]:
import os
import pandas as pd
import numpy as np
import math
import time
from scipy import stats
import pickle

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
# All file locations are injected through environment variables; a missing
# variable raises KeyError here rather than failing later in the notebook.
DATASET_PATH, METRICS_PATH, MODEL_PATH = (
    os.environ['DATASET_PATH'],
    os.environ['METRICS_PATH'],
    os.environ['MODEL_PATH'],
)

# Load the raw product table and preview its first rows.
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,product_id,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,11394449,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,Decoração
1,15534262,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,2018-04-04 20:55:07,77.67,8.0,1,5,124,,Papel e Cia
2,16153119,9835835,expositor de esmaltes,1,38,Organizador expositor p/ 70 esmaltes,expositor,2018-10-13 20:57:07,73.920006,2709.0,1,1,59,,Outros
3,15877252,8071206,medidas lencol para berco americano,1,6,Jogo de Lençol Berço Estampado,t jogo lencol menino lencol berco,2017-02-27 13:26:03,118.770004,0.0,1,1,180,1.0,Bebê
4,15917108,7200773,adesivo box banheiro,3,38,ADESIVO BOX DE BANHEIRO,adesivo box banheiro,2017-05-09 13:18:38,191.81,507.0,1,6,34,,Decoração


In [3]:
# Columns that must be present for a row to be usable downstream.
required_columns = ['price', 'weight', 'minimum_quantity', 'category', 'view_counts']

print(f'Original size of dataframe: {df.shape}')
# Keep only the rows in which none of the required columns is missing.
has_all_required = df[required_columns].notna().all(axis=1)
df = df[has_all_required]
print(f'Size after dropping NaNs: {df.shape}')



Original size of dataframe: (38000, 15)
Size after dropping NaNs: (37942, 15)


The exploratory analysis showed a very large dispersion of product prices. Cases with extremely high prices may be outliers or items with incorrect input data and are thus dropped. The outlier criterion was the usual one: a price more than 3 standard deviations away from the mean.

In [4]:
# Keep only the rows whose price lies within 3 standard deviations of the mean.
# The absolute value has to wrap the z-score itself: wrapping the comparison
# (abs(z < 3)) only takes abs() of a boolean mask and silently degrades the
# test to the one-sided condition z < 3.
price_zscores = stats.zscore(df.price)
outlier_filter = abs(price_zscores) < 3  # True for rows that are kept
df = df[outlier_filter]
print(f'Dataframe size after dropping outliers: {df.shape}')


Dataframe size after dropping outliers: (37578, 15)


# Numerical features

The exploratory analysis showed that numerical features such as express delivery, search page and position would show no correlation with different categories, so only 4 variables were considered. Preliminary tests also showed that the most relevant among these four is the price.

Originally, the variables can assume values between 0 and thousands, which can be adjusted with a scaler.

In [5]:
# Numerical features fed to the model, in the column order used downstream.
numeric_columns = ['price', 'weight', 'minimum_quantity', 'view_counts']

# Translate column names to positions, then slice those columns out of the
# raw value matrix of the dataframe.
indexes = list(map(df.columns.get_loc, numeric_columns))
values = df.values.take(indexes, axis=1)

In [6]:
# Standardise each numerical feature; the fitted scaler is kept so that the
# same transformation can be reapplied later.
scaler = StandardScaler()
values_scaled = scaler.fit_transform(values)

# Text features

Both Title and the concatenated tags were considered as feature candidates. To avoid an extreme usage of memory, it was necessary to select one or the other (but not both). Among the tested classifiers, the F1 score was usually better if the tags were considered instead of the titles. 

In [7]:
# Each product's complete tag string is treated as a single categorical value,
# so the column is shaped (n_rows, 1) before one-hot encoding.
texts = np.array(df['concatenated_tags'].tolist()).reshape(-1, 1)

# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name. Try the current keyword first and fall back to the
# legacy one so the cell runs on both old and new installations.
try:
    oh_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    oh_encoder = OneHotEncoder(sparse=False)

# Dense output is required because the result is concatenated with the dense
# matrix of scaled numerical features.
texts_encoded = oh_encoder.fit_transform(texts)
    

In [8]:
# Final design matrix: scaled numerical columns first, one-hot tag columns after.
X_encoded = np.hstack((values_scaled, texts_encoded))

# Classifier categories

A label encoder was used to convert the categories from text to numerical values. More sophisticated strategies were not tested and this is a point where the model could be improved.

The vector of class labels must be reshaped due to the way SGDClassifier works.

In [9]:
# Select the target by name instead of relying on it being the last column.
y = df['category'].values

# LabelEncoder maps each category name to an integer id; the ids follow the
# order of `encoder.classes_`.
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

n_dataset = df.shape[0]
# fit_transform already returns a flat vector, so no reshape/ravel is needed
# (the original reshape call discarded its result); just verify the shape.
assert y_encoded.shape == (n_dataset,)

# Train test split
There is a fair number of data points, so the fraction held out for the test dataset does not need to be extremely small. The split is random because there is no ordering pattern in the data (it is not a time series), but the random state is fixed to obtain reproducible results.

In [10]:
# Hold out a third of the rows for testing; the seed keeps the split stable
# between runs.
split_parts = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.33,
    random_state=27,
)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of both feature matrices.
for feature_matrix in (X_train, X_test):
    print(feature_matrix.shape)

(25177, 22545)
(12401, 22545)


# Model training

Three linear classifiers were evaluated: SVM, logistic regression and one with Huber loss. Empirically, the Huber classifier performed best, probably due to the high dispersion of values in the numerical features. Training time was similar among these classifiers. A classifier based on a squared hinge loss function was also considered in a preliminary evaluation, but its training did not finish in a reasonable amount of time (> 40 min).

In [11]:
initial_time = time.time()
# SGD shuffles the training data between epochs; without a fixed seed every
# run yields a different model and different metrics. Reuse the seed of the
# train/test split so the whole notebook is reproducible.
model = SGDClassifier(loss='modified_huber', random_state=27)
model.fit(X_train, y_train)

# Elapsed wall-clock time, expressed in minutes.
training_time = (time.time()-initial_time)/60
print(f"Training time: {training_time:.2f} min")

Training time: 4.21 min


In [12]:
# Accuracy on the very data the model was fitted on (optimistic by construction).
y_train_hat = model.predict(X_train)
self_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_hat)

self_accuracy_pct = 100*self_accuracy
print(f'Accuracy in self-validation: {self_accuracy_pct:.2f} %')



Accuracy in self-validation: 96.25 %


In [13]:
# Accuracy on the held-out test split.
y_test_hat = model.predict(X_test)
cross_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_hat)

cross_accuracy_pct = 100*cross_accuracy
print(f'Accuracy in cross-validation: {cross_accuracy_pct:.2f} %')


Accuracy in cross-validation: 72.35 %


# Calculating and writing metrics

In [14]:
# Category names MUST be listed in the order of the LabelEncoder's integer ids
# (encoder.classes_[i] is the name of label i). Using the order of first
# appearance in the dataframe would pair every per-category metric below with
# the wrong category name.
categories = encoder.classes_.tolist()
label_ids = list(range(len(categories)))

# Support-weighted averages over all categories.
precision_average, recall_average, f1_average, _ = precision_recall_fscore_support(
    y_test, y_test_hat, average='weighted'
)

# Per-category metrics; position i of each array refers to label id i and
# therefore to categories[i].
precision_cat, recall_cat, f1_cat, support_cat = precision_recall_fscore_support(
    y_test, y_test_hat, average=None, labels=label_ids
)

In [15]:
# Lookup tables from category name to each per-category metric, pairing the
# i-th category with the i-th entry of the corresponding metric array.
precision_cat_writing = {name: score for name, score in zip(categories, precision_cat)}
recall_cat_writing = {name: score for name, score in zip(categories, recall_cat)}
f1_cat_writing = {name: score for name, score in zip(categories, f1_cat)}
support_cat_writing = {name: count for name, count in zip(categories, support_cat)}

In [16]:
# Write the plain-text metrics report. The context manager closes the file
# even if one of the writes raises. The encoding is explicit because category
# names contain non-ASCII characters and the platform default may not be able
# to encode them.
with open(METRICS_PATH, "w", encoding="utf-8") as file:
    # Header and support-weighted averages.
    file.write(f"Results for Huber loss with {len(y_train)} training data points.\n")
    file.write(f"Training time: {training_time:.1f} min.\n")
    file.write(f"Average precision: {100*precision_average:.2f}%.\n")
    file.write(f"Average recall: {100*recall_average:.2f}%.\n")
    file.write(f"Average F1: {100*f1_average:.2f}%.\n")

    # One section per metric, each listing "category: value" for every category.
    file.write("\nPrecision (%): \n")
    file.write(str([f"{c}: {100*precision_cat_writing[c]:.2f}" for c in categories]))
    file.write("\nRecall (%): \n")
    file.write(str([f"{c}: {100*recall_cat_writing[c]:.2f}" for c in categories]))
    file.write("\nF1 (%): \n")
    file.write(str([f"{c}: {100*f1_cat_writing[c]:.2f}" for c in categories]))
    file.write("\nNumber of elements per category: \n")
    file.write(str([f"{c}: {support_cat_writing[c]}" for c in categories]))

# Final model with all data

In [None]:
# Refit the same estimator on the complete dataset (train and test rows) so
# that the persisted model has seen every available example.
# NOTE(review): this cell carries no execution count in the saved notebook -
# confirm it was actually run before the model was written to disk.
model.fit(X=X_encoded, y=y_encoded)

# Saving the final model to disk

In [17]:
# Serialise the fitted model. The context manager makes sure the file handle
# is flushed and closed, so the pickle on disk is complete.
with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(model, model_file)