In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder, StandardScaler
import numpy as np
import fasttext.util
from tqdm import tqdm
import spacy
import re
import pickle

In [9]:
final_dataframe = pd.read_csv('df_unificado.csv')
final_dataframe.head()

Unnamed: 0,order_item_id,shipping_limit_date,price,freight_value,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,...,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,payment_sequential,payment_type,payment_installments,payment_value,purchase-delivery difference,estimated-actual delivery difference
0,1,2017-09-19 09:45:35,58.9,13.29,cool_stuff,58.0,598.0,4.0,650.0,28.0,...,,"Perfeito, produto entregue antes do combinado.",2017-09-21 00:00:00,2017-09-22 10:57:03,1,credit_card,2,72.19,7,9
1,1,2017-07-05 02:44:11,55.9,17.96,cool_stuff,58.0,598.0,4.0,650.0,28.0,...,,"lannister como sempre, entregou certinho e den...",2017-07-14 00:00:00,2017-07-17 12:50:07,1,boleto,1,73.86,15,13
2,1,2018-05-23 10:56:25,64.9,18.33,cool_stuff,58.0,598.0,4.0,650.0,28.0,...,super recomendo,carrinho muito bonito,2018-06-05 00:00:00,2018-06-06 21:41:12,1,credit_card,2,83.23,17,3
3,1,2017-08-07 18:55:08,58.9,16.17,cool_stuff,58.0,598.0,4.0,650.0,28.0,...,,,2017-08-10 00:00:00,2017-08-13 03:35:17,1,credit_card,3,75.07,8,16
4,1,2017-08-16 22:05:11,58.9,13.29,cool_stuff,58.0,598.0,4.0,650.0,28.0,...,,,2017-08-25 00:00:00,2017-08-28 00:51:18,1,credit_card,4,72.19,14,8


In [10]:
#Preenchimento de valores nulos com a moda, ou seja o valor mais comum

final_dataframe['product_category_name'].fillna(value=final_dataframe['product_category_name'].mode()[0], inplace=True)
final_dataframe['product_name_lenght'].fillna(value=final_dataframe['product_name_lenght'].mode()[0], inplace=True)
final_dataframe['product_photos_qty'].fillna(value=final_dataframe['product_photos_qty'].mode()[0], inplace=True)
final_dataframe['product_weight_g'].fillna(value=final_dataframe['product_weight_g'].mode()[0], inplace=True)
final_dataframe['product_length_cm'].fillna(value=final_dataframe['product_length_cm'].mode()[0], inplace=True)
final_dataframe['product_height_cm'].fillna(value=final_dataframe['product_height_cm'].mode()[0], inplace=True)
final_dataframe['product_width_cm'].fillna(value=final_dataframe['product_width_cm'].mode()[0], inplace=True)

#Preenchimento de valores nulos com a mediana, ou seja o valor médio 
final_dataframe['product_description_lenght'].fillna(value=final_dataframe['product_description_lenght'].median(), inplace=True)

#Preenchimento de valores nulos com valor arbitrário. Não faz sentido preencher com um valor já existente já que o comentário da revisão é utilizado no algoritmo de NLP (Natural language Processing)
final_dataframe['review_comment_message'].fillna(value='indisponível', inplace=True)

In [11]:
#Transformando o problema de multiclasse num problema de classificação binária
final_dataframe['review_score'] = final_dataframe['review_score'].apply(lambda x: 1 if x > 3 else 0)

In [12]:
#Baseado na dispersão dos dados de preço, foi criada uma coluna para categorizar (discretizar, ou seja, reduzir a cardinalidade)

final_dataframe['price_category'] = final_dataframe['price'].apply(lambda x:'expensive' if x>=139 else ('affordable' if x>=40 and x<139 else 'cheap'))

In [13]:
# Retirando pedidos canceladas
final_dataframe = final_dataframe[final_dataframe['order_status'] != 'canceled']

In [14]:
# Criando coluna subjetiva da diferença de tempo de entrega pelo preço, pois o autor encontrou que contanto que seja entregue na data, o aumento de preço é ok
final_dataframe['purchase_delivery_diff_per_price'] = final_dataframe['purchase-delivery difference']/final_dataframe['price']

In [15]:
#Dropando colunas desnecessárias para o Machine Learning
final_dataframe.drop(['shipping_limit_date','order_purchase_timestamp','order_approved_at','order_delivered_carrier_date','order_delivered_customer_date','order_estimated_delivery_date','customer_id'], axis=1, inplace=True)

In [16]:
#Separando a coluna de resultado, o Y, chamado de labels.
labels = final_dataframe['review_score']
final_dataframe.drop('review_score', axis=1, inplace=True)

In [19]:
# Codificando em 0 e 1 se a revisão está disponível ou não
final_dataframe['review_availability'] = final_dataframe['review_comment_message'].apply(lambda x: 1 if x != 'indisponível' else 0)

In [22]:
#Dividindo em treino e teste, 80% / 20%, Estratificar diz que a mesma proporção entre as classes vai ser mantida
X_train, X_test, y_train, y_test = train_test_split(final_dataframe, labels, stratify=labels, test_size=0.2, random_state=0)

In [23]:
#Code copied from my submission of the course assignment

def train_response(frame):
  f1 = frame[frame.iloc[:,1] == 0]
  f2 = frame[frame.iloc[:,1] == 1]
  global dict_frame, dict_f1, dict_f2
  dict_frame = dict(frame.iloc[:,0].value_counts())
  dict_f1 = dict(f1.iloc[:,0].value_counts())
  dict_f2 = dict(f2.iloc[:,0].value_counts())
  state_0, state_1 = [],[],
  for i in range(len(frame)):
    if frame.iloc[:,1][i] == 0:
      state_0.append(dict_f1.get(frame.iloc[:,0][i],0)/dict_frame[frame.iloc[:,0][i]])
      state_1.append(float(1-state_0[-1]))
    else:
      state_1.append(dict_f2.get(frame.iloc[:,0][i],0)/dict_frame[frame.iloc[:,0][i]])
      state_0.append(float(1-state_1[-1])) 
  df3 = pd.DataFrame({'State_0':state_0, 'State_1':state_1})
  return df3.to_numpy()

def test_response(test):
  t_state_0, t_state_1 = [],[]
  for i in range(len(test)):
    if dict_frame.get(test[i]):
      t_state_0.append(dict_f1.get(test[i],0)/dict_frame.get(test[i]))
      t_state_1.append(dict_f2.get(test[i],0)/dict_frame.get(test[i]))
    else:
      t_state_0.append(0.5)
      t_state_1.append(0.5)
  df4 = pd.DataFrame({'State_0':t_state_0, 'State_1':t_state_1})
  return df4.to_numpy()  

In [24]:
#Utilizando response coding, uma técnica que representa a probabilidade de um valor pertencer a uma classe
X_train_resp_prod_cat = train_response(pd.concat([X_train['product_category_name'], y_train], axis=1).reset_index(drop=True))
X_test_resp_prod_cat = test_response(X_test['product_category_name'].values)

Codificando outras colunas

In [29]:

ohe_order_item = OneHotEncoder()
ohe_order_item.fit(X_train['order_item_id'].values.reshape(-1,1))
X_train_order_item = ohe_order_item.transform(X_train['order_item_id'].values.reshape(-1,1)).toarray()
X_test_order_item = ohe_order_item.transform(X_test['order_item_id'].values.reshape(-1,1)).toarray()

X_train_resp_payment_seq = train_response(pd.concat([X_train['payment_sequential'], y_train], axis=1).reset_index(drop=True))
X_test_resp_payment_seq = test_response(X_test['payment_sequential'].values)

ohe_payment_type = OneHotEncoder()
ohe_payment_type.fit(X_train['payment_type'].values.reshape(-1,1))
X_train_payment_type = ohe_payment_type.transform(X_train['payment_type'].values.reshape(-1,1)).toarray()
X_test_payment_type = ohe_payment_type.transform(X_test['payment_type'].values.reshape(-1,1)).toarray()

enc_price = OrdinalEncoder()
enc_price.fit(X_train['price_category'].values.reshape(-1,1))
enc_price.categories_ = [np.array([ 'cheap', 'affordable', 'expensive'], dtype=object)]
X_train_cat_price = enc_price.transform(X_train['price_category'].values.reshape(-1,1))
X_test_cat_price = enc_price.transform(X_test['price_category'].values.reshape(-1,1))

In [31]:
#Descomentar linha abaixo para baixar o modelo

# fasttext.util.download_model('pt', if_exists='ignore')
ft = fasttext.load_model('./cc.pt.300.bin')



In [32]:
ft.words

[',',
 'de',
 '.',
 'e',
 'a',
 '</s>',
 'o',
 'que',
 'do',
 'da',
 'em',
 ':',
 '"',
 ')',
 '(',
 'para',
 '/',
 'com',
 'um',
 'é',
 'uma',
 'no',
 'não',
 'os',
 'na',
 'por',
 '!',
 'mais',
 '-',
 'se',
 'as',
 'como',
 'O',
 'dos',
 "'",
 'A',
 'ao',
 'ou',
 '?',
 'foi',
 '...',
 'das',
 'sua',
 'ser',
 '|',
 'seu',
 'à',
 ';',
 'mas',
 '_',
 'são',
 'tem',
 'muito',
 '“',
 'sobre',
 '”',
 'pelo',
 'pela',
 'eu',
 'também',
 'nos',
 '–',
 '1',
 'está',
 'você',
 'já',
 '%',
 'ele',
 '$',
 'E',
 'até',
 'pode',
 'mesmo',
 'sem',
 'me',
 'anos',
 'entre',
 '2',
 'bem',
 'quando',
 'dia',
 'isso',
 'Em',
 'ainda',
 'todos',
 'seus',
 '#',
 'às',
 'ter',
 'fazer',
 'R',
 'nas',
 'Os',
 'São',
 'Não',
 '3',
 '10',
 'aos',
 'só',
 'era',
 'tempo',
 'vida',
 'É',
 'onde',
 'Brasil',
 'ela',
 'meu',
 'tudo',
 'sempre',
 'pessoas',
 'suas',
 'aqui',
 'DE',
 'minha',
 'No',
 'vai',
 'vez',
 '*',
 'ano',
 'outros',
 'forma',
 '+',
 'este',
 'assim',
 'parte',
 'apenas',
 'cada',
 'foram',
 

In [35]:
#Codificando vetor de tokens da biblioteca fasttext
embedding_matrix = np.zeros((len(ft.words), 300))
for i in tqdm(range(len(ft.words)), position=0, leave=True):
    embedding_matrix[i] = ft.get_word_vector(ft.words[i])

embedding_matrix.shape

100%|██████████████████████████████| 2000000/2000000 [01:32<00:00, 21586.38it/s]


(2000000, 300)

Limpeza dos dados textuais

In [39]:


sp = spacy.load('pt_core_news_sm')
all_stopwords = sp.Defaults.stop_words

def process_texts(texts): 

    processed_text = []
    dates = '^([0]?[1-9]|[1|2][0-9]|[3][0|1])[./-]([0]?[1-9]|[1][0-2])[./-]([0-9]{4}|[0-9]{2})$'
    
    for text in texts:
        text = re.sub(r'\r\n|\r|\n', ' ', text) 
        text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text) 
        text = re.sub(dates, ' ', text) 
        text = re.sub('[ \t]+$', '', text)
        text = re.sub('\W', ' ', text)
        text = re.sub('[0-9]+', ' ', text)
        text = re.sub('\s+', ' ', text)
        text = ' '.join(e for e in text.split() if e.lower() not in all_stopwords) 
        processed_text.append(text.lower().strip())
        
    return processed_text

In [42]:
#Juntando o vetor da fasttext com os comentários da revisão

X_train_comment_preprocess = process_texts(X_train['review_comment_message'])
X_test_comment_preprocess = process_texts(X_test['review_comment_message'])

X_train['embedded_review_comment_message'] = list(map(ft.get_word_vector,X_train_comment_preprocess))
X_test['embedded_review_comment_message'] = list(map(ft.get_word_vector,X_test_comment_preprocess))

In [43]:
strn = StandardScaler()
strn.fit(X_train[['price','freight_value','product_photos_qty','product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'payment_value','purchase-delivery difference','estimated-actual delivery difference','purchase_delivery_diff_per_price']])
X_train_strn = strn.transform(X_train[['price','freight_value','product_photos_qty','product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'payment_value','purchase-delivery difference','estimated-actual delivery difference','purchase_delivery_diff_per_price']])
X_test_strn = strn.transform(X_test[['price','freight_value','product_photos_qty','product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'payment_value','purchase-delivery difference','estimated-actual delivery difference','purchase_delivery_diff_per_price']])

In [44]:
#Concatenando
X_train_final = np.concatenate((X_train_strn,X_train_resp_prod_cat,X_train_order_item,
       X_train_resp_payment_seq,X_train_payment_type,X_train_cat_price,X_train['review_availability'].values.reshape(-1,1),
       np.vstack(X_train['embedded_review_comment_message'].values)), axis=1)

X_test_final = np.concatenate((X_test_strn,X_test_resp_prod_cat, X_test_order_item,
       X_test_resp_payment_seq,X_test_payment_type,X_test_cat_price,X_test['review_availability'].values.reshape(-1,1),
       np.vstack(X_test['embedded_review_comment_message'].values)), axis=1)

In [47]:
#Salvando
with open('x_train_final.pickle', 'wb') as handle:
    pickle.dump(X_train_final, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('x_test_final.pickle', 'wb') as handle:
    pickle.dump(X_test_final, handle, protocol=pickle.HIGHEST_PROTOCOL)