In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate, KFold

In [2]:
''' Read one hot encoded Orders dataset'''
# Path is relative to the notebook's working directory.
one_hot_csv_path = 'clustering_data_one_hot.csv'
final_one_hot_df = pd.read_csv(one_hot_csv_path)
# Preview the first rows to sanity-check the load.
final_one_hot_df.head(n=5)

Unnamed: 0,order_count,price,product_category_name_english_agro_industry_and_commerce,product_category_name_english_air_conditioning,product_category_name_english_art,product_category_name_english_arts_and_craftmanship,product_category_name_english_audio,product_category_name_english_auto,product_category_name_english_baby,product_category_name_english_bed_bath_table,...,customer_state_PR,customer_state_RJ,customer_state_RN,customer_state_RO,customer_state_RR,customer_state_RS,customer_state_SC,customer_state_SE,customer_state_SP,customer_state_TO
0,2,43.98,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2,114.89,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,59.99,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1,869.97,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,1,1390.0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [3]:
''' Read Orders dataset'''
# Non-encoded version of the data; path is relative to the notebook's
# working directory.
orders_csv_path = 'clustering_data.csv'
final_df = pd.read_csv(orders_csv_path)
# Preview the first rows to sanity-check the load.
final_df.head(n=5)

Unnamed: 0,product_category_name_english,order_purchase_year_month,customer_state,order_count,price
0,agro_industry_and_commerce,1,SP,2,43.98
1,agro_industry_and_commerce,2,MS,2,114.89
2,agro_industry_and_commerce,3,PR,1,59.99
3,agro_industry_and_commerce,5,PR,1,869.97
4,agro_industry_and_commerce,6,PR,1,1390.0


In [4]:
# Column-name prefixes that identify one-hot feature columns and one-hot
# target columns in final_one_hot_df.
feature_prefixes = ('product_category_name', 'order_purchase_year_month')
target_prefix = 'customer_state'

all_columns = final_one_hot_df.columns.values

# Features: the two numeric columns first, then every one-hot column whose
# name starts with a feature prefix, kept in the frame's column order.
x_cols = ['order_count', 'price'] + [
    name for name in all_columns if name.startswith(feature_prefixes)
]

# Targets: the one-hot customer_state_* columns (the prefixes are disjoint,
# so no feature column can land here).
y_cols = [name for name in all_columns if name.startswith(target_prefix)]



In [5]:
''' one hot encoded X '''
# Feature matrix: only the columns collected in x_cols.
X = final_one_hot_df[x_cols]
n_rows, n_features = X.shape
print((n_rows, n_features))
X.head(n=5)

(1262, 87)


Unnamed: 0,order_count,price,product_category_name_english_agro_industry_and_commerce,product_category_name_english_air_conditioning,product_category_name_english_art,product_category_name_english_arts_and_craftmanship,product_category_name_english_audio,product_category_name_english_auto,product_category_name_english_baby,product_category_name_english_bed_bath_table,...,order_purchase_year_month_3,order_purchase_year_month_4,order_purchase_year_month_5,order_purchase_year_month_6,order_purchase_year_month_7,order_purchase_year_month_8,order_purchase_year_month_9,order_purchase_year_month_10,order_purchase_year_month_11,order_purchase_year_month_12
0,2,43.98,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,114.89,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,59.99,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1,869.97,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1,1390.0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [6]:
# Target: the raw (non-encoded) customer_state label, kept as a
# single-column DataFrame taken from the non-encoded frame.
target_columns = ['customer_state']
y = final_df[target_columns]
print(y.shape)
y.head(n=5)

(1262, 1)


Unnamed: 0,customer_state
0,SP
1,MS
2,PR
3,PR
4,PR


In [7]:
# ''' one hot encoded y '''
# y = final_one_hot_df[y_cols]
# print(y.shape)
# y.head()

In [8]:
''' Getting multinomial naive bayes '''
# The estimator built here is MultinomialNB (not GaussianNB), created with
# its default hyper-parameters.
# NOTE(review): X also contains the continuous `price` column, while the
# multinomial model is normally used with count-like, non-negative
# features - confirm this model choice is intended.
mnb = MultinomialNB()

In [9]:
''' Leave one out CV'''
# Integer-encode the state labels; factorize returns (codes, uniques) and
# only the codes are needed as the classification target.
state_codes = pd.factorize(y.values.ravel())[0]

# LeaveOneOut() yields the same splits as KFold(n_splits=len(X)) without
# shuffling: every row is the single test sample exactly once.
# return_train_score=True is passed explicitly because newer scikit-learn
# releases no longer return the 'train_*' entries by default, and a later
# cell reads scores['train_accuracy'].
scores = cross_validate(
    mnb,
    X,
    state_codes,
    cv=LeaveOneOut(),
    scoring=['accuracy', 'precision_micro'],
    return_train_score=True,
)
scores



{'fit_time': array([ 0.01097035,  0.00399137,  0.00399065, ...,  0.00099683,
         0.00199342,  0.00199342]),
 'score_time': array([ 0.00099683,  0.00099874,  0.0009973 , ...,  0.00099826,
         0.00099778,  0.00099802]),
 'test_accuracy': array([ 0.,  0.,  0., ...,  1.,  1.,  1.]),
 'test_precision_micro': array([ 0.,  0.,  0., ...,  1.,  1.,  1.]),
 'train_accuracy': array([ 0.60428232,  0.60428232,  0.60348929, ...,  0.60348929,
         0.60348929,  0.60348929]),
 'train_precision_micro': array([ 0.60428232,  0.60428232,  0.60348929, ...,  0.60348929,
         0.60348929,  0.60348929])}

In [11]:
# Average of the per-fold *training* accuracy. The held-out (test) accuracy
# of each fold is stored separately under scores['test_accuracy'].
scores['train_accuracy'].mean()



0.60376955375893404

In [None]:

''' splitting train test '''
# Hold out 33% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
# Report the shapes in the order: train features, train labels,
# test features, test labels.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)


In [None]:
# y_train is a single-column DataFrame; flatten it to a 1-D label array
# before fitting.
train_labels = y_train.values.ravel()
mnb_model = mnb.fit(X_train, train_labels)

In [None]:
# Predict a state label for every held-out row and show the result's shape.
y_pred = mnb_model.predict(X_test)
print(np.shape(y_pred))

In [None]:
# Flatten the held-out labels to a 1-D array so they can be compared
# element-wise with y_pred.
# np.asarray accepts both the original single-column DataFrame and an
# already-converted ndarray, so this cell can be re-run safely; the previous
# `.values` access failed on a second run because y_test had been rebound
# to an ndarray.
y_test = np.asarray(y_test).ravel()
y_test.shape

In [None]:
# Fraction of held-out rows whose predicted label equals the true label.
is_correct = y_test == y_pred
np.mean(is_correct)

In [None]:
# Fraction of held-out rows whose predicted label differs from the true label.
is_wrong = y_test != y_pred
np.mean(is_wrong)