In [68]:
%config Completer.use_jedi = False

In [69]:
import pandas as pd
from etl_process import DataETLManager

etl_manager = DataETLManager(root_dir='./Data', csv_file='OnlineRetail.csv')
etl_manager.extract_data()
etl_manager.transform_data()

dataset = etl_manager.data_transfomed
dataset.head(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [None]:
# Time frame defined for the Life time value prediction: 8 months

# Identify the features we are using to build the ML model

# Calculate the LTV for training the ML Model

# Build and Run the model

In [70]:
# English Users:
from datetime import date, datetime

uk = dataset.query("Country=='United Kingdom'").reset_index(drop=True)

uk_users_3m = uk[(uk['InvoiceDate'].dt.date >= date(2010, 12, 1)) & (uk['InvoiceDate'].dt.date < date(2011, 3, 1))].reset_index(drop=True)
uk_users_8m = uk[(uk['InvoiceDate'].dt.date >= date(2011, 4, 1)) & (uk['InvoiceDate'].dt.date < date(2011, 12, 1))].reset_index(drop=True)

print('3 months: 90 Dyas: ', uk_users_3m['InvoiceDate'].max() - uk_users_3m['InvoiceDate'].min())
print('8 months: 240 Days: ', uk_users_8m['InvoiceDate'].max() - uk_users_8m['InvoiceDate'].min())

3 months: 90 Dyas:  89 days 08:38:00
8 months: 240 Days:  243 days 09:20:00


In [72]:
from sklearn.cluster import KMeans

customers = pd.DataFrame(uk_users_3m['CustomerID'].unique())
customers.columns = ['CustomerID']

# Generating Recency for the UK Users 3 month:

## Recency ##
aggregatR = {'InvoiceDate': 'max'}
customers['LastPurchaseDate'] = uk_users_3m.groupby(['CustomerID'], as_index=False).agg(aggregatR)['InvoiceDate']

## Frequency ##
aggregatF = {'InvoiceDate': 'count'}
freq = uk_users_3m.groupby('CustomerID', as_index=False).agg(aggregatF)

## MonetaryValue ##
uk_users_3m['Profit'] = uk_users_3m['UnitPrice'] * uk_users_3m['Quantity']
aggregatMV = {'Profit': 'sum'}
mv = uk_users_3m.groupby('CustomerID', as_index=False).agg(aggregatMV)

# Generating RFM Values
customers['Recency'] = (customers['LastPurchaseDate'].max() - customers['LastPurchaseDate']).dt.days
customers = pd.merge(customers, freq, on='CustomerID')
customers.columns = ['CustomerID', 'lastPurchase', 'Recency', 'Frequency']
customers = pd.merge(customers, mv, on='CustomerID')
customers.columns = ['CustomerID', 'lastPurchase', 'Recency', 'Frequency', 'MonetaryValue']

customers

Unnamed: 0,CustomerID,lastPurchase,Recency,Frequency,MonetaryValue
0,17850.0,2011-02-28 17:04:00,0,309,5303.48
1,13047.0,2011-01-18 10:17:00,41,44,822.58
2,13748.0,2011-01-20 14:01:00,39,1,204.00
3,15100.0,2011-02-28 13:31:00,0,6,635.10
4,15291.0,2011-01-17 12:34:00,42,35,1329.95
...,...,...,...,...,...
1568,14056.0,2010-12-08 13:38:00,82,125,582.37
1569,15656.0,2011-01-21 17:04:00,38,15,242.95
1570,16518.0,2010-12-16 15:39:00,74,11,190.70
1571,17165.0,2011-01-25 12:34:00,34,10,158.68


In [73]:
# Calculating the clusters:
#order cluster method
def order_cluster(cluster_field_name, target_field_name,df,ascending):
    new_cluster_field_name = 'new_' + cluster_field_name
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name,ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    df_final = pd.merge(df,df_new[[cluster_field_name,'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name],axis=1)
    df_final = df_final.rename(columns={"index":cluster_field_name})
    return df_final

# KMeans for Recency:
kmeans = KMeans(n_clusters=4)
kmeans.fit(customers[['Recency']])
customers['RecencyCluster'] = kmeans.predict(customers[['Recency']])

customers = order_cluster('RecencyCluster', 'Recency', customers, False)

# KMeans for Frequency
kmeans = KMeans(n_clusters=4)
kmeans.fit(customers[['Frequency']])
customers['FrequencyCluster'] = kmeans.predict(customers[['Frequency']])

customers = order_cluster('FrequencyCluster', 'Frequency', customers, False)

# KMeans for Monetary Value
kmeans = KMeans(n_clusters=4)
kmeans.fit(customers[['MonetaryValue']])
customers['MonetaryCluster'] = kmeans.predict(customers[['MonetaryValue']])

customers = order_cluster('MonetaryCluster', 'MonetaryValue', customers, False)

customers

Unnamed: 0,CustomerID,lastPurchase,Recency,Frequency,MonetaryValue,RecencyCluster,FrequencyCluster,MonetaryCluster
0,17850.0,2011-02-28 17:04:00,0,309,5303.48,3,2,2
1,13408.0,2011-02-10 12:32:00,18,91,5587.27,3,2,2
2,13767.0,2011-02-14 15:37:00,14,113,4130.47,3,2,2
3,13694.0,2011-02-15 10:41:00,13,133,10758.16,3,2,2
4,13798.0,2011-02-18 14:54:00,10,140,10494.99,3,2,2
...,...,...,...,...,...,...,...,...
1568,18102.0,2011-01-12 09:33:00,47,24,38370.09,1,3,1
1569,15749.0,2011-01-10 09:56:00,49,5,22998.40,1,3,1
1570,16029.0,2011-02-04 12:36:00,24,58,19362.27,2,3,1
1571,13089.0,2010-12-08 14:46:00,82,456,14933.02,0,1,1


In [74]:
# Forming the RFM Overall Score:
customers['RFMScore'] = customers['RecencyCluster'] + customers['FrequencyCluster'] + customers['MonetaryCluster']
customers['UserSegment'] = 'Low'

# User Classification regarding the RFM Score:
customers.loc[customers['RFMScore'] <= 2, 'UserSegment'] = 'Low'
customers.loc[customers['RFMScore'] > 2, 'UserSegment'] = 'Mid'
customers.loc[customers['RFMScore'] > 5, 'UserSegment'] = 'High'

customers

Unnamed: 0,CustomerID,lastPurchase,Recency,Frequency,MonetaryValue,RecencyCluster,FrequencyCluster,MonetaryCluster,RFMScore,UserSegment
0,17850.0,2011-02-28 17:04:00,0,309,5303.48,3,2,2,7,High
1,13408.0,2011-02-10 12:32:00,18,91,5587.27,3,2,2,7,High
2,13767.0,2011-02-14 15:37:00,14,113,4130.47,3,2,2,7,High
3,13694.0,2011-02-15 10:41:00,13,133,10758.16,3,2,2,7,High
4,13798.0,2011-02-18 14:54:00,10,140,10494.99,3,2,2,7,High
...,...,...,...,...,...,...,...,...,...,...
1568,18102.0,2011-01-12 09:33:00,47,24,38370.09,1,3,1,5,Mid
1569,15749.0,2011-01-10 09:56:00,49,5,22998.40,1,3,1,5,Mid
1570,16029.0,2011-02-04 12:36:00,24,58,19362.27,2,3,1,6,High
1571,13089.0,2010-12-08 14:46:00,82,456,14933.02,0,1,1,2,Low


In [32]:
uk_users_8m['Profit'] = uk_users_8m['UnitPrice'] * uk_users_8m['Quantity']

aggr = {'Profit': 'sum'}
customer_pred = uk_users_8m.groupby('CustomerID', as_index=False).agg(aggr)

customer_pred.columns = ['CustomerID', 'LTV']
customer_pred

Unnamed: 0,CustomerID,LTV
0,0.0,891564.90
1,12747.0,2437.42
2,12748.0,21475.36
3,12749.0,3105.14
4,12820.0,561.53
...,...,...
3452,18278.0,173.90
3453,18281.0,80.82
3454,18282.0,98.76
3455,18283.0,1522.83


In [75]:
# Positive correlation is quite clear between RFMScore and LTV: High RFMScore ==> HIgh LTV

# The Goal for 8Month LTV user prediction: Cluster the Users depending on LTV 
# After clustering them by LTV use classification segments as more actionable parameters:

# Clustering 8M users on LTV: => High LTV, Mid LTV, Low LTV 

# Filtering some outliers out:
customers_pred = customer_pred[customer_pred['LTV'] < customer_pred['LTV'].quantile(0.80)]

kmeans = KMeans(n_clusters=3)
kmeans.fit(customer_pred[['LTV']])
customer_pred['LTVCluster'] = kmeans.predict(customer_pred[['LTV']])

customer_pred = order_cluster('LTVCluster', 'LTV', customer_pred, False)

customer_pred

Unnamed: 0,CustomerID,LTV,LTVCluster
0,0.0,891564.90,0
1,12747.0,2437.42,2
2,12748.0,21475.36,2
3,12749.0,3105.14,2
4,12820.0,561.53,2
...,...,...,...
3452,18282.0,98.76,2
3453,18283.0,1522.83,2
3454,18287.0,1837.28,2
3455,17450.0,152742.31,1


In [35]:
customer_pred.groupby('LTVCluster').describe()

Unnamed: 0_level_0,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,CustomerID,LTV,LTV,LTV,LTV,LTV,LTV,LTV,LTV
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
LTVCluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,891564.9,,891564.9,891564.9,891564.9,891564.9,891564.9
1,2.0,17776.0,461.033621,17450.0,17613.0,17776.0,17939.0,18102.0,2.0,176207.925,33185.390982,152742.31,164475.1175,176207.925,187940.7325,199673.54
2,3454.0,15563.633758,1573.541667,12747.0,14212.5,15581.5,16920.5,18287.0,3454.0,1291.850647,3107.112699,-4287.63,266.9725,561.46,1291.7825,55388.17


In [36]:
# Merging the two sets, 3M, 8M:

customer_pred_clone = customer_pred.copy()

In [37]:
customer_pred_clone = customer_pred_clone.fillna(0)
customer_pred_clone = customer_pred_clone[:1573]

In [76]:
# Merging the two datasets:

customers = pd.merge(customers, customer_pred_clone, on='CustomerID', how='left')
customers = customers.fillna(0)

In [77]:
customers

Unnamed: 0,CustomerID,lastPurchase,Recency,Frequency,MonetaryValue,RecencyCluster,FrequencyCluster,MonetaryCluster,RFMScore,UserSegment,LTV,LTVCluster
0,17850.0,2011-02-28 17:04:00,0,309,5303.48,3,2,2,7,High,0.00,0.0
1,13408.0,2011-02-10 12:32:00,18,91,5587.27,3,2,2,7,High,19019.50,2.0
2,13767.0,2011-02-14 15:37:00,14,113,4130.47,3,2,2,7,High,11945.61,2.0
3,13694.0,2011-02-15 10:41:00,13,133,10758.16,3,2,2,7,High,40712.00,2.0
4,13798.0,2011-02-18 14:54:00,10,140,10494.99,3,2,2,7,High,20567.20,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1568,18102.0,2011-01-12 09:33:00,47,24,38370.09,1,3,1,5,Mid,0.00,0.0
1569,15749.0,2011-01-10 09:56:00,49,5,22998.40,1,3,1,5,Mid,0.00,0.0
1570,16029.0,2011-02-04 12:36:00,24,58,19362.27,2,3,1,6,High,0.00,0.0
1571,13089.0,2010-12-08 14:46:00,82,456,14933.02,0,1,1,2,Low,33387.18,2.0


In [78]:
# First step: Convert Categorical variables to Numerical,in this case UserSegment:
classification = pd.get_dummies(customers)
classification

Unnamed: 0,CustomerID,lastPurchase,Recency,Frequency,MonetaryValue,RecencyCluster,FrequencyCluster,MonetaryCluster,RFMScore,LTV,LTVCluster,UserSegment_High,UserSegment_Low,UserSegment_Mid
0,17850.0,2011-02-28 17:04:00,0,309,5303.48,3,2,2,7,0.00,0.0,1,0,0
1,13408.0,2011-02-10 12:32:00,18,91,5587.27,3,2,2,7,19019.50,2.0,1,0,0
2,13767.0,2011-02-14 15:37:00,14,113,4130.47,3,2,2,7,11945.61,2.0,1,0,0
3,13694.0,2011-02-15 10:41:00,13,133,10758.16,3,2,2,7,40712.00,2.0,1,0,0
4,13798.0,2011-02-18 14:54:00,10,140,10494.99,3,2,2,7,20567.20,2.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1568,18102.0,2011-01-12 09:33:00,47,24,38370.09,1,3,1,5,0.00,0.0,0,0,1
1569,15749.0,2011-01-10 09:56:00,49,5,22998.40,1,3,1,5,0.00,0.0,0,0,1
1570,16029.0,2011-02-04 12:36:00,24,58,19362.27,2,3,1,6,0.00,0.0,1,0,0
1571,13089.0,2010-12-08 14:46:00,82,456,14933.02,0,1,1,2,33387.18,2.0,0,1,0


In [40]:
# Second step: Display the Correlation between features:
matx_corr = classification.corr()
matx_corr

Unnamed: 0,CustomerID,Recency,Frequency,MonetaryValue,RecencyCluster,FrequencyCluster,MonetaryCluster,RFMScore,LTV,LTVCluster,UserSegment_High,UserSegment_Low,UserSegment_Mid
CustomerID,1.0,-0.037367,-0.23653,-0.23222,0.030354,0.000239,0.099134,0.045025,-0.271961,-0.688184,0.023359,-0.195331,0.012898
Recency,-0.037367,1.0,0.039431,0.035649,-0.974345,0.049241,0.019511,-0.906539,0.038206,0.004862,-0.25873,0.05679,0.252254
Frequency,-0.23653,0.039431,1.0,0.979672,-0.030741,-0.242802,-0.379636,-0.160307,0.992259,-0.015151,-0.145236,0.713716,0.013723
MonetaryValue,-0.23222,0.035649,0.979672,1.0,-0.027526,-0.260662,-0.522037,-0.18559,0.979645,-0.008873,-0.183889,0.719465,0.051921
RecencyCluster,0.030354,-0.974345,-0.030741,-0.027526,1.0,-0.057297,-0.02513,0.927648,-0.02911,-0.00303,0.254623,-0.048108,-0.249709
FrequencyCluster,0.000239,0.049241,-0.242802,-0.260662,-0.057297,1.0,0.344075,0.286861,-0.231403,-0.022547,0.469037,-0.229267,-0.433606
MonetaryCluster,0.099134,0.019511,-0.379636,-0.522037,-0.02513,0.344075,1.0,0.237445,-0.406691,-0.022381,0.251267,-0.419324,-0.176677
RFMScore,0.045025,-0.906539,-0.160307,-0.18559,0.927648,0.286861,0.237445,1.0,-0.159927,-0.012952,0.416099,-0.179392,-0.389169
LTV,-0.271961,0.038206,0.992259,0.979645,-0.02911,-0.231403,-0.406691,-0.159927,1.0,0.031213,-0.146132,0.726029,0.012325
LTVCluster,-0.688184,0.004862,-0.015151,-0.008873,-0.00303,-0.022547,-0.022381,-0.012952,0.031213,1.0,0.000449,0.010267,-0.002382


In [41]:
# Show correlation in terms of LVTClusters:
matx_corr['LTVCluster']

CustomerID         -0.688184
Recency             0.004862
Frequency          -0.015151
MonetaryValue      -0.008873
RecencyCluster     -0.003030
FrequencyCluster   -0.022547
MonetaryCluster    -0.022381
RFMScore           -0.012952
LTV                 0.031213
LTVCluster          1.000000
UserSegment_High    0.000449
UserSegment_Low     0.010267
UserSegment_Mid    -0.002382
Name: LTVCluster, dtype: float64

In [42]:
# Defining our X, Y training and testing:
X = classification.drop(['LTV', 'LTVCluster', 'lastPurchase'], axis=1)
Y = classification['LTVCluster'] # Target

Unnamed: 0,CustomerID,lastPurchase,Recency,Frequency,MonetaryValue,RecencyCluster,FrequencyCluster,MonetaryCluster,RFMScore,UserSegment_High,UserSegment_Low,UserSegment_Mid
0,17850.0,2011-02-28 17:04:00,0,309,5303.48,3,2,2,7,1,0,0
1,13767.0,2011-02-14 15:37:00,14,113,4130.47,3,2,2,7,1,0,0
2,13694.0,2011-02-15 10:41:00,13,133,10758.16,3,2,2,7,1,0,0
3,13798.0,2011-02-18 14:54:00,10,140,10494.99,3,2,2,7,1,0,0
4,14680.0,2011-02-11 15:43:00,17,111,8149.14,3,2,2,7,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1568,18102.0,2011-01-12 09:33:00,47,24,38370.09,1,3,1,5,0,0,1
1569,15749.0,2011-01-10 09:56:00,49,5,22998.40,1,3,1,5,0,0,1
1570,16029.0,2011-02-04 12:36:00,24,58,19362.27,2,3,1,6,1,0,0
1571,13089.0,2010-12-08 14:46:00,82,456,14933.02,0,1,1,2,0,1,0


In [79]:
# Setting up the project and Initializing: 
neptune.init(project_qualified_name='aymane.hachcham/CaseStudyOnlineRetail')

Project(aymane.hachcham/CaseStudyOnlineRetail)

In [80]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split

# Creating the Neptune experiment for XGboost training:
import neptune
from neptunecontrib.monitoring.xgboost import neptune_callback

params = {
    'max_depth':5,
    'learning_rate':0.1,
    'objective': 'multi:softprob',
    'n_jobs':-1, 
    'num_class':3
}

neptune.create_experiment(
    name='XGBoost-V4',
    tags=['XGBoost', 'Version4'],
    params=params
)

https://ui.neptune.ai/aymane.hachcham/CaseStudyOnlineRetail/e/CAS-22


Experiment(CAS-22)

In [81]:
# Split the Data in two sets: Train and Eval
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.05, random_state=56) 

In [82]:
# DMatrix for XGBoost:
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

In [83]:
watchlist = [(dtest, 'test'), (dtrain, 'train')]
num_round = 20

In [48]:
# Traininig the Model:
# Three types of training: Using xgb.train, xgb.cv, XGB sklearn API

# 1 Method: using xgb.train():
xgb.train(params, dtrain, num_round, watchlist, callbacks=[neptune_callback()])

neptune.stop()



[0]	test-mlogloss:0.98289	train-mlogloss:0.98204
[1]	test-mlogloss:0.88648	train-mlogloss:0.88520
[2]	test-mlogloss:0.80443	train-mlogloss:0.80322
[3]	test-mlogloss:0.73527	train-mlogloss:0.73265
[4]	test-mlogloss:0.67513	train-mlogloss:0.67152
[5]	test-mlogloss:0.62250	train-mlogloss:0.61840
[6]	test-mlogloss:0.57609	train-mlogloss:0.57182
[7]	test-mlogloss:0.53671	train-mlogloss:0.53073
[8]	test-mlogloss:0.50204	train-mlogloss:0.49447
[9]	test-mlogloss:0.47121	train-mlogloss:0.46236
[10]	test-mlogloss:0.44302	train-mlogloss:0.43352
[11]	test-mlogloss:0.41765	train-mlogloss:0.40795
[12]	test-mlogloss:0.39478	train-mlogloss:0.38500
[13]	test-mlogloss:0.37555	train-mlogloss:0.36477
[14]	test-mlogloss:0.35648	train-mlogloss:0.34644
[15]	test-mlogloss:0.33935	train-mlogloss:0.32934
[16]	test-mlogloss:0.32584	train-mlogloss:0.31426
[17]	test-mlogloss:0.31346	train-mlogloss:0.30016
[18]	test-mlogloss:0.30398	train-mlogloss:0.28772
[19]	test-mlogloss:0.29295	train-mlogloss:0.27653


In [57]:
# Create a second experiment with sklearn API:
params3 = {
    'max_depth':5,
    'learning_rate':0.1,
    'objective': 'multi:softprob',
    'n_jobs':-1, 
    'num_class':3, 
    'eta':0.5,
    'gamma': 0.1,
    'lambda':1,
    'alpha':0.35, 
}

neptune.create_experiment(
    name='XGBoost-V3',
    tags=['XGBoost', 'Version3', 'Sklearn Api'],
    params=params3
)

https://ui.neptune.ai/aymane.hachcham/CaseStudyOnlineRetail/e/CAS-11


Experiment(CAS-11)

In [84]:
# Generate the XGBoost Classifier with xgb object:
multi_class_XGB = xgb.XGBClassifier(**params3)
multi_class_XGB.fit(
    x_train, 
    y_train, 
    eval_set=[(x_test, y_test)],
    callbacks=[neptune_callback()])

neptune.stop()

[0]	validation_0-mlogloss:0.98311
[1]	validation_0-mlogloss:0.88658
[2]	validation_0-mlogloss:0.80496




[3]	validation_0-mlogloss:0.73410
[4]	validation_0-mlogloss:0.67372
[5]	validation_0-mlogloss:0.62098
[6]	validation_0-mlogloss:0.57495
[7]	validation_0-mlogloss:0.53604
[8]	validation_0-mlogloss:0.50182
[9]	validation_0-mlogloss:0.47030
[10]	validation_0-mlogloss:0.44326
[11]	validation_0-mlogloss:0.41767
[12]	validation_0-mlogloss:0.39525
[13]	validation_0-mlogloss:0.37578
[14]	validation_0-mlogloss:0.36005
[15]	validation_0-mlogloss:0.34358
[16]	validation_0-mlogloss:0.32915
[17]	validation_0-mlogloss:0.31717
[18]	validation_0-mlogloss:0.30602
[19]	validation_0-mlogloss:0.29689
[20]	validation_0-mlogloss:0.28928
[21]	validation_0-mlogloss:0.28212
[22]	validation_0-mlogloss:0.27509
[23]	validation_0-mlogloss:0.26834
[24]	validation_0-mlogloss:0.26159
[25]	validation_0-mlogloss:0.25633
[26]	validation_0-mlogloss:0.25182
[27]	validation_0-mlogloss:0.24758
[28]	validation_0-mlogloss:0.24366
[29]	validation_0-mlogloss:0.23989
[30]	validation_0-mlogloss:0.23748
[31]	validation_0-mlogloss:

In [62]:
# print Accuracy on training and testing datasets:
print('Accuracy on Trainig Set: ', multi_class_XGB.score(x_train, y_train))
print('Accuracy on Testing Set: ', multi_class_XGB.score(x_test[x_train.columns], y_test))

Accuracy on Trainig Set:  0.9598393574297188
Accuracy on Testing Set:  0.9113924050632911


In [65]:
# Check the predictions:
predict = multi_class_XGB.predict(x_test)
predict

array([0., 0., 0., 0., 2., 2., 0., 0., 0., 0., 2., 2., 2., 2., 2., 0., 0.,
       2., 2., 0., 0., 2., 2., 2., 0., 0., 2., 0., 2., 0., 0., 0., 2., 0.,
       0., 0., 0., 2., 2., 2., 0., 2., 0., 2., 0., 2., 2., 0., 2., 0., 0.,
       0., 0., 0., 2., 0., 0., 0., 0., 0., 0., 0., 2., 2., 2., 2., 0., 2.,
       0., 2., 0., 0., 2., 2., 2., 2., 0., 2., 0.])

In [67]:
# Check the classification report: 
from sklearn.metrics import classification_report,confusion_matrix

print(classification_report(y_test, predict))

              precision    recall  f1-score   support

         0.0       0.98      0.88      0.92        49
         2.0       0.83      0.97      0.89        30

    accuracy                           0.91        79
   macro avg       0.90      0.92      0.91        79
weighted avg       0.92      0.91      0.91        79

