In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

### Features Data Types

In [2]:
column_types_X = {'z_age': 'float64', 'z_census_household_1p_pct': 'float64', 'z_census_education_high_pct': 'float64', 
                'z_census_purchase_household': 'float64', 'z_census_purchase_capita': 'float64', 
                'z_census_household_cnt': 'float64', 'multiplay_cnt': 'float64', 'z_line_cnt': 'float64', 
                'z_sim_cnt': 'float64', 'fixed_prod_cat1_ind': 'float64', 'tenure_fixed_month': 'float64', 
                'tenure_mobile_month': 'float64', 'z_line_voice_cat1_cnt': 'float64', 'fixed_data_cat1_ind': 'float64', 
                'fixed_data_cat2_ind': 'float64', 'z_fixed_prod_cat2_cnt': 'float64', 'z_fixed_prod_cat1_cnt': 'float64', 
                'z_fixed_data_cat3_cnt': 'float64', 'fixed_prod_cat3_cnt': 'float64', 'device_smartphone_cnt': 'float64', 
                'z_mobile_voice_cat1_cnt': 'float64', 'z_mobile_data_cat1_cnt': 'float64', 'mobile_data_cat2_cnt': 'float64', 
                'z_mobile_voice_cat3_cnt': 'float64', 'z_mobile_data_cat3_cnt': 'float64', 'z_usg_fv_3m_avg': 'float64', 
                'z_usg_fd_mb_1m_sum': 'float64', 'z_usg_fd_mb_3m_avg': 'float64', 'z_usg_mv_ib_a_3m_avg': 'float64', 
                'z_usg_md_sms_ib_a_3m_avg': 'float64', 'z_usg_md_ib_mb_3m_avg': 'float64', 
                'payment_method_cash_cnt': 'float64', 'z_rev_1m_sum': 'float64', 'z_device_netcube_cnt': 'float64', 
                'z_tariff_netcube_cnt': 'float64', 'z_min_Prog_Max_BB_Down': 'float64', 'z_line_Fib2h_CNT': 'float64', 
                'z_min_Speed_Product_KBit': 'float64', 'z_Max_Speed_Missing_KBit': 'float64', 
                'z_Min_Speed_Reserve_KBit': 'float64', 'z_Max_DSL_OOS_PCT': 'float64', 'z_PR_Relocation_CNT': 'float64', 
                'z_PR_Relocation_Days': 'float64', 'z_PR_ActivationSupportOpt_CNT': 'float64', 
                'z_PR_ActivationSupportOpt_Days': 'float64', 'z_PR_DeactivationThreat_CNT': 'float64', 
                'z_PR_DeactivationSupport_CNT': 'float64', 'z_PR_DeactivationProdOpt_CNT': 'float64', 
                'z_PR_DeactivationProdOpt_Days': 'float64', 'z_PR_OtherWOTopic_CNT': 'float64', 
                'z_PR_OtherWOTopic_Days': 'float64', 'z_PR_AddressChange_CNT': 'float64', 
                'z_PR_AddressChange_Days': 'float64', 'z_PR_ServiceDisruption_CNT': 'float64', 
                'z_PR_ServiceDisruption_Days': 'float64', 'z_PR_BasketSupport_CNT': 'float64', 
                'z_PR_BasketSupport_Days': 'float64', 'z_PR_SellingSalesSupport_CNT': 'float64', 
                'z_PR_SellingSalesSupport_Days': 'float64', 'z_PR_DigitalUsage_CNT': 'float64', 
                'z_PR_DigitalUsage_Days': 'float64', 'z_TNPS_Last_Days': 'float64', 'z_TNPS_Score_Avg': 'float64', 
                'province_cd_A': 'float64', 'province_cd_B': 'float64', 'province_cd_C': 'float64', 
                'province_cd_D': 'float64', 'province_cd_E': 'float64', 'province_cd_F': 'float64', 'province_cd_G': 'float64', 
                'province_cd_H': 'float64', 'province_cd_I': 'float64', 'Gender_CD_F': 'float64', 
                'prod_monodual_cd_D': 'float64', 'customer_value_cd_cat': 'float64'}

column_types_y = {'target_ind': 'float64'}

### Load Data

In [3]:
X_train = pd.read_csv('X_train.csv', dtype=column_types_X)
y_train = pd.read_csv('y_train.csv', dtype=column_types_y)
X_train['target_ind'] = y_train['target_ind']

X_val = pd.read_csv('X_val.csv', dtype=column_types_X)
y_val = pd.read_csv('y_val.csv', dtype=column_types_y)

### 80:20 Upsampling of Minority Class

In [4]:
# Seperate dataset by class
df_minority  = X_train[X_train['target_ind']==1]
df_majority = X_train[X_train['target_ind']==0]

# Upsample minority class
df_minority = df_minority.sample(n=int(0.25*len(df_majority)), replace=True)

# Merge the two classes into one dataset
df = pd.concat([df_majority,df_minority])

# Shuffle the dataset
df = df.sample(frac=1)
X_df = df.drop(['target_ind'], axis=1)
y_df = df[['target_ind']]

#### Random Forest (Unweighted Classes - 100 Estimators)

In [5]:
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(n_jobs=-1)

In [6]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))))

Training MCC: 0.9999994605319976
Validation MCC: 0.47204549139170515


#### Random Forest (Unweighted Classes - 200 Estimators)

In [8]:
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=200)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(n_estimators=200, n_jobs=-1)

In [9]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))))

Training MCC: 1.0
Validation MCC: 0.47341049893467363


#### Random Forest (Unweighted Classes - 300 Estimators)

In [11]:
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=300)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(n_estimators=300, n_jobs=-1)

In [12]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))))

Training MCC: 1.0
Validation MCC: 0.47595010365911183


#### Random Forest (Unweighted Classes - 400 Estimators)

In [14]:
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=400)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(n_estimators=400, n_jobs=-1)

In [15]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))))

Training MCC: 1.0
Validation MCC: 0.47682963114355886


#### Random Forest (Unweighted Classes - 500 Estimators)

In [17]:
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=500)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(n_estimators=500, n_jobs=-1)

In [18]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))))

Training MCC: 1.0
Validation MCC: 0.4764575240366148


#### AdaBoost (100 Estimators)

In [19]:
ada = AdaBoostClassifier(n_estimators=100)
ada.fit(X_df, y_df['target_ind'])

AdaBoostClassifier(n_estimators=100)

In [20]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], ada.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], ada.predict(X_val))))

Training MCC: 0.24658057752900667
Validation MCC: 0.12162880357289275


#### AdaBoost (200 Estimators)

In [22]:
ada = AdaBoostClassifier(n_estimators=200)
ada.fit(X_df, y_df['target_ind'])

AdaBoostClassifier(n_estimators=200)

In [23]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], ada.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], ada.predict(X_val))))

Training MCC: 0.24920923433425915
Validation MCC: 0.1234940749595196


#### AdaBoost (300 Estimators)

In [25]:
ada = AdaBoostClassifier(n_estimators=300)
ada.fit(X_df, y_df['target_ind'])

AdaBoostClassifier(n_estimators=300)

In [26]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], ada.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], ada.predict(X_val))))

Training MCC: 0.25152901154854757
Validation MCC: 0.12401036412678232


#### AdaBoost (400 Estimators)

In [28]:
ada = AdaBoostClassifier(n_estimators=400)
ada.fit(X_df, y_df['target_ind'])

AdaBoostClassifier(n_estimators=400)

In [29]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], ada.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], ada.predict(X_val))))

Training MCC: 0.25193854060545084
Validation MCC: 0.12467735127454874


#### AdaBoost (500 Estimators)

In [31]:
ada = AdaBoostClassifier(n_estimators=500)
ada.fit(X_df, y_df['target_ind'])

AdaBoostClassifier(n_estimators=500)

In [32]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], ada.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], ada.predict(X_val))))

Training MCC: 0.2528830453658682
Validation MCC: 0.12452620779844584


#### Gradient Boosting

In [33]:
# gb = GradientBoostingClassifier()
# gb.fit(X_df, y_df['target_ind'])

In [34]:
# print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], gb.predict(X_df))))
# print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], gb.predict(X_val))))

#### kNN (Uniform Weights)

In [35]:
# knn = KNeighborsClassifier(n_jobs=-1)
# knn.fit(X_df, y_df['target_ind'])

In [36]:
# print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], knn.predict(X_df))))
# print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], knn.predict(X_val))))

#### kNN (Distance Weights - k=1)

In [37]:
knn_d = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=1)
knn_d.fit(X_df, y_df['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=1, weights='distance')

In [38]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], knn_d.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))))

Training MCC: 1.0
Validation MCC: 0.64689759936139


#### kNN (Distance Weights - k=2)

In [40]:
knn_d = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=2)
knn_d.fit(X_df, y_df['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=2, weights='distance')

In [41]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], knn_d.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))))

Training MCC: 1.0
Validation MCC: 0.64689759936139


#### kNN (Distance Weights - k=3)

In [43]:
knn_d = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=3)
knn_d.fit(X_df, y_df['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=3, weights='distance')

In [44]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], knn_d.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))))

Training MCC: 1.0
Validation MCC: 0.6065598034231751


#### kNN (Distance Weights - k=4)

In [46]:
knn_d = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=4)
knn_d.fit(X_df, y_df['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=4, weights='distance')

In [47]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], knn_d.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))))

Training MCC: 1.0
Validation MCC: 0.5985252208160837


#### kNN (Distance Weights - k=5)

In [49]:
knn_d = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=5)
knn_d.fit(X_df, y_df['target_ind'])

KNeighborsClassifier(n_jobs=-1, weights='distance')

In [50]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], knn_d.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))))

Training MCC: 1.0
Validation MCC: 0.554950390164101


#### kNN (Distance Weights - k=6)

In [52]:
knn_d = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=6)
knn_d.fit(X_df, y_df['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=6, weights='distance')

In [53]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], knn_d.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))))

Training MCC: 1.0
Validation MCC: 0.5285117221838012


#### kNN (Distance Weights - k=7)

In [55]:
knn_d = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=7)
knn_d.fit(X_df, y_df['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=7, weights='distance')

In [56]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], knn_d.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))))

Training MCC: 1.0
Validation MCC: 0.4995263457434928


#### kNN (Distance Weights - k=8)

In [58]:
knn_d = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=8)
knn_d.fit(X_df, y_df['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=8, weights='distance')

In [59]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], knn_d.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))))

Training MCC: 1.0
Validation MCC: 0.4788475411385017


#### kNN (Distance Weights - k=9)

In [5]:
knn_d = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=9)
knn_d.fit(X_df, y_df['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=9, weights='distance')

In [6]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], knn_d.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))))

Training MCC: 1.0
Validation MCC: 0.4564187809045722


#### kNN (Distance Weights - k=10)

In [7]:
knn_d = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=10)
knn_d.fit(X_df, y_df['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=10, weights='distance')

In [8]:
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], knn_d.predict(X_df))))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))))

Training MCC: 1.0
Validation MCC: 0.4382415483835823
