In [23]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pickle

from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

# Let pandas render up to 100 rows and 100 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

### Features Data Types

In [2]:
# dtype map handed to pd.read_csv for the feature files: every listed column is
# read as float64. The order of the names matches the original mapping.
column_types_X = dict.fromkeys(
    [
        'z_age',
        'z_census_household_1p_pct',
        'z_census_education_high_pct',
        'z_census_purchase_household',
        'z_census_purchase_capita',
        'z_census_household_cnt',
        'multiplay_cnt',
        'z_line_cnt',
        'z_sim_cnt',
        'fixed_prod_cat1_ind',
        'tenure_fixed_month',
        'tenure_mobile_month',
        'z_line_voice_cat1_cnt',
        'fixed_data_cat1_ind',
        'fixed_data_cat2_ind',
        'z_fixed_prod_cat2_cnt',
        'z_fixed_prod_cat1_cnt',
        'z_fixed_data_cat3_cnt',
        'fixed_prod_cat3_cnt',
        'device_smartphone_cnt',
        'z_mobile_voice_cat1_cnt',
        'z_mobile_data_cat1_cnt',
        'mobile_data_cat2_cnt',
        'z_mobile_voice_cat3_cnt',
        'z_mobile_data_cat3_cnt',
        'z_usg_fv_3m_avg',
        'z_usg_fd_mb_1m_sum',
        'z_usg_fd_mb_3m_avg',
        'z_usg_mv_ib_a_3m_avg',
        'z_usg_md_sms_ib_a_3m_avg',
        'z_usg_md_ib_mb_3m_avg',
        'payment_method_cash_cnt',
        'z_rev_1m_sum',
        'z_device_netcube_cnt',
        'z_tariff_netcube_cnt',
        'z_min_Prog_Max_BB_Down',
        'z_line_Fib2h_CNT',
        'z_min_Speed_Product_KBit',
        'z_Max_Speed_Missing_KBit',
        'z_Min_Speed_Reserve_KBit',
        'z_Max_DSL_OOS_PCT',
        'z_PR_Relocation_CNT',
        'z_PR_Relocation_Days',
        'z_PR_ActivationSupportOpt_CNT',
        'z_PR_ActivationSupportOpt_Days',
        'z_PR_DeactivationThreat_CNT',
        'z_PR_DeactivationSupport_CNT',
        'z_PR_DeactivationProdOpt_CNT',
        'z_PR_DeactivationProdOpt_Days',
        'z_PR_OtherWOTopic_CNT',
        'z_PR_OtherWOTopic_Days',
        'z_PR_AddressChange_CNT',
        'z_PR_AddressChange_Days',
        'z_PR_ServiceDisruption_CNT',
        'z_PR_ServiceDisruption_Days',
        'z_PR_BasketSupport_CNT',
        'z_PR_BasketSupport_Days',
        'z_PR_SellingSalesSupport_CNT',
        'z_PR_SellingSalesSupport_Days',
        'z_PR_DigitalUsage_CNT',
        'z_PR_DigitalUsage_Days',
        'z_TNPS_Last_Days',
        'z_TNPS_Score_Avg',
        'province_cd_A',
        'province_cd_B',
        'province_cd_C',
        'province_cd_D',
        'province_cd_E',
        'province_cd_F',
        'province_cd_G',
        'province_cd_H',
        'province_cd_I',
        'Gender_CD_F',
        'prod_monodual_cd_D',
        'customer_value_cd_cat',
    ],
    'float64',
)

# dtype map for the label files: the single target column, also read as float64.
column_types_y = dict(target_ind='float64')

### Load Data

In [3]:
# Read each split as a (features, labels) pair, applying the float64 dtype maps.
# The *_df pair is the one the models below are fitted on.
X_df, y_df = (
    pd.read_csv('X_df.csv', dtype=column_types_X),
    pd.read_csv('y_df.csv', dtype=column_types_y),
)

# Held-out test split.
X_test, y_test = (
    pd.read_csv('X_test.csv', dtype=column_types_X),
    pd.read_csv('y_test.csv', dtype=column_types_y),
)

# Validation split.
X_val, y_val = (
    pd.read_csv('X_val.csv', dtype=column_types_X),
    pd.read_csv('y_val.csv', dtype=column_types_y),
)

#### kNN (Uniform - Original Data k = 1)

In [5]:
# 1-nearest-neighbour classifier (default uniform weighting), using all CPU cores.
knn = KNeighborsClassifier(
    n_neighbors=1,
    n_jobs=-1,
)
# Fit on the training features against the `target_ind` label column.
knn.fit(X=X_df, y=y_df['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=1)

In [6]:
# Report MCC on the train / test / validation splits plus test accuracy for kNN.
# The test split is predicted once and reused: `knn.score` would otherwise run a
# second full prediction pass over X_test just to compute accuracy.
# NOTE(review): in the recorded run validation MCC (~0.65) was far above test MCC
# (~0.02); worth confirming X_val shares no rows with the training data.
knn_test_pred = knn.predict(X_test)
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], knn.predict(X_df))))
print("Test MCC: " + str(matthews_corrcoef(y_test['target_ind'], knn_test_pred)))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], knn.predict(X_val))))
# A classifier's .score() is mean accuracy, so this prints the same value as before.
print("Test Acc: " + str(accuracy_score(y_test['target_ind'], knn_test_pred)))

Training MCC: 1.0
Test MCC: 0.022159037654062165
Validation MCC: 0.64689759936139
Test Acc: 0.9635831339926452


In [7]:
# Test-set confusion matrix for kNN (rows: true class, columns: predicted class).
print(
    confusion_matrix(
        y_true=y_test['target_ind'],
        y_pred=knn.predict(X_test),
    )
)

[[404511   7222]
 [  8078    324]]


#### Random Forest (Unweighted Classes - 100 Estimators, 8 Features)

In [24]:
# Unweighted random forest: 100 trees, 8 candidate features per split, all cores.
# random_state pins the bootstrap sampling and per-split feature sub-sampling so
# the fitted forest (and every metric derived from it) is reproducible after a
# kernel restart; without it each run produces a different model.
rf = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=8, random_state=42)
rf.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=8, n_jobs=-1)

In [25]:
# Report MCC on the train / test / validation splits plus test accuracy for the
# unweighted random forest. The test split is predicted once and reused, instead
# of letting `rf.score` run a second prediction pass over X_test.
rf_test_pred = rf.predict(X_test)
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], rf.predict(X_df))))
print("Test MCC: " + str(matthews_corrcoef(y_test['target_ind'], rf_test_pred)))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], rf.predict(X_val))))
# A classifier's .score() is mean accuracy, so this prints the same value as before.
print("Test Acc: " + str(accuracy_score(y_test['target_ind'], rf_test_pred)))

Training MCC: 1.0
Test MCC: 0.11837048720728412
Validation MCC: 0.47409293746347475
Test Acc: 0.9801516179323313


In [26]:
# Test-set confusion matrix for the unweighted random forest
# (rows: true class, columns: predicted class).
print(
    confusion_matrix(
        y_true=y_test['target_ind'],
        y_pred=rf.predict(X_test),
    )
)

[[411586    147]
 [  8192    210]]


#### Random Forest (Weighted Classes - 100 Estimators, 8 Features)

In [11]:
# Class-weighted random forest: same shape as the unweighted one (100 trees,
# 8 candidate features per split) but with class_weight='balanced'.
# random_state pins the bootstrap sampling and feature sub-sampling so the fit
# is reproducible after a kernel restart and comparable between runs.
rfc = RandomForestClassifier(
    n_jobs=-1,
    n_estimators=100,
    max_features=8,
    class_weight='balanced',
    random_state=42,
)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(class_weight='balanced', max_features=8, n_jobs=-1)

In [12]:
# Report MCC on the train / test / validation splits plus test accuracy for the
# class-weighted random forest. The test split is predicted once and reused,
# instead of letting `rfc.score` run a second prediction pass over X_test.
rfc_test_pred = rfc.predict(X_test)
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))))
print("Test MCC: " + str(matthews_corrcoef(y_test['target_ind'], rfc_test_pred)))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))))
# A classifier's .score() is mean accuracy, so this prints the same value as before.
print("Test Acc: " + str(accuracy_score(y_test['target_ind'], rfc_test_pred)))

Training MCC: 1.0
Test MCC: 0.11652817921169854
Validation MCC: 0.44980001285310495
Test Acc: 0.9801516179323313


In [13]:
# Test-set confusion matrix for the class-weighted random forest
# (rows: true class, columns: predicted class).
print(
    confusion_matrix(
        y_true=y_test['target_ind'],
        y_pred=rfc.predict(X_test),
    )
)

[[411594    139]
 [  8200    202]]


#### AdaBoost - Max Depth = 1 Base

In [14]:
# AdaBoost over depth-1 decision trees (stumps), 100 boosting rounds.
# The base learner is passed positionally because its keyword was renamed
# (`base_estimator` -> `estimator`) in later scikit-learn releases; it is the
# first parameter under either name, so this form works on both.
# random_state makes the fit repeatable across kernel restarts.
ada = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    random_state=42,
)
ada.fit(X_df, y_df['target_ind'])

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   n_estimators=100)

In [15]:
# Report MCC on the train / test / validation splits plus test accuracy for
# AdaBoost. The test split is predicted once and reused, instead of letting
# `ada.score` run a second prediction pass over X_test.
ada_test_pred = ada.predict(X_test)
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], ada.predict(X_df))))
print("Test MCC: " + str(matthews_corrcoef(y_test['target_ind'], ada_test_pred)))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], ada.predict(X_val))))
# A classifier's .score() is mean accuracy, so this prints the same value as before.
print("Test Acc: " + str(accuracy_score(y_test['target_ind'], ada_test_pred)))

Training MCC: 0.24680135272748138
Test MCC: 0.12806107960256968
Validation MCC: 0.12215095521101325
Test Acc: 0.9653420924226737


In [16]:
# Test-set confusion matrix for AdaBoost (rows: true class, columns: predicted class).
print(
    confusion_matrix(
        y_true=y_test['target_ind'],
        y_pred=ada.predict(X_test),
    )
)

[[404332   7401]
 [  7160   1242]]


#### Gaussian Naive Bayes

In [17]:
# Gaussian naive Bayes with library defaults, fitted on the training split
# against the `target_ind` label column.
gnb = GaussianNB()
gnb.fit(X=X_df, y=y_df['target_ind'])

GaussianNB()

In [18]:
# Report MCC on the train / test / validation splits plus test accuracy for
# Gaussian naive Bayes. The test predictions are computed once and shared by the
# MCC and accuracy lines instead of predicting X_test twice.
gnb_test_pred = gnb.predict(X_test)
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], gnb.predict(X_df))))
print("Test MCC: " + str(matthews_corrcoef(y_test['target_ind'], gnb_test_pred)))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], gnb.predict(X_val))))
# A classifier's .score() is mean accuracy, so this prints the same value as before.
print("Test Acc: " + str(accuracy_score(y_test['target_ind'], gnb_test_pred)))

Training MCC: 0.16685748071357595
Test MCC: 0.0651466060571797
Validation MCC: 0.060537939496191635
Test Acc: 0.8232234876884811


In [19]:
# Test-set confusion matrix for Gaussian naive Bayes
# (rows: true class, columns: predicted class).
print(
    confusion_matrix(
        y_true=y_test['target_ind'],
        y_pred=gnb.predict(X_test),
    )
)

[[342992  68741]
 [  5529   2873]]


#### Dummy

In [20]:
# Baseline that always predicts the most frequent training label; the other
# models' scores can be compared against it.
dummy_clf = DummyClassifier(
    strategy="most_frequent",
)
dummy_clf.fit(X=X_df, y=y_df['target_ind'])

DummyClassifier(strategy='most_frequent')

In [21]:
# Report MCC on the train / test / validation splits plus test accuracy for the
# most-frequent-class baseline. The test predictions are computed once and
# shared by the MCC and accuracy lines, matching the other evaluation cells.
# With constant predictions the MCC is degenerate: scikit-learn reports 0.0 and,
# on some versions, emits a divide RuntimeWarning while computing it.
dummy_test_pred = dummy_clf.predict(X_test)
print("Training MCC: " + str(matthews_corrcoef(y_df['target_ind'], dummy_clf.predict(X_df))))
print("Test MCC: " + str(matthews_corrcoef(y_test['target_ind'], dummy_test_pred)))
print("Validation MCC: " + str(matthews_corrcoef(y_val['target_ind'], dummy_clf.predict(X_val))))
# A classifier's .score() is mean accuracy, so this prints the same value as before.
print("Test Acc: " + str(accuracy_score(y_test['target_ind'], dummy_test_pred)))

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Training MCC: 0.0
Test MCC: 0.0
Validation MCC: 0.0
Test Acc: 0.9800016661311245


In [27]:
# Persist every fitted model to its own pickle file. Each file is opened in a
# `with` block so the handle is flushed and closed even if pickling fails; the
# original left all six handles open.
models_by_path = {
    "knn.p": knn,
    "rf.p": rf,
    "rfc.p": rfc,
    "ada.p": ada,
    "gnb.p": gnb,
    "dummy_clf.p": dummy_clf,
}
for model_path, fitted_model in models_by_path.items():
    with open(model_path, "wb") as model_file:
        pickle.dump(fitted_model, model_file)

# NOTE(review): these two calls overwrite the same CSV files the notebook loaded
# its training data from. Nothing visible here modifies X_df / y_df, so the
# rewrite looks redundant -- confirm it is intentional before keeping it.
X_df.to_csv('X_df.csv', index=False)
y_df.to_csv('y_df.csv', index=False)