In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

# Show up to 100 rows and 100 columns when a DataFrame is displayed.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

### Features Data Types

In [2]:
# Every feature column is read as float64, so the names are listed once and
# mapped to that dtype in a single step.
feature_columns = [
    'z_age', 'z_census_household_1p_pct', 'z_census_education_high_pct',
    'z_census_purchase_household', 'z_census_purchase_capita',
    'z_census_household_cnt', 'multiplay_cnt', 'z_line_cnt',
    'z_sim_cnt', 'fixed_prod_cat1_ind', 'tenure_fixed_month',
    'tenure_mobile_month', 'z_line_voice_cat1_cnt', 'fixed_data_cat1_ind',
    'fixed_data_cat2_ind', 'z_fixed_prod_cat2_cnt', 'z_fixed_prod_cat1_cnt',
    'z_fixed_data_cat3_cnt', 'fixed_prod_cat3_cnt', 'device_smartphone_cnt',
    'z_mobile_voice_cat1_cnt', 'z_mobile_data_cat1_cnt', 'mobile_data_cat2_cnt',
    'z_mobile_voice_cat3_cnt', 'z_mobile_data_cat3_cnt', 'z_usg_fv_3m_avg',
    'z_usg_fd_mb_1m_sum', 'z_usg_fd_mb_3m_avg', 'z_usg_mv_ib_a_3m_avg',
    'z_usg_md_sms_ib_a_3m_avg', 'z_usg_md_ib_mb_3m_avg',
    'payment_method_cash_cnt', 'z_rev_1m_sum', 'z_device_netcube_cnt',
    'z_tariff_netcube_cnt', 'z_min_Prog_Max_BB_Down', 'z_line_Fib2h_CNT',
    'z_min_Speed_Product_KBit', 'z_Max_Speed_Missing_KBit',
    'z_Min_Speed_Reserve_KBit', 'z_Max_DSL_OOS_PCT', 'z_PR_Relocation_CNT',
    'z_PR_Relocation_Days', 'z_PR_ActivationSupportOpt_CNT',
    'z_PR_ActivationSupportOpt_Days', 'z_PR_DeactivationThreat_CNT',
    'z_PR_DeactivationSupport_CNT', 'z_PR_DeactivationProdOpt_CNT',
    'z_PR_DeactivationProdOpt_Days', 'z_PR_OtherWOTopic_CNT',
    'z_PR_OtherWOTopic_Days', 'z_PR_AddressChange_CNT',
    'z_PR_AddressChange_Days', 'z_PR_ServiceDisruption_CNT',
    'z_PR_ServiceDisruption_Days', 'z_PR_BasketSupport_CNT',
    'z_PR_BasketSupport_Days', 'z_PR_SellingSalesSupport_CNT',
    'z_PR_SellingSalesSupport_Days', 'z_PR_DigitalUsage_CNT',
    'z_PR_DigitalUsage_Days', 'z_TNPS_Last_Days', 'z_TNPS_Score_Avg',
    'province_cd_A', 'province_cd_B', 'province_cd_C',
    'province_cd_D', 'province_cd_E', 'province_cd_F', 'province_cd_G',
    'province_cd_H', 'province_cd_I', 'Gender_CD_F',
    'prod_monodual_cd_D', 'customer_value_cd_cat',
]
column_types_X = dict.fromkeys(feature_columns, 'float64')

# The label column is read as float64 as well.
column_types_y = dict.fromkeys(['target_ind'], 'float64')

### Load Data

In [3]:
# Training split: read labels and features, then attach the label column to the
# feature frame (the resampling cell below selects rows by X_train['target_ind']).
y_train = pd.read_csv('y_train.csv', dtype=column_types_y)
X_train = pd.read_csv('X_train.csv', dtype=column_types_X)
X_train['target_ind'] = y_train['target_ind']

# Validation split: features and labels are kept in separate frames.
y_val = pd.read_csv('y_val.csv', dtype=column_types_y)
X_val = pd.read_csv('X_val.csv', dtype=column_types_X)

### 80:20 Upsampling of Minority Class

In [4]:
# Upsampled minority size as a fraction of the majority size:
# 0.25 minority rows per majority row gives an 80:20 majority:minority split.
MINORITY_TO_MAJORITY_RATIO = 0.25
# Fixed seed so the resampled training set, and every score computed from it,
# is the same on each run.
RESAMPLE_SEED = 42

# Make the cell safe to re-run: the last statement below removes 'target_ind'
# from X_train, so put it back (from y_train) if a previous run already dropped it.
if 'target_ind' not in X_train.columns:
    X_train['target_ind'] = y_train['target_ind']

# Separate dataset by class
df_minority = X_train[X_train['target_ind'] == 1]
df_majority = X_train[X_train['target_ind'] == 0]

# Upsample minority class (sampling with replacement)
df_minority = df_minority.sample(
    n=int(MINORITY_TO_MAJORITY_RATIO * len(df_majority)),
    replace=True,
    random_state=RESAMPLE_SEED,
)

# Merge the two classes into one dataset
df = pd.concat([df_majority, df_minority])

# Shuffle the dataset, then split it back into features and label
df = df.sample(frac=1, random_state=RESAMPLE_SEED)
X_df = df.drop(['target_ind'], axis=1)
y_df = df[['target_ind']]

# Restore X_train to features only, for the models fit on the original data
X_train = X_train.drop(['target_ind'], axis=1)

#### Random Forest (Unweighted Classes - 100 Estimators, 1 Feature)

In [5]:
# Random forest: 100 trees, 1 candidate feature per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=1, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=1, n_jobs=-1)

In [6]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.4727837067771322


#### Random Forest (Unweighted Classes - 100 Estimators, 2 Features)

In [7]:
# Random forest: 100 trees, 2 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=2, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=2, n_jobs=-1)

In [8]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.47263501762864085


#### Random Forest (Unweighted Classes - 100 Estimators, 3 Features)

In [9]:
# Random forest: 100 trees, 3 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=3, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=3, n_jobs=-1)

In [10]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.4705722110796282


#### Random Forest (Unweighted Classes - 100 Estimators, 4 Features)

In [11]:
# Random forest: 100 trees, 4 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=4, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=4, n_jobs=-1)

In [12]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.46834085438749967


#### Random Forest (Unweighted Classes - 100 Estimators, 5 Features)

In [13]:
# Random forest: 100 trees, 5 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=5, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=5, n_jobs=-1)

In [14]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.4656411987569379


#### Random Forest (Unweighted Classes - 100 Estimators, 6 Features)

In [15]:
# Random forest: 100 trees, 6 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=6, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=6, n_jobs=-1)

In [16]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.4710733446079685


#### Random Forest (Unweighted Classes - 100 Estimators, 7 Features)

In [17]:
# Random forest: 100 trees, 7 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=7, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=7, n_jobs=-1)

In [18]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.46829136884441847


#### Random Forest (Unweighted Classes - 100 Estimators, 8 Features)

In [19]:
# Random forest: 100 trees, 8 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=8, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=8, n_jobs=-1)

In [20]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.4716227122684182


#### Random Forest (Unweighted Classes - 100 Estimators, 9 Features)

In [21]:
# Random forest: 100 trees, 9 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=9, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=9, n_jobs=-1)

In [22]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.473895267347791


#### Random Forest (Unweighted Classes - 100 Estimators, 10 Features)

In [23]:
# Random forest: 100 trees, 10 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=10, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=10, n_jobs=-1)

In [24]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.47745274693969675


#### Random Forest (Unweighted Classes - 100 Estimators, 20 Features)

In [25]:
# Random forest: 100 trees, 20 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=20, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=20, n_jobs=-1)

In [26]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.48972738073271244


#### Random Forest (Unweighted Classes - 100 Estimators, 30 Features)

In [27]:
# Random forest: 100 trees, 30 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=30, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=30, n_jobs=-1)

In [28]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.4956455947732996


#### Random Forest (Unweighted Classes - 100 Estimators, 40 Features)

In [29]:
# Random forest: 100 trees, 40 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=40, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=40, n_jobs=-1)

In [30]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.4979560989802467


#### Random Forest (Unweighted Classes - 100 Estimators, 50 Features)

In [31]:
# Random forest: 100 trees, 50 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=50, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=50, n_jobs=-1)

In [32]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.5020857803558916


#### Random Forest (Unweighted Classes - 100 Estimators, 60 Features)

In [33]:
# Random forest: 100 trees, 60 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=60, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=60, n_jobs=-1)

In [34]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.5033670438152144


#### Random Forest (Unweighted Classes - 100 Estimators, 70 Features)

In [35]:
# Random forest: 100 trees, 70 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=70, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=70, n_jobs=-1)

In [36]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.5029412672867142


#### Random Forest (Unweighted Classes - 100 Estimators, 75 Features)

In [37]:
# Random forest: 100 trees, 75 candidate features per split, fit on the upsampled data.
# random_state is pinned so the score is reproducible and comparable across
# the max_features settings tried in this notebook.
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=75, random_state=42)
rfc.fit(X_df, y_df['target_ind'])

RandomForestClassifier(max_features=75, n_jobs=-1)

In [38]:
# MCC on the data the forest was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], rfc.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], rfc.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.5029808116651366


#### kNN (Distance Weights - k=50)

In [39]:
# Distance-weighted kNN with k=50, fit on the upsampled training set.
knn_d = KNeighborsClassifier(n_neighbors=50, weights='distance', n_jobs=-1)
knn_d.fit(X_df, y_df['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=50, weights='distance')

In [40]:
# MCC on the data the kNN model was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], knn_d.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.2869463154820552


#### kNN (Distance Weights - k=100)

In [41]:
# Distance-weighted kNN with k=100, fit on the upsampled training set.
knn_d = KNeighborsClassifier(n_neighbors=100, weights='distance', n_jobs=-1)
knn_d.fit(X_df, y_df['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=100, weights='distance')

In [42]:
# MCC on the data the kNN model was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], knn_d.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.24774625950287915


#### kNN (Distance - Original Data k = 1)

In [5]:
# Distance-weighted kNN with k=1, fit on the original (not resampled) training set.
knn_d = KNeighborsClassifier(n_neighbors=1, weights='distance', n_jobs=-1)
knn_d.fit(X_train, y_train['target_ind'])

(4720388, 75)
(5792742, 75)


KNeighborsClassifier(n_jobs=-1, n_neighbors=1, weights='distance')

In [6]:
# MCC on the original training data the model was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_train['target_ind'], knn_d.predict(X_train))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.64689759936139


#### kNN (Distance - Original Data k = 2)

In [7]:
# Distance-weighted kNN with k=2, fit on the original (not resampled) training set.
knn_d = KNeighborsClassifier(n_neighbors=2, weights='distance', n_jobs=-1)
knn_d.fit(X_train, y_train['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=2, weights='distance')

In [8]:
# MCC on the original training data the model was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_train['target_ind'], knn_d.predict(X_train))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.64689759936139


#### kNN (Distance - Original Data k = 3)

In [9]:
# Distance-weighted kNN with k=3, fit on the original (not resampled) training set.
knn_d = KNeighborsClassifier(n_neighbors=3, weights='distance', n_jobs=-1)
knn_d.fit(X_train, y_train['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=3, weights='distance')

In [10]:
# MCC on the original training data the model was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_train['target_ind'], knn_d.predict(X_train))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.5421032617905686


#### kNN (Distance - Original Data k = 4)

In [11]:
# Distance-weighted kNN with k=4, fit on the original (not resampled) training set.
knn_d = KNeighborsClassifier(n_neighbors=4, weights='distance', n_jobs=-1)
knn_d.fit(X_train, y_train['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=4, weights='distance')

In [12]:
# MCC on the original training data the model was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_train['target_ind'], knn_d.predict(X_train))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], knn_d.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.4884746931682935


#### Decision Tree (Unweighted) - Original Data

In [14]:
# Unpruned decision tree with default class weights, fit on the original training set.
# random_state is pinned because ties between equally good splits are broken
# randomly, so an unseeded tree can differ from run to run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train['target_ind'])

DecisionTreeClassifier()

In [15]:
# MCC on the original training data the tree was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_train['target_ind'], dt.predict(X_train))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], dt.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.32686248182042915


#### Decision Tree (Weighted) - Original Data

In [16]:
# Unpruned decision tree with class_weight='balanced', fit on the original training set.
# random_state is pinned because ties between equally good splits are broken
# randomly, so an unseeded tree can differ from run to run.
dt_b = DecisionTreeClassifier(class_weight='balanced', random_state=42)
dt_b.fit(X_train, y_train['target_ind'])

DecisionTreeClassifier(class_weight='balanced')

In [17]:
# MCC on the original training data the tree was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_train['target_ind'], dt_b.predict(X_train))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], dt_b.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.32388078722368885


#### Decision Tree (Unweighted) - Upsampled 80:20

In [5]:
# Unpruned decision tree with default class weights, fit on the resampled set (X_df / y_df).
# random_state is pinned because ties between equally good splits are broken
# randomly, so an unseeded tree can differ from run to run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_df, y_df['target_ind'])

DecisionTreeClassifier()

In [6]:
# MCC on the resampled data the tree was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], dt.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], dt.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.3424808188734605


#### Decision Tree (Weighted) - Upsampled 80:20

In [20]:
# Unpruned decision tree with class_weight='balanced', fit on the resampled set (X_df / y_df).
# random_state is pinned because ties between equally good splits are broken
# randomly, so an unseeded tree can differ from run to run.
dt_b = DecisionTreeClassifier(class_weight='balanced', random_state=42)
dt_b.fit(X_df, y_df['target_ind'])

DecisionTreeClassifier(class_weight='balanced')

In [21]:
# MCC on the resampled data the tree was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_df['target_ind'], dt_b.predict(X_df))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], dt_b.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.33027593982684883


#### kNN (Uniform - Original Data k = 1)

In [22]:
# kNN with k=1 on the original training set; no weights argument is passed,
# so the library's default neighbour weighting applies.
knn = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
knn.fit(X_train, y_train['target_ind'])

KNeighborsClassifier(n_jobs=-1, n_neighbors=1)

In [23]:
# MCC on the original training data the model was fit on, then on the validation split.
train_mcc = matthews_corrcoef(y_train['target_ind'], knn.predict(X_train))
print("Training MCC: " + str(train_mcc))
val_mcc = matthews_corrcoef(y_val['target_ind'], knn.predict(X_val))
print("Validation MCC: " + str(val_mcc))

Training MCC: 1.0
Validation MCC: 0.64689759936139


#### Decision Tree (Unweighted) - Upsampled 80:20 - Prune

In [7]:
# Cost-complexity pruning path for the resampled training data, computed with
# the settings of the current `dt` estimator.
path = dt.cost_complexity_pruning_path(X_df, y_df['target_ind'])
ccp_alphas = path.ccp_alphas
impurities = path.impurities

In [17]:
# Result containers for the pruning sweep: fitted trees and their validation MCCs.
clfs, mccs = [], []

In [18]:
# Only every ALPHA_STRIDE-th candidate alpha is evaluated; one tree is refit per
# evaluated alpha, so the stride bounds the number of refits.
ALPHA_STRIDE = 3125

# Start from empty result lists so that re-running this cell does not append
# duplicates to results left over from an earlier run.
clfs = []
mccs = []

for ccp_alpha in ccp_alphas[::ALPHA_STRIDE]:
    # random_state is pinned so the MCC differences between alphas come from
    # pruning strength rather than from random tie-breaking during tree growth.
    clf = DecisionTreeClassifier(ccp_alpha=ccp_alpha, random_state=42)
    clf.fit(X_df, y_df['target_ind'])
    clfs.append(clf)
    mccs.append(matthews_corrcoef(y_val['target_ind'], clf.predict(X_val)))


In [19]:
# Validation MCC for each evaluated ccp_alpha, in the order the alphas were tried.
mccs

[0.34227964270808764,
 0.3394573896695017,
 0.3357476265880495,
 0.3312096267236257,
 0.3282855542301862,
 0.321811309974493,
 0.31425905184348124,
 0.3052097715281729,
 0.2982934292516812,
 0.28912812840177665,
 0.2794504588319622,
 0.2697960331639499,
 0.2600285511863893,
 0.24999217619873348,
 0.2387138108589632,
 0.22573624471819947,
 0.2100120960825727,
 0.1933671370041696,
 0.17357528179539392,
 0.15062978851421804,
 0.13243684289058164]