In [52]:
#import relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier

In [53]:
# Load the cleaned dataset from the project's data folder.
DATA_PATH = 'data/syrialtel_cleaned_data.csv'
df = pd.read_csv(DATA_PATH)

In [54]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,...,total_calls,total_charge,charge_per_min_day,charge_per_min_eve,charge_per_min_night,charge_per_min_dom,charge_per_min_int,%_int_min,call_satisfaction,region
0,KS,128,415,no,yes,25,265.1,110,45.07,197.4,...,303,75.56,0.409727,0.169495,0.120989,0.242867,0.9,1.394311,0.330033,Midwest
1,OH,107,415,no,yes,26,161.6,123,27.47,195.5,...,332,59.24,0.223333,0.161359,0.111165,0.168815,1.233333,2.191299,0.301205,Midwest
2,NJ,137,415,no,no,0,243.4,114,41.38,121.2,...,333,62.29,0.362982,0.093636,0.070385,0.179878,0.658,2.261772,0.0,Northeast
3,OH,84,408,yes,no,0,299.4,71,50.9,61.9,...,255,66.8,0.716901,0.059773,0.099551,0.262177,0.254286,1.168555,0.784314,Midwest
4,OK,75,415,yes,no,0,166.7,113,28.34,148.3,...,359,52.09,0.250796,0.103361,0.069504,0.138652,0.91,1.972656,0.835655,South


The data is split into training and test sets.

In [55]:
# Separate the target column from the feature columns.
y = df['churn']
X = df.drop(columns='churn')

# Hold out 30% of the rows as a test set; the seed keeps the split repeatable.
# NOTE(review): the split is not stratified on `churn` -- consider stratify=y
# if the classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Splitting the features into categorical and continuous columns

In [56]:
# Columns treated as categorical; every other feature column is handled as continuous.
categorical = [
    'state',
    'area_code',
    'international_plan',
    'voice_mail_plan',
    'region',
]
df_cont = X_train.drop(columns=categorical)
df_cat = X_train[categorical]

In [57]:
# Display the non-categorical columns of the training split.
df_cont

Unnamed: 0,account_length,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,...,total_minutes,total_calls,total_charge,charge_per_min_day,charge_per_min_eve,charge_per_min_night,charge_per_min_dom,charge_per_min_int,%_int_min,call_satisfaction
2016,80,0,202.4,118,34.41,260.2,67,22.12,177.4,112,...,649.2,302,66.99,0.291610,0.330149,0.071250,0.217205,0.496000,1.417129,0.993377
1362,63,0,132.9,122,22.59,67.0,62,5.70,160.4,121,...,370.2,307,38.18,0.185164,0.091935,0.059669,0.116426,1.335000,2.674230,0.977199
2670,116,12,221.0,108,37.57,151.0,118,12.84,179.0,80,...,560.0,312,60.90,0.347870,0.108814,0.100750,0.191078,0.405000,1.607143,0.641026
2210,71,0,278.9,110,47.41,190.2,67,16.17,255.2,84,...,736.0,268,78.22,0.431000,0.241343,0.136667,0.287586,0.451429,1.589674,0.000000
1846,120,43,177.9,117,30.24,175.1,70,14.88,161.3,117,...,525.8,308,55.49,0.258462,0.212571,0.062051,0.172303,0.777500,2.187143,0.324675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,106,0,274.4,120,46.65,198.6,82,16.88,160.8,62,...,639.8,267,72.39,0.388750,0.205854,0.116774,0.268068,0.540000,0.937793,0.374532
1130,122,0,35.1,62,5.97,180.8,89,15.37,251.6,58,...,480.2,211,36.09,0.096290,0.172697,0.195172,0.156268,1.715000,2.644731,0.473934
1294,66,0,87.6,76,14.89,262.0,111,22.27,184.6,125,...,543.4,317,47.95,0.195921,0.200631,0.066480,0.145737,0.496000,1.693044,0.315457
860,169,0,179.2,111,30.46,175.2,130,14.89,228.6,92,...,592.9,339,58.31,0.274414,0.114538,0.111848,0.167087,0.445000,1.669759,0.589971


In [58]:
# Column labels of each feature group, used below to route columns
# through the matching preprocessing branch.
X_num, X_cat = df_cont.columns, df_cat.columns

## Pipeline

In [59]:
# Preprocessing: standardise the numeric columns, one-hot encode the categorical ones.
scale_step = ('scaler', StandardScaler())
# drop='first' removes one dummy column per feature; sparse=False yields a dense array.
encode_step = ('ohe', OneHotEncoder(drop='first', sparse=False))

numTrans = Pipeline(steps=[scale_step])
catTrans = Pipeline(steps=[encode_step])

# Route each column group through its own sub-pipeline (numeric first, then categorical).
pp = ColumnTransformer(
    transformers=[
        ('num', numTrans, X_num),
        ('cat', catTrans, X_cat),
    ]
)

In [64]:
# `transformers_` / `named_transformers_` only exist on a fitted ColumnTransformer.
# Fit on the training split here so this cell also works on a fresh kernel run
# top to bottom (the later fit_transform call simply refits on the same data).
pp.fit(X_train)
fitted_ohe = pp.named_transformers_['cat'].named_steps['ohe']

# Newer scikit-learn releases expose `get_feature_names_out`; older ones only
# have `get_feature_names`. Use whichever the installed version provides.
ohe_name_getter = getattr(fitted_ohe, 'get_feature_names_out', None) or fitted_ohe.get_feature_names
cat_names = ohe_name_getter(X_cat).tolist()

# Output column order of `pp` is numeric columns first, then the encoded categoricals.
feature_names = list(X_num) + cat_names


In [70]:
def scores(actual, pred):
    """Print log loss, accuracy, precision and recall for a set of predictions.

    Parameters
    ----------
    actual : array-like
        True target values.
    pred : array-like
        Predictions for the same samples; passed unchanged to every metric.

    Returns
    -------
    None
        The four metrics are printed, not returned.
    """
    # `log_loss` is not part of the notebook's top-level sklearn.metrics import,
    # so bring it into scope here; without this the first call raises NameError.
    from sklearn.metrics import log_loss

    # NOTE(review): log loss is normally computed from predicted probabilities
    # (e.g. predict_proba output) -- confirm what callers pass as `pred`.
    print("log loss: ", log_loss(actual, pred))
    print("accuracy: ", accuracy_score(actual, pred))
    print("precision:", precision_score(actual, pred))
    print("recall:   ", recall_score(actual, pred))

## Model 1: Baseline Logistic Regression

In [66]:
# Learn the scaling / encoding parameters from the training split only,
# then apply the same fitted transform to both splits.
X_train_scaled = pp.fit(X_train).transform(X_train)
X_test_scaled = pp.transform(X_test)

In [67]:
# Wrap the transformed arrays in DataFrames so the columns carry readable names.
X_train_scaled, X_test_scaled = (
    pd.DataFrame(transformed, columns=feature_names)
    for transformed in (X_train_scaled, X_test_scaled)
)

In [69]:
# Display the preprocessed training features.
X_train_scaled

Unnamed: 0,account_length,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,...,state_WI,state_WV,state_WY,area_code_415,area_code_510,international_plan_yes,voice_mail_plan_yes,region_Northeast,region_South,region_West
0,-0.511937,-0.585483,0.410716,0.873749,0.410877,1.167102,-1.646317,1.167733,-0.456961,0.615443,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,-0.939002,-0.585483,-0.864713,1.074874,-0.865095,-2.639021,-1.895351,-2.637901,-0.790790,1.075039,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.392435,0.289180,0.752053,0.370935,0.752000,-0.984185,0.893834,-0.983076,-0.425542,-1.018675,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,-0.738030,-0.585483,1.814605,0.471498,1.814230,-0.211928,-1.646317,-0.211288,1.070799,-0.814411,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.492921,2.548723,-0.038896,0.823467,-0.039276,-0.509404,-1.496896,-0.510269,-0.773117,0.870774,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2328,0.141221,-0.585483,1.732023,0.974311,1.732188,-0.046444,-0.899214,-0.046732,-0.782936,-1.937867,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2329,0.543164,-0.585483,-2.659490,-1.942006,-2.659228,-0.397112,-0.550566,-0.396702,1.000106,-2.142132,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2330,-0.863638,-0.585483,-1.696036,-1.238068,-1.696312,1.202563,0.545186,1.202498,-0.315574,1.279303,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2331,1.723873,-0.585483,-0.015039,0.521779,-0.015527,-0.507434,1.491516,-0.507951,0.548455,-0.405881,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [68]:
# Baseline model: logistic regression with default regularisation; max_iter is
# raised to 2000 and the seed is fixed at 42.
logreg_baseline = LogisticRegression(max_iter=2000, random_state=42)
logreg_baseline.fit(X_train_scaled, y_train)

# Mean cross-validated score on the training data (default CV and scoring settings).
# NOTE(review): the features were preprocessed on the whole training split before
# cross-validation, so each fold has seen scaling statistics from its validation rows.
cv_fold_scores = cross_val_score(logreg_baseline, X_train_scaled, y_train)
logreg_baseline_cv = cv_fold_scores.mean()
print(logreg_baseline_cv)

0.8636884138552169


In [51]:
# Score of the fitted baseline on the training split, then on the held-out test split.
for features, target in ((X_train_scaled, y_train), (X_test_scaled, y_test)):
    print(logreg_baseline.score(features, target))

0.8769824260608658
0.865
