In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score

In [5]:
# Load the modelling dataset, then separate the 'Churn' target column from
# the remaining predictor columns.
df = pd.read_csv('model_data.csv')
y = df['Churn']
X = df.drop('Churn', axis=1)

In [6]:
# Hold out 40% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=100,
)

In [8]:
# Score every feature against the target with the chi-squared statistic.
# k='all' keeps all features, so this step only ranks them; nothing is
# filtered out here.
f_scores = SelectKBest(score_func=chi2, k='all').fit(X_train, y_train)
for i, score in enumerate(f_scores.scores_):
    print('Feature %d: %f' % (i, score))


Feature 0: 44.789579
Feature 1: 75.237768
Feature 2: 83.768537
Feature 3: 20.286132
Feature 4: 8.321895
Feature 5: 81.072509
Feature 6: 64.351491
Feature 7: 66.410921
Feature 8: 251.871650
Feature 9: 35.725345
Feature 10: 232.649177
Feature 11: 185.805563
Feature 12: 106.365272
Feature 13: 287.433480


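Most features have a high test statistic. For a significance level of 0.05 and 13 degrees of freedom, the critical value is 22.362. Therefore, the 2 features which are not statistically significant are features 3 and 4 (OnlineBackup and DeviceProtection), so they can be dropped for the training of the model. (Note: this cutoff should be double-checked. scikit-learn's `chi2` tests each feature separately, with degrees of freedom equal to the number of classes minus one, so comparing against `f_scores.pvalues_` may be the more appropriate test.)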

In [9]:
# Preview the first five training rows to map the feature indices printed
# above to column names.
X_train.head(n=5)

Unnamed: 0,Partner,Dependents,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,PaperlessBilling,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year
3342,0,0,0,1,1,0,1,0,1,0,1,0,1,0
2878,1,1,0,0,0,0,0,0,0,1,0,1,0,1
4186,0,0,0,0,0,0,0,1,0,0,0,1,0,0
3106,1,1,0,1,1,0,1,0,1,0,1,0,0,1
3474,1,0,0,0,0,0,0,1,0,0,0,1,0,1


In [10]:
# Remove the two lowest-scoring features from both splits.
# The frames are reassigned rather than mutated in place, the column list is
# defined once so both splits stay in sync, and errors='ignore' makes the
# cell safe to re-run (a second run is a no-op instead of a KeyError).
low_score_cols = ['OnlineBackup', 'DeviceProtection']
X_train = X_train.drop(columns=low_score_cols, errors='ignore')
X_test = X_test.drop(columns=low_score_cols, errors='ignore')

## Model Training

In [15]:
# Logistic regression: fit on the training split, then report the F1-score
# on both the training and the held-out test split.
lr = LogisticRegression().fit(X_train, y_train)

y_pred_train = lr.predict(X_train)
y_pred = lr.predict(X_test)

f1_lr_train = f1_score(y_train, y_pred_train)
f1_lr = f1_score(y_test, y_pred)
print('Train F1-score %f, Test F1-score %f' % (f1_lr_train, f1_lr))

Train F1-score 0.564251, Test F1-score 0.566738


In [16]:
# Decision tree: fit on the training split, then report the F1-score on both
# the training and the held-out test split.
# random_state is fixed (same seed as the train/test split) so the scores are
# reproducible on re-run.
dt = DecisionTreeClassifier(random_state=100)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
# Scores are stored under dt-specific names so the logistic-regression
# results in f1_lr / f1_lr_train are not overwritten.
f1_dt = f1_score(y_test, y_pred)
y_pred_train = dt.predict(X_train)
f1_dt_train = f1_score(y_train, y_pred_train)
print('Train F1-score %f, Test F1-score %f' % (f1_dt_train, f1_dt))

Train F1-score 0.591977, Test F1-score 0.490975


In [17]:
# Random forest: fit on the training split, then report the F1-score on both
# the training and the held-out test split.
# random_state is fixed (same seed as the train/test split) so the scores are
# reproducible on re-run.
rf = RandomForestClassifier(random_state=100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Scores are stored under rf-specific names so results from the other models
# are not overwritten.
f1_rf = f1_score(y_test, y_pred)
y_pred_train = rf.predict(X_train)
f1_rf_train = f1_score(y_train, y_pred_train)
print('Train F1-score %f, Test F1-score %f' % (f1_rf_train, f1_rf))

Train F1-score 0.614603, Test F1-score 0.519178


In [19]:
# Gaussian naive Bayes: fit on the training split, then report the F1-score
# on both the training and the held-out test split.
GNB = GaussianNB()
GNB.fit(X_train, y_train)
y_pred = GNB.predict(X_test)
# Scores are stored under model-specific names so results from the other
# models (e.g. f1_lr) are not overwritten.
f1_gnb = f1_score(y_test, y_pred)
y_pred_train = GNB.predict(X_train)
f1_gnb_train = f1_score(y_train, y_pred_train)
print('Train F1-score %f, Test F1-score %f' % (f1_gnb_train, f1_gnb))

Train F1-score 0.614780, Test F1-score 0.602537


## Further Steps

1. Tune the hyper-parameters of the random forest (RF) and decision tree (DT) models to reduce overfitting.
2. Apply SMOTE to address the class imbalance and obtain a better-performing model.
3. Present other metrics as well, such as a confusion matrix, ROC AUC, and a classification report.