1. Download and prepare the data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle


data=pd.read_csv('/datasets/Churn.csv')
print(data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
RowNumber          10000 non-null int64
CustomerId         10000 non-null int64
Surname            10000 non-null object
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             9091 non-null float64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB
None
   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio 

Preprocessing data (filling blank values in Tenure column and changing the data type to int)

In [None]:
print(data.isnull().sum())
data['Tenure']=data['Tenure'].fillna(0)
data['Tenure']=data['Tenure'].astype(int)
data=data.drop(['Surname'], axis=1)
print(data.info())                 

RowNumber            0
CustomerId           0
Surname              0
CreditScore          0
Geography            0
Gender               0
Age                  0
Tenure             909
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
RowNumber          10000 non-null int64
CustomerId         10000 non-null int64
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             10000 non-null int64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(2), int64(9), o

One hot encoding to change the categorical values into numerical

In [None]:
data_ohe=pd.get_dummies(data, drop_first=True)
print(data_ohe.shape)

(10000, 14)


2. Examine the balance of classes. Train the model without taking into account the imbalance. Briefly describe your findings.

In [None]:
target=data_ohe['Exited']
features=data_ohe.drop('Exited', axis=1)
features_train, features_valid, target_train, target_valid=train_test_split(features, target, test_size=0.20, random_state=12345)
features_train, features_test, target_train, target_test=train_test_split(features_train, target_train, test_size=0.2, random_state=12345)
print(data_ohe['Exited'].value_counts())
print(features_train.shape)
print(target_train.shape)
print(features_valid.shape)
print(target_valid.shape)
print(features_test.shape)
print(target_test.shape)

model_LR=LogisticRegression(random_state=12345, solver='liblinear')
model_LR.fit(features_train, target_train)
predicted_valid_LR=model_LR.predict(features_valid)
print(f1_score(target_valid, predicted_valid_LR))

model_DT=DecisionTreeClassifier(random_state=12345)
model_DT.fit(features_train, target_train)
predicted_valid_DT=model_DT.predict(features_valid)
print(f1_score(target_valid, predicted_valid_DT))

0    7963
1    2037
Name: Exited, dtype: int64
(6400, 13)
(6400,)
(2000, 13)
(2000,)
(1600, 13)
(1600,)
0.0


  'precision', 'predicted', average, warn_for)


0.47183098591549294


The data is unbalanced with large number of actual negatives. 

Fixing class balance: Upsampling

In [None]:
def upsample(features, target, repeat):
    features_zeros=features[target==0]
    features_ones=features[target==1]
    target_zeros=target[target==0]
    target_ones=target[target==1]
    
    features_upsampled=pd.concat([features_zeros] + [features_ones]*repeat)
    target_upsampled=pd.concat([target_zeros] + [target_ones]*repeat)
    
    features_upsampled, target_upsampled=shuffle(features_upsampled, target_upsampled, random_state=12345)
    return features_upsampled, target_upsampled

features_upsampled, target_upsampled=upsample(features_train, target_train, 10)

model_up_LR=LogisticRegression(random_state=12345, solver='liblinear')
model_up_LR.fit(features_upsampled, target_upsampled)
predicted_up_LR=model_up_LR.predict(features_valid)
print(f1_score(target_valid, predicted_up_LR))

model_up_DT=DecisionTreeClassifier(random_state=12345, max_depth=10)
model_up_DT.fit(features_upsampled, target_upsampled)
predicted_up_DT=model_up_DT.predict(features_valid)
print(f1_score(target_valid, predicted_up_DT))

model_up_RF=RandomForestClassifier(n_estimators=100, max_depth=10, random_state=12345)
model_up_RF.fit(features_upsampled, target_upsampled)
predicted_up_RF=model_up_RF.predict(features_valid)
print(f1_score(target_valid, predicted_up_RF))



0.3518747424804285
0.5047318611987381
0.5530596436870643


Fixing class balance: downsampling

In [None]:
def downsample(features, target, fraction):
    features_zeros=features[target==0]
    features_ones=features[target==1]
    target_zeros=target[target==0]
    target_ones=target[target==1]
    
    features_downsampled=pd.concat([features_zeros.sample(frac=fraction, random_state=12345)] + [features_ones])
    target_downsampled=pd.concat([target_zeros.sample(frac=fraction, random_state=12345)] + [target_ones])
    
    features_downsampled, target_downsampled=shuffle(features_downsampled, target_downsampled, random_state=12345)
    return features_downsampled, target_downsampled

features_downsampled, target_downsampled=downsample(features_train, target_train, 0.1)

model_down_LR=LogisticRegression(random_state=12345, solver='liblinear')
model_down_LR.fit(features_downsampled, target_downsampled)
predicted_down_LR=model_down_LR.predict(features_valid)
print(f1_score(target_valid, predicted_down_LR))

model_down_DT=DecisionTreeClassifier(random_state=12345, max_depth=10)
model_down_DT.fit(features_downsampled, target_downsampled)
predicted_down_DT=model_down_DT.predict(features_valid)
print(f1_score(target_valid, predicted_down_DT))

model_down_RF=RandomForestClassifier(n_estimators=100, max_depth=10, random_state=12345)
model_down_RF.fit(features_downsampled, target_downsampled)
predicted_down_RF=model_down_RF.predict(features_valid)
print(f1_score(target_valid, predicted_down_RF))
    

0.3518747424804285
0.4700272479564032
0.4707246376811594


After upsampling and downsampling the dataset to improve class balance, 3 models were run on both the datasets. Random forest with upscaled data is by far the best model. Hence will fine tune it in the next step to reach the F1 score of 0.59.

In [None]:
for depth in range(10,20,1):
    model_select=RandomForestClassifier(random_state=12345, max_depth=depth, n_estimators=100)
    model_select.fit(features_upsampled, target_upsampled)
    predicted_select=model_select.predict(features_valid)
    print('depth:', depth, 'f1_score:',f1_score(target_valid, predicted_select))

depth: 10 f1_score: 0.5530596436870643
depth: 11 f1_score: 0.5779661016949151
depth: 12 f1_score: 0.6020128087831655
depth: 13 f1_score: 0.6158415841584158
depth: 14 f1_score: 0.6059957173447537
depth: 15 f1_score: 0.6020066889632107
depth: 16 f1_score: 0.6040428061831153
depth: 17 f1_score: 0.5987577639751552
depth: 18 f1_score: 0.600997506234414
depth: 19 f1_score: 0.6056701030927835


Max score of 0.61 is optained at a depth of 13, hence selected that model for final testing.

Final testing of data on test set

In [None]:
model_select1=RandomForestClassifier(random_state=12345, max_depth=13, n_estimators=100)
model_select1.fit(pd.concat([features_train]+[features_valid]), pd.concat([target_train]+[target_valid]))
pred=model_select1.predict(features_test)
print(f1_score(target_test, pred))

0.5679513184584178


In [None]:
probabilities_test=model_select1.predict_proba(features_test)
probabilities_test_one=probabilities_test[:,1]
auc_roc=roc_auc_score(target_test,probabilities_test_one)
print(auc_roc)

0.8505741861772916


Tried different depths for random forest and the f1 score is 0.61 at max_depth 13.

auc_roc score is 0.85 which is much higher then 0.5 which is the score of random model. That means our model has high quality than a random model.