## 1. Importing libraries and loading dataframe

In [18]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# importing dataframe from previous step and reset index
df_preprocessing = pd.read_csv('df_preprocessing.csv', index_col=0)
df_preprocessing = df_preprocessing.reset_index(drop=True)

## 2. Encoding target feature

In [3]:
df_preprocessing['Attrition_Flag'].value_counts()

Existing Customer    7771
Attrited Customer    1536
Name: Attrition_Flag, dtype: int64

In [4]:
# Encoding 'Attrition_Flag' attributes to binary 1 and 0. Existing Customer = 0, and Attrited Customer = 1
# df_preprocessing['Attrition_Flag'].replace({'Existing Customer':0, 'Attrited Customer':1}, inplace=True)
df_preprocessing['Attrition_Flag'] = df_preprocessing['Attrition_Flag'].map({'Existing Customer':0, 'Attrited Customer':1})

In [5]:
df_preprocessing.shape

(9307, 20)

## 3. Seperating target features and predictor features

In [30]:
X = df_preprocessing.drop('Attrition_Flag', axis=1)
y = df_preprocessing['Attrition_Flag']

In [33]:
X.shape

(9307, 19)

In [34]:
# Standard scaling to numerical columns
scaler = StandardScaler()

In [35]:
X_train[X_train.select_dtypes('number').columns] = scaler.fit_transform(X_train[X_train.select_dtypes('number').columns])
X_test[X_test.select_dtypes('number').columns] = scaler.fit_transform(X_test[X_test.select_dtypes('number').columns])

In [10]:
X.shape

(9307, 19)

In [11]:
X = pd.get_dummies(X, drop_first=True)

In [12]:
X.shape

(9307, 32)

In [13]:
X.head()

Unnamed: 0,Age,Dependent_Count,Months_On_Book,Total_Relationship_Count,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,...,Marital_Status_Single,Marital_Status_Unknown,Income_$40K - $60K,Income_$60K - $80K,Income_$80K - $120K,Income_Less than $40K,Income_Unknown,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
0,-0.29577,-0.271926,0.004744,-0.552026,-1.395182,-0.408007,-0.490538,0.118049,-0.500932,3.449765,...,0,0,1,0,0,0,0,0,0,0
1,-0.547035,2.042137,-0.626022,0.73918,0.751624,-0.408007,-0.18519,0.387672,-0.220314,0.47641,...,0,1,0,0,0,0,0,0,0,0
2,1.337453,-0.271926,1.518584,0.73918,-0.321779,-0.408007,-0.666074,-0.576842,-0.613157,2.435005,...,0,0,0,0,0,1,0,0,0,0
3,-0.170138,-0.271926,0.130897,1.384783,-1.395182,-0.408007,0.675986,0.007749,0.674812,1.212929,...,0,0,0,0,0,0,1,0,0,0
4,0.081127,-1.04328,0.761664,0.73918,-0.321779,-2.260358,1.401885,0.795782,1.328548,0.885587,...,0,0,0,1,0,0,0,0,0,0


In [29]:
X.isna().sum()

Age                         0
Dependent_Count             0
Months_On_Book              0
Total_Relationship_Count    0
Months_Inactive             0
Contacts_Count              0
Credit_Limit                0
Total_Revolving_Bal         0
Avg_Open_To_Buy             0
Total_Amt_Chng_Q4_Q1        0
Total_Trans_Amt             0
Total_Trans_Ct              0
Total_Ct_Chng_Q4_Q1         0
Avg_Utilization_Ratio       0
Gender_M                    0
Education_Doctorate         0
Education_Graduate          0
Education_High School       0
Education_Post-Graduate     0
Education_Uneducated        0
Education_Unknown           0
Marital_Status_Married      0
Marital_Status_Single       0
Marital_Status_Unknown      0
Income_$40K - $60K          0
Income_$60K - $80K          0
Income_$80K - $120K         0
Income_Less than $40K       0
Income_Unknown              0
Card_Category_Gold          0
Card_Category_Platinum      0
Card_Category_Silver        0
dtype: int64

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=123)

In [15]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Training Accuracy:', model.score(X_train, y_train))
print('Test Accuracy:', model.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))

Training Accuracy: 0.8999078906969604
Test Accuracy: 0.9065520945220193
[[2265   77]
 [ 184  267]]


In [19]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Training Accuracy:', model.score(X_train, y_train))
print('Test Accuracy:', model.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))
print(f1_score(y_test, y_pred))

Training Accuracy: 1.0
Test Accuracy: 0.9369853204439671
[[2255   87]
 [  89  362]]
0.8044444444444444


In [27]:
#Cs = [0.001, 0.1, 1, 10, 100, 1000]
Cs = np.logspace(-5, 8, 15)
param_grid = {'C': Cs}

logreg = LogisticRegression()

logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

best_model = logreg_cv.fit(X_train, y_train)

print(best_model.best_params_)
print(best_model.best_score_)

{'C': 31.622776601683793}
0.8980641388830927


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Training Accuracy:', model.score(X_train, y_train))
print('Test Accuracy:', model.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))
print(f1_score(y_test, y_pred))

Training Accuracy: 1.0
Test Accuracy: 0.9577515216612961
[[2315   27]
 [  91  360]]
0.8591885441527446


In [21]:
X_train.to_csv('X_train1.csv')
X_test.to_csv('X_test1.csv')
y_train.to_csv('y_train1.csv')
y_test.to_csv('y_test1.csv')