## 1. Importing libraries and loading dataframe

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# importing dataframe from previous step and reset index
df_preprocessing = pd.read_csv('df_preprocessing.csv', index_col=0)
df_preprocessing = df_preprocessing.reset_index(drop=True)

## 2. Encoding target feature

In [3]:
df_preprocessing['Attrition_Flag'].value_counts()

Existing Customer    7771
Attrited Customer    1536
Name: Attrition_Flag, dtype: int64

In [4]:
# Encoding 'Attrition_Flag' attributes to binary 1 and 0. Existing Customer = 0, and Attrited Customer = 1
# df_preprocessing['Attrition_Flag'].replace({'Existing Customer':0, 'Attrited Customer':1}, inplace=True)
df_preprocessing['Attrition_Flag'] = df_preprocessing['Attrition_Flag'].map({'Existing Customer':0, 'Attrited Customer':1})

In [5]:
df_preprocessing.shape

(9307, 20)

## 3. Seperating target features and predictor features

In [6]:
X = df_preprocessing.drop('Attrition_Flag', axis=1)
y = df_preprocessing['Attrition_Flag']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=123)

In [8]:
# Standard scaling to numerical columns
scaler = StandardScaler()

In [9]:
X_train[X_train.select_dtypes('number').columns] = scaler.fit_transform(X_train[X_train.select_dtypes('number').columns])
X_test[X_test.select_dtypes('number').columns] = scaler.fit_transform(X_test[X_test.select_dtypes('number').columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[X_train.select_dtypes('number').columns] = scaler.fit_transform(X_train[X_train.select_dtypes('number').columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[X_test.select_dtypes('number').columns] =

In [10]:
X.shape

(9307, 19)

In [11]:
X_train = pd.get_dummies(X_train, drop_first=True)

In [12]:
X_test = pd.get_dummies(X_test, drop_first=True)

In [13]:
X.head()

Unnamed: 0,Age,Gender,Dependent_Count,Education,Marital_Status,Income,Card_Category,Months_On_Book,Total_Relationship_Count,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,44.0,M,2,Graduate,Married,$40K - $60K,Blue,36,3,1.0,2.0,4010.0,1247,2763.0,1.376,1088.0,24.0,0.846,0.311
1,42.0,M,5,Uneducated,Unknown,$120K +,Blue,31,5,3.0,2.0,6748.0,1467,5281.0,0.831,1201.0,42.0,0.68,0.217
2,57.0,F,2,Graduate,Married,Less than $40K,Blue,48,5,2.0,2.0,2436.0,680,1756.0,1.19,1570.0,29.0,0.611,0.279
3,45.0,F,2,Graduate,Married,Unknown,Blue,37,6,1.0,2.0,14470.0,1157,13313.0,0.966,1207.0,21.0,0.909,0.08
4,47.0,M,1,Doctorate,Divorced,$60K - $80K,Blue,42,5,2.0,0.0,20979.0,1800,19179.0,0.906,1178.0,27.0,0.929,0.086


In [14]:
X.isna().sum()

Age                         0
Gender                      0
Dependent_Count             0
Education                   0
Marital_Status              0
Income                      0
Card_Category               0
Months_On_Book              0
Total_Relationship_Count    0
Months_Inactive             0
Contacts_Count              0
Credit_Limit                0
Total_Revolving_Bal         0
Avg_Open_To_Buy             0
Total_Amt_Chng_Q4_Q1        0
Total_Trans_Amt             0
Total_Trans_Ct              0
Total_Ct_Chng_Q4_Q1         0
Avg_Utilization_Ratio       0
dtype: int64

In [15]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Training Accuracy:', model.score(X_train, y_train))
print('Test Accuracy:', model.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))

Training Accuracy: 0.8999078906969604
Test Accuracy: 0.9069101324740423
[[2261   81]
 [ 179  272]]


In [16]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Training Accuracy:', model.score(X_train, y_train))
print('Test Accuracy:', model.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))
print(f1_score(y_test, y_pred))

Training Accuracy: 1.0
Test Accuracy: 0.9312567132116004
[[2239  103]
 [  89  362]]
0.7903930131004366


In [17]:
Cs = [0.001, 0.1, 1, 10, 100, 1000]
# Cs = np.logspace(-5, 8, 15)
param_grid = {'C': Cs}

logreg = LogisticRegression()

logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

best_model = logreg_cv.fit(X_train, y_train)

print(best_model.best_params_)
print(best_model.best_score_)

{'C': 100}
0.8980641388830927


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Training Accuracy:', model.score(X_train, y_train))
print('Test Accuracy:', model.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))
print(f1_score(y_test, y_pred))

Training Accuracy: 1.0
Test Accuracy: 0.9588256355173649
[[2314   28]
 [  87  364]]
0.863582443653618


In [19]:
X_train.to_csv('X_train1.csv')
X_test.to_csv('X_test1.csv')
y_train.to_csv('y_train1.csv')
y_test.to_csv('y_test1.csv')