In [2]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Modelling
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
import warnings

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier


In [3]:
# Load the raw customer-churn dataset and report its dimensions as (rows, columns).
# The original cell also had a bare `df.head()` between these two lines; a notebook
# only displays a cell's last expression, so that call was evaluated and thrown away.
df = pd.read_csv("data/Customer_Churn.csv")
df.shape

(7043, 21)

In [4]:
# customerID is removed from the frame before any encoding or modelling.
df = df.drop(columns=['customerID'])

In [5]:
# Relabel the 0/1 SeniorCitizen flag as "No"/"Yes" so it matches the other
# Yes/No columns (any value other than 0 becomes "Yes").
df['SeniorCitizen'] = np.where(df['SeniorCitizen'] == 0, "No", "Yes")

In [6]:
# Convert TotalCharges to a numeric column: empty strings are turned into NaN first,
# and anything else that cannot be parsed is coerced to NaN as well.
df['TotalCharges'] = pd.to_numeric(
    df['TotalCharges'].replace('', np.nan),
    errors='coerce',
)

In [7]:
# Store tenure as floating point, like the other continuous columns.
df['tenure'] = df['tenure'].astype(float)

In [8]:
# Fill missing TotalCharges values (entries that failed numeric conversion) with the
# column mean.
# The result is assigned back to the frame instead of calling
# `df['TotalCharges'].fillna(m, inplace=True)`: an in-place call on a selected column
# is deprecated in pandas 2.x and, under Copy-on-Write, modifies only a temporary
# object and leaves `df` unchanged.
m = df['TotalCharges'].mean()
df['TotalCharges'] = df['TotalCharges'].fillna(m)

In [9]:
# Count missing values per column; after the TotalCharges imputation every count
# is expected to be zero.
df.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [10]:
# Print each column's non-null count and dtype to verify the type conversions so far.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   object 
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   float64
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [11]:
# Dimensions (rows, columns) of the frame after customerID was dropped.
df.shape

(7043, 20)

In [12]:
# List the distinct values of every object-typed (categorical) column.
object_columns = [i for i in df if df[i].dtype == 'object']
for i in object_columns:
    print(f'{i} : {df[i].unique()}')

gender : ['Female' 'Male']
SeniorCitizen : ['No' 'Yes']
Partner : ['Yes' 'No']
Dependents : ['No' 'Yes']
PhoneService : ['No' 'Yes']
MultipleLines : ['No phone service' 'No' 'Yes']
InternetService : ['DSL' 'Fiber optic' 'No']
OnlineSecurity : ['No' 'Yes' 'No internet service']
OnlineBackup : ['Yes' 'No' 'No internet service']
DeviceProtection : ['No' 'Yes' 'No internet service']
TechSupport : ['No' 'Yes' 'No internet service']
StreamingTV : ['No' 'Yes' 'No internet service']
StreamingMovies : ['No' 'Yes' 'No internet service']
Contract : ['Month-to-month' 'One year' 'Two year']
PaperlessBilling : ['Yes' 'No']
PaymentMethod : ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
Churn : ['No' 'Yes']


In [13]:
# List the distinct values of every float64 (continuous) column.
float_columns = [i for i in df if df[i].dtype == 'float64']
for i in float_columns:
    print(f'{i} : {df[i].unique()}')

tenure : [ 1. 34.  2. 45.  8. 22. 10. 28. 62. 13. 16. 58. 49. 25. 69. 52. 71. 21.
 12. 30. 47. 72. 17. 27.  5. 46. 11. 70. 63. 43. 15. 60. 18. 66.  9.  3.
 31. 50. 64. 56.  7. 42. 35. 48. 29. 65. 38. 68. 32. 55. 37. 36. 41.  6.
  4. 33. 67. 23. 57. 61. 14. 20. 53. 40. 59. 24. 44. 19. 54. 51. 26.  0.
 39.]
MonthlyCharges : [29.85 56.95 53.85 ... 63.1  44.2  78.7 ]
TotalCharges : [  29.85 1889.5   108.15 ...  346.45  306.6  6844.5 ]


In [14]:
# Treat 'No internet service' as a plain 'No' so the add-on service columns become binary.
df = df.replace(to_replace='No internet service', value='No')

In [15]:
# Treat 'No phone service' as a plain 'No' so MultipleLines becomes binary.
df = df.replace(to_replace='No phone service', value='No')

In [16]:
# Re-check the categorical columns after collapsing the "No ... service" labels.
object_columns = [i for i in df if df[i].dtype == 'object']
for i in object_columns:
    print(f"{i} : {df[i].unique()}")

gender : ['Female' 'Male']
SeniorCitizen : ['No' 'Yes']
Partner : ['Yes' 'No']
Dependents : ['No' 'Yes']
PhoneService : ['No' 'Yes']
MultipleLines : ['No' 'Yes']
InternetService : ['DSL' 'Fiber optic' 'No']
OnlineSecurity : ['No' 'Yes']
OnlineBackup : ['Yes' 'No']
DeviceProtection : ['No' 'Yes']
TechSupport : ['No' 'Yes']
StreamingTV : ['No' 'Yes']
StreamingMovies : ['No' 'Yes']
Contract : ['Month-to-month' 'One year' 'Two year']
PaperlessBilling : ['Yes' 'No']
PaymentMethod : ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
Churn : ['No' 'Yes']


In [17]:
# Columns that hold only 'Yes' / 'No' at this point in the notebook.
yes_no = ['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity',
        'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn','SeniorCitizen']

# Encode Yes -> 1 and No -> 0, assigning each result back to the frame.
# The original `df[i].replace({...}, inplace=True)` is an in-place call on a selected
# column: deprecated in pandas 2.x and a no-op on `df` under Copy-on-Write. It also
# relied on `replace` silently downcasting object to int, which is deprecated too.
# `map` + `astype(int)` yields an integer column explicitly; a label other than
# 'Yes'/'No' maps to NaN and makes `astype(int)` raise instead of passing through.
for i in yes_no:
    df[i] = df[i].map({'Yes': 1, 'No': 0}).astype(int)

In [18]:
# Encode gender as Male -> 1, Female -> 0 and assign the result back to the frame.
# The original `df['gender'].replace({...}, inplace=True)` is an in-place call on a
# selected column: deprecated in pandas 2.x and without effect on `df` under
# Copy-on-Write. `map` + `astype(int)` produces an integer column explicitly and
# raises if a label other than 'Male'/'Female' is present.
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0}).astype(int)

In [19]:
# One-hot encode the remaining multi-category columns as 0/1 integer indicators.
df = pd.get_dummies(
    df,
    columns=['InternetService', 'Contract', 'PaymentMethod'],
    dtype=int,
)

In [20]:
# Preview the first two rows to check the binary and one-hot encoded columns.
df.head(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,1.0,0,0,0,1,0,...,1,0,0,1,0,0,0,0,1,0
1,1,0,0,0,34.0,1,0,1,0,1,...,1,0,0,0,1,0,0,0,0,1


In [21]:
# Dimensions (rows, columns) of the frame after one-hot encoding.
df.shape

(7043, 27)

In [22]:
# Min-max scale the continuous features so they are comparable with the 0/1 columns.
# NOTE(review): the scaler is fitted on the whole frame, before the train/test split
# performed later in the notebook, so test-set minima/maxima influence the training
# features. Consider fitting on the training rows only.
cols_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = MinMaxScaler().fit(df[cols_to_scale])
df[cols_to_scale] = scaler.transform(df[cols_to_scale])

In [23]:
# Preview the first rows to confirm tenure, MonthlyCharges and TotalCharges were rescaled.
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,0.013889,0,0,0,1,0,...,1,0,0,1,0,0,0,0,1,0
1,1,0,0,0,0.472222,1,0,1,0,1,...,1,0,0,0,1,0,0,0,0,1
2,1,0,0,0,0.027778,1,0,1,1,0,...,1,0,0,1,0,0,0,0,0,1
3,1,0,0,0,0.625,0,0,1,0,1,...,1,0,0,0,1,0,1,0,0,0
4,0,0,0,0,0.027778,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0


In [24]:
# Feature matrix: every column except the Churn target.
X = df.drop(columns='Churn')

In [25]:
# Target vector: the Churn column (already encoded as 1/0 by the Yes/No mapping).
Y = df['Churn']

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)

((5634, 26), (1409, 26))

In [27]:
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn import metrics

In [28]:
def evaluate_model(true, predicted):
    """Score a set of predictions against the ground-truth labels.

    Args:
        true: Ground-truth class labels.
        predicted: Labels predicted by a model, aligned with ``true``.

    Returns:
        A ``(accuracy, confusion_matrix)`` tuple. The confusion matrix is computed
        with ``labels=[1, 0]``, i.e. class 1 is listed before class 0.
    """
    return (
        metrics.accuracy_score(true, predicted),
        metrics.confusion_matrix(true, predicted, labels=[1, 0]),
    )

In [29]:
# Candidate classifiers, keyed by the display name used in the printed report and in
# the summary lists below. All use default hyperparameters.
models = {
    "SVM": SVC(),
    "K Neighbours Classifier": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Gaussian NB": GaussianNB(),
    # Seeded so the forest's scores are reproducible from run to run.
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
}

# Parallel lists collecting each model's name and its test-set accuracy.
model_list = []
accuracy_list = []

# Iterate over (name, estimator) pairs directly rather than rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, Y_train)

    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)

    # Evaluate on both splits; a large train/test gap indicates overfitting.
    model_train_acc, model_train_con = evaluate_model(Y_train, Y_train_pred)
    model_test_acc, model_test_con = evaluate_model(Y_test, Y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print("Model performance for Training set")
    print("- Accuracy: {:.4f}".format(model_train_acc))

    print('----------------------------------')

    # This header previously read "Training set" although the figure printed
    # underneath is the test-set accuracy.
    print("Model performance for Test set")
    print("- Accuracy: {:.4f}".format(model_test_acc))

    accuracy_list.append(model_test_acc)

    print('='*35)
    print('\n')

SVM
Model performance for Training set
- Accuracy: 0.8174
----------------------------------
Model performance for Training set
- Accuracy: 0.8155


K Neighbours Classifier
Model performance for Training set
- Accuracy: 0.8362
----------------------------------
Model performance for Training set
- Accuracy: 0.7651


Logistic Regression
Model performance for Training set
- Accuracy: 0.8010
----------------------------------
Model performance for Training set
- Accuracy: 0.8219


Gaussian NB
Model performance for Training set
- Accuracy: 0.7442
----------------------------------
Model performance for Training set
- Accuracy: 0.7551


Random Forest Classifier
Model performance for Training set
- Accuracy: 0.9986
----------------------------------
Model performance for Training set
- Accuracy: 0.7906




In [30]:
# Rank the models by test-set accuracy, best first.
leaderboard_rows = list(zip(model_list, accuracy_list))
leaderboard = pd.DataFrame(leaderboard_rows, columns=['Model Name', 'Accuracy'])
leaderboard.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model Name,Accuracy
2,Logistic Regression,0.821859
0,SVM,0.815472
4,Random Forest Classifier,0.790632
1,K Neighbours Classifier,0.765082
3,Gaussian NB,0.755145


Logistic Regression

In [31]:
# Refit the best-ranked model (logistic regression) and report its test accuracy in percent.
log_model = LogisticRegression(fit_intercept=True).fit(X_train, Y_train)
y_pred = log_model.predict(X_test)
score = 100 * metrics.accuracy_score(Y_test, y_pred)
print(" Accuracy of the model is %.2f" % score)

 Accuracy of the model is 82.19
