In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Loading the file

In [None]:
df = pd.read_csv("Telco-Customer-Churn.csv")

# EDA

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

##### Churn is the target variable; all the other columns are predictor variables. However, the data types of almost all the variables are not appropriate, so they need to be changed.

##### The `pd.to_numeric` function raised a string-coercion error. It turned out that the TotalCharges column had 11 blank values (a single space in the cell), so they were not even identified as nulls. Those 11 records were manually set to null in the Excel file, and then the conversion function was applied to the variables.

#### It was more appropriate to convert most of the predictor variables except TotalCharges,MonthlyCharges and tenure to category data type.

## -> Data type conversion

In [None]:
# Every predictor except the three numeric ones (tenure, MonthlyCharges,
# TotalCharges) is cast to the pandas `category` dtype; so is the target.
category_columns = [
    "customerID", "gender", "SeniorCitizen", "Partner", "Dependents",
    "PhoneService", "MultipleLines", "InternetService", "OnlineSecurity",
    "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV",
    "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod",
    "Churn",
]
df = df.astype({column: "category" for column in category_columns})

# Make sure TotalCharges is numeric even when the CSV still contains the
# blank (" ") cells: errors="coerce" turns anything unparsable into NaN,
# and is a no-op when the column is already numeric.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# df.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would add a stray "None" line).
df.info()

## ->Null Replacement

In [None]:
print(df.isnull().sum())

##### The total charges column has 11 nulls. This is a very small proportion of the total. Therefore, these records could be dropped or imputed using the mean value. We have done mean imputation.

In [None]:
# Mean-impute the missing TotalCharges values. The result is assigned back to
# the column explicitly: calling fillna(..., inplace=True) on `df.TotalCharges`
# operates on an intermediate Series and is not guaranteed to update `df`
# (pandas warns about it and it stops working under Copy-on-Write).
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].mean())

## ->Outlier Detection

In [None]:
# IQR-based outlier check on the three numeric predictors.
numerical_features = ["MonthlyCharges", "TotalCharges", "tenure"]
num_df = df.loc[:, numerical_features]
num_df.describe()

# Quartiles and inter-quartile range, one value per column.
q1, q3 = num_df.quantile(0.25), num_df.quantile(0.75)
IQR = q3 - q1
IQR

# A value is flagged when it lies more than 1.5 * IQR outside the quartiles;
# the result is one boolean per column (True = at least one flagged value).
lower_fence = q1 - (1.5 * IQR)
upper_fence = q3 + (1.5 * IQR)
(num_df.lt(lower_fence) | num_df.gt(upper_fence)).any()

##### No outliers are present in the numerical columns!

## -> Factors of Predictors

##### We could look into the unique records of each of the columns. 
##### This would give us an idea about the type of variables (Binary categorical predictors ,Multiple value categorical predictors,Numerical Predictors).

In [None]:
# For every column: number of distinct values, then the distinct values.
for column_name in df.columns:
    column = df[column_name]
    print(f"Count of Unique {column_name}:{column.nunique()}\n")
    print(f"{column.unique()}\n")

##### By looking at the previous output we could see that there are 6 binary categorical predictors excluding the Churn variable(Target Variable). Apart from that there are 10 multiple value categorical predictors.

## -> Correlation Analysis

In [None]:
# Partition the column labels by dtype: `category` columns vs. everything else.
categorical_columns = df.select_dtypes(include="category").columns
continous_columns = df.select_dtypes(exclude="category").columns

print(continous_columns, categorical_columns, sep="\n")

In [None]:
# Replace every categorical column by its integer factor codes so that a
# correlation matrix can be computed, then draw it as an annotated heatmap.
df_corr = df.loc[:, categorical_columns].apply(lambda x: pd.factorize(x)[0])

fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title('Correlation heatmap of categorical variables')
sns.heatmap(df_corr.corr(), annot=True, ax=ax)

In [None]:
# Correlation heatmap of the non-categorical columns.
df_cont = df.loc[:, continous_columns]

fig, ax = plt.subplots(figsize=(10, 4))
ax.set_title('Correlation heatmap of numerical variables')
sns.heatmap(df_cont.corr(), annot=True, ax=ax)

## -> Target variable analysis

In [None]:
df.Churn.value_counts()

In [None]:
def bar_plot(df, column):
    """Draw a horizontal count plot of ``df[column]`` with percentage labels.

    Parameters
    ----------
    df : DataFrame holding the data to plot.
    column : name of the column whose values are counted (one bar per value).

    Every bar is annotated with its share of ``len(df[column])`` and the
    figure is shown immediately.
    """
    ax = sns.countplot(y=column, data=df)
    plt.title('Distribution of classes')
    # The bars are horizontal (y=column), so the x-axis carries the record count.
    plt.xlabel('Count')
    total = len(df[column])
    for p in ax.patches:
        # For horizontal bars the bar length is the patch width.
        percentage = '{:.1f}%'.format(100 * p.get_width()/total)
        x = p.get_x() + p.get_width() + 0.02   # just past the end of the bar
        y = p.get_y() + p.get_height()/2       # vertically centred on the bar
        ax.annotate(percentage, (x, y))
    plt.show()


bar_plot(df, "Churn")

##### We see that there is a class imbalance in the target variable. The success class Churn = Yes holds only 26.5 % of the total share.
##### Undersampling or Over Sampling has to be done to balance the dataset before training the model.

## -> Binary Categorical  variable analysis

In [None]:
# One count plot per binary predictor on a shared-y 2x3 grid. axes.flat walks
# the grid row by row, so the panel order matches the list order.
binary_predictors = ["gender", "SeniorCitizen", "Partner",
                     "Dependents", "PhoneService", "PaperlessBilling"]

fig, axes = plt.subplots(2, 3, figsize=(12, 7), sharey=True)
for panel, predictor in zip(axes.flat, binary_predictors):
    sns.countplot(x=predictor, data=df, ax=panel)

##### There exists a little bit of class imbalance in the dependents variable. And a huge class imbalance exists in the senior citizen and phone service variables.

##### We cannot come to any conclusion about the impact of these imbalance on the target variable. However we could analyse the target variable with each of these binary categorical variables. That could throw some light on the underlying insights.

##### We encode the values with 1 and 0, so that we could apply some math on these variables.

###  ***********************************Label Encoding************************

In [None]:
# Encode the target as integers: Yes -> 1, No -> 0.
churn_numeric = {'Yes': 1, 'No': 0}

# The result is assigned back explicitly (an inplace replace on `df.Churn`
# acts on an intermediate Series) and cast to int, because a `category`
# column cannot be averaged by the groupby(...).mean() calls that follow.
# Values that are already 0/1 pass through unchanged, so re-running the cell
# is harmless.
df["Churn"] = (
    df["Churn"]
    .astype(object)
    .map(lambda label: churn_numeric.get(label, label))
    .astype(int)
)

In [None]:
columns = df.columns

# Binary predictors are the columns with exactly two distinct values. The
# target is excluded by name rather than by slicing off the last match, so
# the selection no longer depends on the column order.
binary_cat_cols = [col for col in columns
                   if col != 'Churn' and df[col].value_counts().shape[0] == 2]

# Mean of the 0/1 target per level = churn rate for that level.
for i in binary_cat_cols:
    print(df[[i,'Churn']].groupby([i]).mean())
    print('\n')

##### The type of gender doesn't seem to play a huge role in churning since the churn rate is the same for both gender.
##### The PhoneService variable plays a tiny role in determining churning since there is only a small difference in churn rate between the factors. 
##### Rest all the Binary Categorical variables like SeniorCitizen, Partner, Dependents, PaperlessBilling impacts Churning considerably.

##### Therefore, gender and Phone service variable could be removed from our analysis.

## -> Multiple categorical variable analysis

## 1. Internet Service variable analysis

In [None]:
sns.countplot(x="InternetService", data=df)

##### Most of them seem to have Fiber optic connection. 

In [None]:
df[['InternetService','Churn']].groupby('InternetService').mean()

##### We see that customers with a fiber optic connection have the highest churn rate. There may be numerous reasons for this.
##### However, we can only analyse the data available. We can check whether the fiber optic connection is costly, which may have led to churning.

In [None]:
df[['InternetService','MonthlyCharges']].groupby('InternetService').mean()

##### YES!!!! Cost associated with fiber optic seems high. This could be a reason for churning.

In [None]:
# One count plot per internet add-on service on a shared-y 2x3 grid.
# axes.flat walks the grid row by row, so the panel order matches the list.
internet_addons = ["StreamingTV", "StreamingMovies", "OnlineSecurity",
                   "OnlineBackup", "DeviceProtection", "TechSupport"]

fig, axes = plt.subplots(2, 3, figsize=(12, 7), sharey=True)
for panel, addon in zip(axes.flat, internet_addons):
    sns.countplot(x=addon, data=df, ax=panel)

##### There exists no class imbalance among each of these predictors. As a next step, their impact on target variable could be analysed.

In [None]:
Internet_cat_cols = [
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies',
]

# Mean of the target per level of each add-on service.
for service in Internet_cat_cols:
    churn_by_level = df[[service, 'Churn']].groupby([service]).mean()
    print(churn_by_level)
    print('\n')

##### Streaming TV and Streaming Movies exhibit almost the same split-up in determining churning. A similar case exists with Online Security and Tech Support, and with Online Backup and Device Protection. These pairs could be correlated and could lead to multicollinearity. From the correlation analysis we did above, we find that their correlation is not more than 0.9. Therefore, we could keep any one of them if needed or include both in our analysis.

## 2. Phone service variable analysis

In [None]:
df.PhoneService.value_counts()

In [None]:
df.MultipleLines.value_counts()

##### The yes and no values of the PhoneService variable are already accommodated within the MultipleLines variable. Therefore, we can ignore the PhoneService variable and use the MultipleLines variable henceforth.

In [None]:
df[['MultipleLines','Churn']].groupby('MultipleLines').mean()

##### Each of the factors almost has the same rate of churning. Therefore, Including this variable in training the model would not do much help.
##### Since we have already decided to drop phone service variable, we could keep this variable hoping that it adds some value to the model.

## 3. Contract Analysis

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(x="Contract", data=df)

##### There exists more number of month to month contract customers. Usually, They have the highest possibility to churn because they can leave whenever they wish by the month end. Others have to wait for year long contract to end. We analyse if this hypothesis is true.

In [None]:
df[['Contract','Churn']].groupby('Contract').mean()

##### YES!!! MONTH TO MONTH CONTRACT CUSTOMERS HAVE THE HIGHEST CHURN RATE.

## 4. Payment Analysis

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(x="PaymentMethod", data=df)

##### There are more number of electronic check payment customers.

In [None]:
df[['PaymentMethod','Churn']].groupby('PaymentMethod').mean()

##### Surprisingly, the payment method which has the most number of customers is associated with the most number of churning.

## -> Numerical variable analysis

In [None]:
# Side-by-side histograms of the two numeric predictors:
# tenure on the left panel, MonthlyCharges on the right.
fig, axes = plt.subplots(1,2, figsize=(12, 7))
sns.histplot(x= df["tenure"], ax=axes[0])
sns.histplot(x= df["MonthlyCharges"], ax=axes[1])

##### MORE NUMBER OF CUSTOMERS LIE ON BOTH THE ENDS OF THE TENURE. We could say that many are new customers and they leave the contract soon. Also few customers stay for a long term.  The distribution drops in the middle portion. The same trend is seen in the distribution of monthly charges.

In [None]:
df[['tenure','MonthlyCharges','Churn']].groupby('Churn').mean()

##### Tenure is high for the people who don't churn and monthly charges are low for the people who don't churn. These two variables have some effect on the churn rate.

In [None]:
df[['Contract','tenure']].groupby('Contract').mean()

##### Tenure has some positive correlation with contract. We could use either one of these variables while training the model.


## -> RESULTS

### We have decided to drop the following predictors after EDA.

##### 1. Customer ID
##### 2. Gender
##### 3. Contract
##### 4. Total Charges
##### 5. Phone Service

### Reason:
##### 1. It is logically inappropriate to use customer ID to predict Customer churn.
##### 2. Gender had the same proportion split up for males and females in the number of churned customers. Therefore, Gender hasn't played a role in causing customer churn.
##### 3. When we analysed the Contract and tenure together, we found that  customers with long term contract stays for more tenure and those with short term contracts stay for short tenure. They seemed positively correlated. Therefore, we remove it to avoid redundancy.
##### 4. On exploring the dataset, we found that total charges was proportional to the product of tenure and monthly charges. Therefore, we remove it to avoid redundancy.
##### 5. The data in the phone service column is duplicated in the multiple phone service column along with an extra factor. Therefore, it is best to use the predictor with more detail. Therefore, we drop the phone service variable.

In [None]:
# Remove the predictors rejected during EDA. A new frame is bound to `df`
# instead of mutating in place, and errors="ignore" makes the cell safe to
# re-run: a second inplace drop would raise KeyError because the columns are
# already gone.
dropped_predictors = ['customerID', 'gender', 'Contract',
                      'TotalCharges', 'PhoneService']
df = df.drop(columns=dropped_predictors, errors="ignore")

# Data Pre-processing

## -> Encoding (One Hot Encoding)

In [None]:
# Categorical predictors to expand into dummy columns. drop_first=True
# removes the first level of each predictor from the output.
one_hot_columns = [
    'SeniorCitizen', 'Partner', 'Dependents',
    'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
    'PaymentMethod',
]
df1 = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)
df1

## -> Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale both numeric predictors with ONE fitted scaler. MinMaxScaler
# works column by column, so the scaled values are the same as fitting each
# column separately, but `scale` now keeps the fitted range of both columns
# (refitting the same object per column discarded the MonthlyCharges fit).
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split further down, so test-set min/max values influence the
# training features. Consider fitting on the training split only.
scaled_columns = ['MonthlyCharges', 'tenure']
scale = MinMaxScaler()
df1[scaled_columns] = scale.fit_transform(df1[scaled_columns])
df1

# Modelling
## -> Stratified Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target.
X = df1.drop(columns=['Churn'])
y = df1['Churn']

# 80/20 split, stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Class proportions of the target in each split.
for target_split in (y_train, y_test):
    print(target_split.value_counts(normalize=True))

In [None]:
X_test.info()

## -> Over Sampling the train set (SMOTE)

In [None]:
conda install -c conda-forge imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTENC

# SMOTENC needs the positions of the categorical features. Every column other
# than the two scaled numeric ones is a dummy column, so the positions are
# derived from the column labels instead of a hand-maintained index list that
# silently goes stale whenever the encoding changes.
numeric_columns = ['tenure', 'MonthlyCharges']
categorical_positions = [position
                         for position, column in enumerate(X_train.columns)
                         if column not in numeric_columns]

smotenc = SMOTENC(categorical_features=categorical_positions, random_state=101)
# Only the training split is resampled; the test split stays untouched.
X_oversample_train, y_oversample_train = smotenc.fit_resample(X_train, y_train)
print(y_oversample_train.value_counts(normalize=True))

## -> MODEL 1 : RANDOM FOREST

### ORIGINAL DATA

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Baseline random forest trained on the original (imbalanced) training split.
clf_forest = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1)
clf_forest.fit(X_train, y_train)

pred = clf_forest.predict(X_train)
print('Training set accuracy on non - over sampled data : ', accuracy_score(y_train, pred))

pred_test = clf_forest.predict(X_test)
print('Test set accuracy on non - over sampled data : ', accuracy_score(y_test, pred_test))
print('\n')

# The test predictions and the confusion matrix are computed once and reused
# for the report, the printout and the plot.
print(classification_report(y_test, pred_test))

cm = confusion_matrix(y_test, pred_test)
print('\nConfusion matrix before over sampling:')
print(cm)

# Confusion matrix as an image: rows = true label, columns = predicted label.
plt.clf()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['0', '1']
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
# plt.text takes (x, y) = (column, row).
for (i, j), count in np.ndenumerate(cm):
    plt.text(j, i, f"{s[i][j]} = {count}")
plt.show()

### OVER SAMPLED DATA

In [None]:

# Same forest configuration, trained on the oversampled training set and
# evaluated on the untouched test split.
clf_forest = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1)
clf_forest.fit(X_oversample_train, y_oversample_train)

pred = clf_forest.predict(X_oversample_train)
print('Training set accuracy on over sampled data : ', accuracy_score(y_oversample_train, pred))

pred_test = clf_forest.predict(X_test)
print('Test set accuracy on non - over sampled data : ', accuracy_score(y_test, pred_test))
print('\n')

# The test predictions and the confusion matrix are computed once and reused.
print(classification_report(y_test, pred_test))

cm = confusion_matrix(y_test, pred_test)
print('\nConfusion matrix after over sampling:')
print(cm)

# Confusion matrix as an image: rows = true label, columns = predicted label.
plt.clf()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['0', '1']
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
# plt.text takes (x, y) = (column, row).
for (i, j), count in np.ndenumerate(cm):
    plt.text(j, i, f"{s[i][j]} = {count}")
plt.show()

### Random Forest - Oversample Vs Original Data Results: 

#### Sensitivity for class 1 in Oversampled Data is 0.70 and Original Data is 0.52.
#### Hence we can conclude that oversampling gives a better classification model, with a sensitivity of 70%, although the accuracy has gone down.
#### Our goal is to identify the churning customers (class 1), so sensitivity matters more than accuracy, and oversampling has achieved that goal.

In [None]:
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# Scores for the positive class are in column 1 of predict_proba.
probs = clf_forest.predict_proba(X_test)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve with the chance diagonal as a dashed red reference line.
roc_fig, roc_ax = plt.subplots(figsize=(15, 8))
roc_ax.set_title('RANDOM FOREST ROC CURVE')
roc_ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.legend(loc='lower right')
roc_ax.set_xlim([0, 1])
roc_ax.set_ylim([0, 1])
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()
          

### Hyper parameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# 5 n_estimators values x 9 max_depth values = 45 candidates, each fitted on
# 5 folds (225 forests of 1000+ trees), selected on recall.
param_grid_rf = {'n_estimators': [1000, 1100, 1200, 1300, 1400],
                 'max_depth': np.arange(12, 30, 2)}

rf = RandomForestClassifier(random_state=1, bootstrap=True)
# n_jobs=-1 spreads the independent fits over all CPU cores; the selected
# parameters are unchanged because every fit is seeded through random_state.
# NOTE(review): the folds are cut from the already-oversampled training set,
# so synthetic samples can sit in the validation folds. Confirm whether the
# oversampling should happen inside each fold instead.
rf_random_grid = GridSearchCV(rf, param_grid_rf, cv=5, verbose=1,
                              scoring='recall', n_jobs=-1)

rf_random_grid.fit(X_oversample_train, y_oversample_train)

y_pred_rf_tuned = rf_random_grid.predict(X_test)
y_pred_rf_tuned_prob = rf_random_grid.predict_proba(X_test)

print('Best Parametrs : ', rf_random_grid.best_params_)
print('Best Score : ', rf_random_grid.best_score_)


In [None]:

# Evaluation of the tuned random forest on the untouched test split.
print(classification_report(y_test, y_pred_rf_tuned))

# The confusion matrix is computed once and reused for printout and plot.
cm = confusion_matrix(y_test, y_pred_rf_tuned)
print('\nConfusion matrix after over sampling and hyper parameter tuning:')
print(cm)

# Confusion matrix as an image: rows = true label, columns = predicted label.
plt.clf()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['0', '1']
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
# plt.text takes (x, y) = (column, row).
for (i, j), count in np.ndenumerate(cm):
    plt.text(j, i, f"{s[i][j]} = {count}")
plt.show()

# ROC curve from the positive-class scores (column 1 of predict_proba).
preds = y_pred_rf_tuned_prob[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

plt.figure(figsize=(15, 8))
plt.title('RANDOM FOREST ROC CURVE ')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')   # chance diagonal
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


## -> MODEL 2 : LOGISTIC REGRESSION

### ORIGINAL DATA

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression trained on the original training split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

logreg_pred = logreg.predict(X_train)
print('Training set accuracy on Non - over sampled data : ', accuracy_score(y_train, logreg_pred))

logreg_pred_test = logreg.predict(X_test)
print('Test set accuracy on Non - over sampled data : ', accuracy_score(y_test, logreg_pred_test))
print('\n')

# The test predictions are reused instead of calling predict a second time.
print(classification_report(y_test, logreg_pred_test))

print('Confusion matrix before over sampling:')
cm_logreg = confusion_matrix(y_test, logreg_pred_test)
print(cm_logreg)

# Confusion matrix as an image: rows = true label, columns = predicted label.
plt.clf()
plt.imshow(cm_logreg, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['0', '1']
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
# plt.text takes (x, y) = (column, row).
for (i, j), count in np.ndenumerate(cm_logreg):
    plt.text(j, i, f"{s[i][j]} = {count}")
plt.show()

### OVER SAMPLED DATA


In [None]:
# Logistic regression trained on the oversampled training set and evaluated
# on the untouched test split.
logreg = LogisticRegression()
logreg.fit(X_oversample_train, y_oversample_train)

logreg_pred = logreg.predict(X_oversample_train)
print('Training set accuracy on over sampled data : ', accuracy_score(y_oversample_train, logreg_pred))

logreg_pred_test = logreg.predict(X_test)
print('Test set accuracy on Non - over sampled data : ', accuracy_score(y_test, logreg_pred_test))
print('\n')

# The test predictions are reused instead of calling predict a second time.
print(classification_report(y_test, logreg_pred_test))

print('Confusion matrix after over sampling:')
cm_logreg = confusion_matrix(y_test, logreg_pred_test)
print(cm_logreg)

# Confusion matrix as an image: rows = true label, columns = predicted label.
plt.clf()
plt.imshow(cm_logreg, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['0', '1']
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
# plt.text takes (x, y) = (column, row).
for (i, j), count in np.ndenumerate(cm_logreg):
    plt.text(j, i, f"{s[i][j]} = {count}")
plt.show()

### Logistic Regression - Oversample Vs Original Data Results: 

#### Sensitivity for class 1 in Oversampled Data is 0.72 and Original Data is 0.51.
#### Hence we can conclude that oversampling gives a better classification model, with a sensitivity of 72%, although the accuracy has gone down.
#### Our goal is to identify the churning customers (class 1), so sensitivity matters more than accuracy, and oversampling has achieved that goal.
    

In [None]:
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# Scores for the positive class are in column 1 of predict_proba.
probs = logreg.predict_proba(X_test)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve with the chance diagonal as a dashed red reference line.
roc_fig, roc_ax = plt.subplots(figsize=(15, 8))
roc_ax.set_title('LOGISTIC REGRESSION ROC CURVE')
roc_ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.legend(loc='lower right')
roc_ax.set_xlim([0, 1])
roc_ax.set_ylim([0, 1])
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()




### Hyper parameter tuning

In [None]:
# Grid: 3 C values x 2 class_weight options x 5 solvers = 30 candidates,
# each fitted on 5 folds and selected on recall.
param_grid = [{'C': [0.001, 0.1, 1.0], 'class_weight': [None, 'balanced'],
               'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}]

logreg_tuned = LogisticRegression(max_iter=1000)
logreg_tuned_gs = GridSearchCV(logreg_tuned, param_grid, cv=5, verbose=1, scoring='recall')

logreg_tuned_gs.fit(X_oversample_train, y_oversample_train)

y_pred_logreg_tuned = logreg_tuned_gs.predict(X_test)
y_pred_logreg_tuned_prob = logreg_tuned_gs.predict_proba(X_test)

print('Best Parameters : ', logreg_tuned_gs.best_params_)

print(classification_report(y_test, y_pred_logreg_tuned))

# The confusion matrix is computed once and reused for printout and plot.
cm = confusion_matrix(y_test, y_pred_logreg_tuned)
print('\nConfusion matrix after over sampling and hyper parameter tuning:')
print(cm)

# Confusion matrix as an image: rows = true label, columns = predicted label.
plt.clf()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['0', '1']
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
# plt.text takes (x, y) = (column, row).
for (i, j), count in np.ndenumerate(cm):
    plt.text(j, i, f"{s[i][j]} = {count}")
plt.show()

# ROC curve from the positive-class scores (column 1 of predict_proba).
preds = y_pred_logreg_tuned_prob[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

plt.figure(figsize=(15, 8))
plt.title('logistic regression ROC CURVE ')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')   # chance diagonal
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


## -> MODEL 3 : SVM

### ORIGINAL DATA

In [None]:
from sklearn.svm import SVC

# Baseline RBF-kernel SVM trained on the original training split.
# probability=True is required for the predict_proba calls used for ROC curves.
SVM = SVC(kernel='rbf', probability=True)
SVM.fit(X_train, y_train)

svm_pred = SVM.predict(X_train)
print('Training set accuracy on Non - over sampled data : ', accuracy_score(y_train, svm_pred))

svm_pred_test = SVM.predict(X_test)
print('Test set accuracy on Non - over sampled data : ', accuracy_score(y_test, svm_pred_test))
print('\n')

# The test predictions are reused instead of calling predict a second time.
print(classification_report(y_test, svm_pred_test))

print('Confusion matrix before over sampling : ')
cm_svm = confusion_matrix(y_test, svm_pred_test)
print(cm_svm)

# Confusion matrix as an image: rows = true label, columns = predicted label.
plt.clf()
plt.imshow(cm_svm, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['0', '1']
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
# plt.text takes (x, y) = (column, row).
for (i, j), count in np.ndenumerate(cm_svm):
    plt.text(j, i, f"{s[i][j]} = {count}")
plt.show()

### OVER SAMPLED DATA

In [None]:
# RBF-kernel SVM trained on the oversampled training set and evaluated on
# the untouched test split.
SVM = SVC(kernel='rbf', probability=True)
SVM.fit(X_oversample_train, y_oversample_train)

svm_pred = SVM.predict(X_oversample_train)
print('Training set accuracy on over sampled data : ', accuracy_score(y_oversample_train, svm_pred))

svm_pred_test = SVM.predict(X_test)
print('Test set accuracy on Non - over sampled data : ', accuracy_score(y_test, svm_pred_test))
print('\n')

# The test predictions are reused instead of calling predict a second time.
print(classification_report(y_test, svm_pred_test))

# This model was trained on the oversampled data, so the matrix is the
# "after" one (the label previously said "before").
print('Confusion matrix after over sampling : ')
cm_svm = confusion_matrix(y_test, svm_pred_test)
print(cm_svm)

# Confusion matrix as an image: rows = true label, columns = predicted label.
plt.clf()
plt.imshow(cm_svm, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['0', '1']
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
# plt.text takes (x, y) = (column, row).
for (i, j), count in np.ndenumerate(cm_svm):
    plt.text(j, i, f"{s[i][j]} = {count}")
plt.show()



### SVM - Oversample Vs Original Data Results: 

#### Sensitivity for class 1 in Oversampled Data is 0.69 and Original Data is 0.46.
#### Hence we can conclude that oversampling gives a better classification model, with a sensitivity of 69%, although the accuracy has gone down.
#### Our goal is to identify the churning customers (class 1), so sensitivity matters more than accuracy, and oversampling has achieved that goal.

In [None]:
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# Scores for the positive class are in column 1 of predict_proba.
probs = SVM.predict_proba(X_test)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve with the chance diagonal as a dashed red reference line.
roc_fig, roc_ax = plt.subplots(figsize=(15, 8))
roc_ax.set_title('SVM ROC CURVE')
roc_ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.legend(loc='lower right')
roc_ax.set_xlim([0, 1])
roc_ax.set_ylim([0, 1])
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()




### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid: 4 C values x 5 gamma values = 20 candidates on 5 folds, selected on
# recall. `degree` is intentionally NOT part of the grid: it only affects the
# polynomial kernel, so searching six degrees with kernel='rbf' multiplied
# the number of (slow, probability=True) fits by six without changing any
# model.
param_grid = [{'C': [0.1, 1, 100, 1000], 'kernel': ['rbf'],
               'gamma': [1, 0.1, 0.01, 0.001, 0.0001]}]

svm_tuned = SVC(probability=True)
svm_tuned_gs = GridSearchCV(svm_tuned, param_grid, cv=5, verbose=1, scoring='recall')

svm_tuned_gs.fit(X_oversample_train, y_oversample_train)

y_pred_svm_tuned = svm_tuned_gs.predict(X_test)
y_pred_svm_tuned_prob = svm_tuned_gs.predict_proba(X_test)

print('Best Parameters : ', svm_tuned_gs.best_params_)

print(classification_report(y_test, y_pred_svm_tuned))

# The confusion matrix is computed once and reused for printout and plot.
cm = confusion_matrix(y_test, y_pred_svm_tuned)
print('\nConfusion matrix after over sampling and hyper parameter tuning:')
print(cm)

# Confusion matrix as an image: rows = true label, columns = predicted label.
plt.clf()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['0', '1']
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
# plt.text takes (x, y) = (column, row).
for (i, j), count in np.ndenumerate(cm):
    plt.text(j, i, f"{s[i][j]} = {count}")
plt.show()

# ROC curve from the positive-class scores (column 1 of predict_proba).
preds = y_pred_svm_tuned_prob[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

plt.figure(figsize=(15, 8))
# This is the SVM section; the title previously said "logistic regression".
plt.title('SVM ROC CURVE')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')   # chance diagonal
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


## -> MODEL 4 : KNN

### ORIGINAL DATA

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): this cell rebuilds df1, X/y, the train/test split and the
# oversampled training set with drop_first=False, overwriting the variables
# the earlier models were trained on. Every later cell (and any earlier cell
# that is re-run) sees this wider feature set. Confirm that this is intended.
df1 = pd.get_dummies(df,
                     columns=['SeniorCitizen', 'Partner', 'Dependents',
                              'MultipleLines', 'InternetService', 'OnlineSecurity',
                              'OnlineBackup', 'DeviceProtection', 'TechSupport',
                              'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
                              'PaymentMethod'], drop_first=False)

# Scale both numeric predictors with one fitted scaler (per-column results
# are the same as fitting each column separately).
scaled_columns = ['MonthlyCharges', 'tenure']
scale = MinMaxScaler()
df1[scaled_columns] = scale.fit_transform(df1[scaled_columns])

X = df1.drop(['Churn'], axis=1)  # features (independent variables)
y = df1['Churn']                 # target (dependent variable)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=y, random_state=42)
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

# Positions of the categorical (dummy) features are derived from the column
# labels: every column other than the two scaled numeric ones. This replaces
# a hand-written index list that had to match the dummy count exactly.
categorical_positions = [position
                         for position, column in enumerate(X_train.columns)
                         if column not in scaled_columns]
smotenc = SMOTENC(categorical_features=categorical_positions, random_state=101)
X_oversample_train, y_oversample_train = smotenc.fit_resample(X_train, y_train)
print(y_oversample_train.value_counts(normalize=True))

# Baseline KNN (default settings) trained on the original training split.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

knn_pred = knn.predict(X_train)
print('Training set accuracy on non-oversampled data : ', accuracy_score(y_train, knn_pred))

knn_pred_test = knn.predict(X_test)
print('Test set accuracy on non-oversampled data : ', accuracy_score(y_test, knn_pred_test))
print('\n')

# The test predictions are reused instead of calling predict a second time.
print(classification_report(y_test, knn_pred_test))
print('\n')

cm_knn = confusion_matrix(y_test, knn_pred_test)
print('Confusion Matrix before oversampling :')
print(cm_knn)

# Confusion matrix as an image: rows = true label, columns = predicted label.
plt.clf()
plt.imshow(cm_knn, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['0', '1']
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
# plt.text takes (x, y) = (column, row).
for (i, j), count in np.ndenumerate(cm_knn):
    plt.text(j, i, f"{s[i][j]} = {count}")
plt.show()



### OVER SAMPLED DATA

In [None]:
# KNN (default settings) trained on the oversampled training set and
# evaluated on the untouched test split.
knn = KNeighborsClassifier()
knn.fit(X_oversample_train, y_oversample_train)

knn_pred = knn.predict(X_oversample_train)
print('Training set accuracy on oversampled data : ', accuracy_score(y_oversample_train, knn_pred))

knn_pred_test = knn.predict(X_test)
print('Test set accuracy on non-oversampled data : ', accuracy_score(y_test, knn_pred_test))

# The test predictions are reused instead of calling predict a second time.
print(classification_report(y_test, knn_pred_test))
print('\n')

print('Confusion matrix after oversampliing :')
cm_knn = confusion_matrix(y_test, knn_pred_test)
print(cm_knn)

# Confusion matrix as an image: rows = true label, columns = predicted label.
plt.clf()
plt.imshow(cm_knn, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['0', '1']
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
# plt.text takes (x, y) = (column, row).
for (i, j), count in np.ndenumerate(cm_knn):
    plt.text(j, i, f"{s[i][j]} = {count}")
plt.show()



### KNN - Oversample Vs Original Data Results: 

#### Sensitivity for class 1 in Oversampled Data is 0.68 and Original Data is 0.46.
#### Hence we can conclude that oversampling gives a better classification model, with a sensitivity of 68%, although the accuracy has gone down.
#### Our goal is to identify the churning customers (class 1), so sensitivity matters more than accuracy, and oversampling has achieved that goal.

In [None]:
# ROC curve of the k-NN model currently bound to `knn`, scored on the test split.
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# Column 1 of predict_proba is used as the score for the positive class.
probs = knn.predict_proba(X_test)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(15, 8))
ax.set_title('KNN ROC CURVE')
ax.plot(fpr, tpr, 'b', label=f'AUC = {roc_auc:0.2f}')
ax.legend(loc='lower right')
# Dashed diagonal: the chance-level reference line.
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()



### Hyperparameter tuning

In [None]:
# Tune n_neighbors (1..29) for k-NN with a 5-fold grid search on the
# oversampled training split, selecting the candidate with the best recall.
param_grid = {'n_neighbors':np.arange(1,30)}

# GridSearchCV fits clones of the estimator it is given, never the object
# itself. The unfitted template therefore gets its own name so that the fitted
# `knn` from the earlier cells is not replaced by an unfitted classifier
# (which would make the ROC cell fail if it were re-run after this one).
knn_base = KNeighborsClassifier()
knn_cv = GridSearchCV(knn_base,param_grid,cv=5,verbose=1,scoring='recall')
knn_cv.fit(X_oversample_train,y_oversample_train)

# Predictions of the refitted best estimator on the original test split.
y_pred_knn_tuned = knn_cv.predict(X_test)
y_pred_knn_tuned_prob = knn_cv.predict_proba(X_test)

print('Best Parameters : ',knn_cv.best_params_)



In [None]:
# Test-split evaluation of the tuned k-NN: text report, raw confusion
# matrix, then an annotated heat map of the same matrix.
print(classification_report(y_test,y_pred_knn_tuned))


# Build the matrix once; it is printed here and drawn below.
cm = confusion_matrix(y_test, y_pred_knn_tuned)
print('\nConfusion matrix after over sampling and hyper parameter tuning:')
print(cm)

##########################################################

plt.clf()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['0','1']
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames)
plt.yticks(tick_marks, classNames)
plt.xlabel('Predicted label')
plt.ylabel('True label')
s = [['TN','FP'], ['FN', 'TP']]
# Rows are true labels, columns are predicted labels; plt.text takes (x, y).
for row, row_tags in enumerate(s):
    for col, tag in enumerate(row_tags):
        plt.text(col, row, f"{tag} = {cm[row][col]}")
plt.show()
#############################################################




# -> COST CALCULATION


### cost calculation -> Logistic regression

In [None]:
from sklearn.model_selection import KFold,cross_val_predict

# Net-gain curve for the tuned logistic regression: for each probability
# threshold in [0, 1], turn out-of-fold churn probabilities on the training
# split into 0/1 predictions and price the resulting confusion matrix.
#
# Cost model, per customer (confusion matrix rows = true, columns = predicted):
#   false positive -> costs FP_COST
#   true positive  -> costs TP_COST and saves TP_SAVING
#   false negative -> costs FN_COST
# NOTE(review): the business meaning of these amounts is not stated in the
# notebook -- confirm the figures with whoever owns the cost model.
FP_COST = 100
TP_COST = 100
FN_COST = 500
TP_SAVING = 500

logistic_gain = []
i_vals = []
cost = []
savings = []

logreg_tuned_gs.fit(X_oversample_train,y_oversample_train)
pred = logreg_tuned_gs.predict(X_test)

# The out-of-fold probabilities do not depend on the threshold, so they are
# computed once, outside the loop, with a seeded shuffle. Every threshold is
# then scored against the same predictions and the curve is reproducible.
folds = KFold(n_splits=5, shuffle=True, random_state=42)
probs = cross_val_predict(logreg_tuned_gs.best_estimator_, X_train, y_train, cv=folds,
                          method='predict_proba', n_jobs=-1)
probs = pd.DataFrame(probs)
churn_prob = probs[1]  # column 1 is used as the probability of class 1

for cutoff in np.linspace(0, 1, 101):
    # Strictly greater than the cutoff counts as a predicted churner.
    new_pred = (churn_prob > cutoff).astype(int)
    conf = confusion_matrix(y_train, new_pred)

    total_cost = (conf[0][1] * FP_COST) + (conf[1][1] * TP_COST) + (conf[1][0] * FN_COST)
    total_savings = conf[1][1] * TP_SAVING

    net_gain = total_savings - total_cost
    logistic_gain.append(net_gain)
    i_vals.append(cutoff)
    cost.append(total_cost)
    savings.append(total_savings)

print(f'Max net gain = {max(logistic_gain)}')

In [None]:
# Net gain of the logistic-regression model against the decision threshold,
# shown for thresholds 0 to 0.5 only.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(i_vals, logistic_gain, label='Logistic Regression', color='green', lw=2)
ax.axhline(y=0, color='black')
ax.set_xlabel('Probability Threshold', fontsize=16, labelpad=10)
ax.set_ylabel('$ Saved', rotation=90, fontsize=16, labelpad=10)

# Fix the tick positions first and only then assign the label texts.
# Labels are matched to ticks by position, so labelling automatically placed
# ticks would attach these strings to the wrong values.
ax.set_yticks([-100000, 0, 100000, 200000, 300000])
ax.set_yticklabels(['-100k', '0', '100k', '200k', '300k'], fontsize=12)
ax.set_xticks([0, 0.1, 0.2, 0.3, 0.4, 0.5])
ax.set_xticklabels(['0', '0.1', '0.2', '0.3', '0.4', '0.5'], fontsize=12)

ax.set_xbound(lower=0, upper=0.5)
ax.set_ybound(lower=-100000, upper=300000)
# The curve carries a label, so draw the legend that displays it.
ax.legend(loc='best')

### cost calculation -> KNN


In [None]:

# Net-gain curve for the tuned k-NN model: for each probability threshold in
# [0, 1], turn out-of-fold churn probabilities on the training split into 0/1
# predictions and price the resulting confusion matrix.
#
# Cost model, per customer (confusion matrix rows = true, columns = predicted):
#   false positive -> costs FP_COST
#   true positive  -> costs TP_COST and saves TP_SAVING
#   false negative -> costs FN_COST
# NOTE(review): the business meaning of these amounts is not stated in the
# notebook -- confirm the figures with whoever owns the cost model.
FP_COST = 100
TP_COST = 100
FN_COST = 500
TP_SAVING = 500

# The result lists keep the names `logistic_gain` / `i_vals` even though they
# hold k-NN results, because the plotting cell that follows reads those names.
logistic_gain = []
i_vals = []
cost = []
savings = []

knn_cv.fit(X_oversample_train,y_oversample_train)
pred = knn_cv.predict(X_test)

# The out-of-fold probabilities do not depend on the threshold, so they are
# computed once, outside the loop, with a seeded shuffle. Every threshold is
# then scored against the same predictions and the curve is reproducible.
folds = KFold(n_splits=5, shuffle=True, random_state=42)
probs = cross_val_predict(knn_cv.best_estimator_, X_train, y_train, cv=folds,
                          method='predict_proba', n_jobs=-1)
probs = pd.DataFrame(probs)
churn_prob = probs[1]  # column 1 is used as the probability of class 1

for cutoff in np.linspace(0, 1, 101):
    # Strictly greater than the cutoff counts as a predicted churner.
    new_pred = (churn_prob > cutoff).astype(int)
    conf = confusion_matrix(y_train, new_pred)

    total_cost = (conf[0][1] * FP_COST) + (conf[1][1] * TP_COST) + (conf[1][0] * FN_COST)
    total_savings = conf[1][1] * TP_SAVING

    net_gain = total_savings - total_cost
    logistic_gain.append(net_gain)
    i_vals.append(cutoff)
    cost.append(total_cost)
    savings.append(total_savings)

print(f'Max net gain = {max(logistic_gain)}')

In [None]:
# Net gain of the k-NN model against the decision threshold, shown for
# thresholds 0 to 0.5 only. `logistic_gain` holds the k-NN results here: the
# preceding k-NN cost cell refills that list.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(i_vals, logistic_gain, label='KNN', color='green', lw=2)
ax.axhline(y=0, color='black')
ax.set_xlabel('Probability Threshold', fontsize=16, labelpad=10)
ax.set_ylabel('$ Saved', rotation=90, fontsize=16, labelpad=10)

# Fix the tick positions first and only then assign the label texts.
# Labels are matched to ticks by position, so labelling automatically placed
# ticks would attach these strings to the wrong values.
ax.set_yticks([-100000, 0, 100000, 200000, 300000])
ax.set_yticklabels(['-100k', '0', '100k', '200k', '300k'], fontsize=12)
ax.set_xticks([0, 0.1, 0.2, 0.3, 0.4, 0.5])
ax.set_xticklabels(['0', '0.1', '0.2', '0.3', '0.4', '0.5'], fontsize=12)

ax.set_xbound(lower=0, upper=0.5)
ax.set_ybound(lower=-100000, upper=300000)
# The curve carries a label, so draw the legend that displays it.
ax.legend(loc='best')

### WHEN WE THINK NO CUSTOMERS WILL CHURN

In [None]:
# Display the shape of the training target; the baseline prediction vectors
# built further down must have the same number of rows.
y_train.shape

In [None]:
# Baseline: predict class 0 (no churn) for every training row and price the
# resulting confusion matrix with the same cost model as the model cells.
# The vector length comes from y_train itself rather than a hard-coded row
# count, so the cell keeps working if the train/test split changes.
new_pred_zeros = np.zeros(shape=(len(y_train),))

conf = confusion_matrix(y_train, new_pred_zeros)

# Rows of conf are true labels, columns are predictions:
# conf[0][1] = FP, conf[1][1] = TP, conf[1][0] = FN.
total_cost = (conf[0][1] * 100) + (conf[1][1] * 100) + conf[1][0] * 500
total_savings = conf[1][1] * 500

net_gain = total_savings - total_cost
print(net_gain)

In [None]:
# Display the confusion matrix most recently assigned to `conf`.
conf

### WHEN WE THINK ALL CUSTOMERS WILL CHURN

In [None]:
# Baseline: predict class 1 (churn) for every training row and price the
# resulting confusion matrix with the same cost model as the model cells.
# The vector length comes from y_train itself rather than a hard-coded row
# count, so the cell keeps working if the train/test split changes.
new_pred_ones = np.ones(shape=(len(y_train),))

conf = confusion_matrix(y_train, new_pred_ones)

# Rows of conf are true labels, columns are predictions:
# conf[0][1] = FP, conf[1][1] = TP, conf[1][0] = FN.
total_cost = (conf[0][1] * 100) + (conf[1][1] * 100) + conf[1][0] * 500
total_savings = conf[1][1] * 500

net_gain = total_savings - total_cost
print(net_gain)

In [None]:
# Display the confusion matrix most recently assigned to `conf`.
conf