In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib as mlt
import seaborn as sns
import scipy
import matplotlib.pyplot as plt
import os
from string import ascii_letters
from pylab import rcParams

In [None]:
# Load Libraries
from fancyimpute import KNN
from scipy.stats import chi2_contingency
from random import randrange, uniform
from sklearn.model_selection import train_test_split

In [None]:
# Set working directory
# The project folder can be overridden through the PROJECT_DIR environment
# variable; the original location stays the default so existing runs are
# unaffected.
project_dir = os.environ.get("PROJECT_DIR", "/Users/ad/Desktop/Project 1")
os.chdir(project_dir)

# Check working directory
os.getcwd()

In [None]:
# Load Data to python
# Both CSV files are read relative to the current working directory.
train = pd.read_csv("Train_data.csv")
test = pd.read_csv("Test_data.csv")

In [None]:
# Stack the two files into one frame so they are explored and preprocessed
# together. DataFrame.append was removed in pandas 2.0; pd.concat is the
# supported replacement. ignore_index=True gives every row a unique label
# instead of repeating each file's own row numbers.
data = pd.concat([train, test], ignore_index=True)

# Exploratory Data Analysis

In [None]:
data.head(5)

In [None]:
data.shape

In [None]:
data.dtypes

In [None]:
data.columns

In [None]:
data.describe()

In [None]:
# Assigning codes to each state
# Codes are handed out in order of first appearance in the 'state' column.
state_codes = {
    state: code
    for code, state in enumerate(data['state'].unique().tolist())
}
data['state'] = data['state'].map(state_codes)

In [None]:
# Separate Continuous and Categorical Variables
# Excluding phone.number variable and dependent variable

# Continuous predictors
cnames = [
    "account.length",
    "number.vmail.messages",
    "total.day.minutes",
    "total.day.calls",
    "total.day.charge",
    "total.eve.minutes",
    "total.eve.calls",
    "total.eve.charge",
    "total.night.minutes",
    "total.night.calls",
    "total.night.charge",
    "total.intl.minutes",
    "total.intl.calls",
    "total.intl.charge",
    "number.customer.service.calls",
]

# Categorical predictors
cat_names = [
    "state",
    "area.code",
    "international.plan",
    "voice.mail.plan",
]

In [None]:
# Assigning levels to the categories
# Every object-dtype column is replaced by its integer category codes
# (stored back as object dtype). `lis` collects the names of the converted
# columns.
lis = []
for col in data.columns:
    if data[col].dtype == 'object':
        # Assign through the column label instead of data.iloc[:, i] = ...:
        # recent pandas writes iloc assignments into the existing object
        # column in place, so the column never becomes categorical and a
        # following `.cat.codes` lookup fails.
        codes = pd.Categorical(data[col]).codes
        data[col] = codes.astype('object')

        lis.append(col)

In [None]:
## Checking correlation values of continuous variables
# NOTE(review): corr() is called on the whole frame here, not only on the
# columns listed in cnames — confirm that is intended.
corr = data.corr()
corr.style.background_gradient()

In [None]:
## Checking correlations of continuous variables
# Correlation plot restricted to the columns listed in cnames
df_corr = data[cnames]
corr = df_corr.corr()

tick_labels = corr.columns.values
sns.heatmap(corr, xticklabels=tick_labels, yticklabels=tick_labels)

In [None]:
# Checking dependency of dependent variable on categorical variables
# For each categorical variable, print its name followed by the p-value of a
# chi-square test on its contingency table with 'Churn'.
# Original author's finding: variable area.code not significant
for cat_var in cat_names:
    print(cat_var)
    contingency = pd.crosstab(data['Churn'], data[cat_var])
    _, p_value, _, _ = chi2_contingency(contingency)
    print(p_value)

In [None]:
# Checking counts of Target variable
plt.figure(figsize=(9, 6))
sns.set(font_scale=1)
sns.countplot(x='Churn', data=data)
plt.xlabel('Churn', fontsize=20)
plt.ylabel('Counts', fontsize=20)

In [None]:
# Check Number of Voicemail Messages by Class
# DataFrame.hist opens its own figure, so the size is passed through
# `figsize`; a separate plt.figure() call only produced an extra empty figure.
axes = data.hist('number.vmail.messages', by='Churn', figsize=(10, 15))
# Label every panel, not just whichever axes happens to be current
for ax in np.ravel(axes):
    ax.set_ylabel('Count', fontsize=20)

In [None]:
# Plot total.intl.calls by class
# DataFrame.hist opens its own figure, so the size is passed through
# `figsize`; a separate plt.figure() call only produced an extra empty figure.
axes = data.hist('total.intl.calls', by='Churn', figsize=(10, 15))
# Label every panel, not just whichever axes happens to be current
for ax in np.ravel(axes):
    ax.set_ylabel('Count', fontsize=20)

In [None]:
# Plot number.customer.service.calls by class
# DataFrame.hist opens its own figure, so the size is passed through
# `figsize`; a separate plt.figure() call only produced an extra empty figure.
axes = data.hist('number.customer.service.calls', by='Churn', figsize=(10, 15))
# Label every panel, not just whichever axes happens to be current
for ax in np.ravel(axes):
    ax.set_ylabel('Count', fontsize=20)

In [None]:
# Plot of States
plt.figure(figsize=(15, 10))
# The column is passed as x=...: current seaborn no longer accepts it as a
# bare positional argument, and this matches the 'Churn' count plot.
sns.countplot(x='state', data=data)
plt.xlabel('State', fontsize=20)
plt.ylabel('Count', fontsize=20)

# MISSING VALUE ANALYSIS

In [None]:
# Checking Missing Values
# One row per column, holding that column's count of null entries
null_counts = data.isnull().sum()
missing_val = null_counts.to_frame()
missing_val
# No missing values

# Outlier Analysis

In [None]:
# Boxplot to visualize outliers
# Only 'total.intl.minutes' is plotted here.

%matplotlib inline

plt.boxplot(data['total.intl.minutes'])

In [None]:
# Detect outliers and blank them out
# For every column in cnames, values outside the 1.5 * IQR fences are
# replaced by NaN (the rows themselves are kept). The column name and both
# fences are printed.
for i in cnames:
    print(i)
    q75, q25 = np.percentile(data.loc[:, i], [75, 25])
    iqr = q75 - q25

    # Named lower/upper rather than min/max: in a notebook those assignments
    # would shadow the Python builtins for every later cell.
    lower = q25 - (iqr * 1.5)
    upper = q75 + (iqr * 1.5)

    print(lower)
    print(upper)
    data.loc[data.loc[:, i] < lower, i] = np.nan
    data.loc[data.loc[:, i] > upper, i] = np.nan

In [None]:
# Calculate the missing values
null_mask = data.isnull()
# Per-column null counts, kept for display in the next cell
missing_val = pd.DataFrame(null_mask.sum())
# Per-column flag: does the column contain any null at all?
print(null_mask.any())

In [None]:
missing_val

In [None]:
# Impute missing values with KNN (k = 3)
# The imputer receives the whole frame, and its output is wrapped in a new
# DataFrame that reuses the original column names.
# NOTE(review): 'Churn' and 'phone.number' are still columns of `data` at this
# point, so they take part in the imputation — confirm that is intended.
data = pd.DataFrame(KNN(k = 3).fit_transform(data), columns = data.columns)

# Feature Selection

In [None]:
# Keep a copy of the imputed frame so it can be restored without re-running
# the cells above.
data_copy = data.copy()
# To restore the backup, run:
#data = data_copy

In [None]:
# Drop Variables
# Columns removed before modelling
dropped_columns = [
    "phone.number",
    "area.code",
    "total.day.charge",
    "total.eve.charge",
    "total.night.charge",
    "total.intl.charge",
]
data = data.drop(columns=dropped_columns)

In [None]:
data.shape

# Feature Scaling

In [None]:
data.head()

In [None]:
# Normality Check
# Histogram of 'number.customer.service.calls' with automatic bin selection.
%matplotlib inline
plt.hist(data['number.customer.service.calls'], bins='auto')

In [None]:
# Columns that get min-max normalisation
not_norm = [
    "number.vmail.messages",
    "number.customer.service.calls",
]

In [None]:
# Normalisation
# Min-max scale each listed column: (x - min) / (max - min)
for col in not_norm:
    print(col)
    col_min = data[col].min()
    col_range = data[col].max() - col_min
    data[col] = (data[col] - col_min) / col_range

In [None]:
data.shape

In [None]:
# Columns that get z-score standardisation
var_norm = [
    "account.length",
    "total.day.minutes",
    "total.day.calls",
    "total.eve.minutes",
    "total.eve.calls",
    "total.night.minutes",
    "total.night.calls",
    "total.intl.minutes",
]

In [None]:
# Standardization
# Z-score each listed column: (x - mean) / std
for col in var_norm:
    print(col)
    centred = data[col] - data[col].mean()
    data[col] = centred / data[col].std()

In [None]:
data.shape

# MODELLING

In [None]:
# Divide the data into train and test
from sklearn.model_selection import train_test_split

# Positional split of the value matrix: columns 0-13 are the predictors and
# column 14 is the target.
# NOTE(review): assumes 'Churn' sits at position 14 — confirm against
# data.columns.
X = data.values[:, 0:14]  # Independent variables
Y = data.values[:, 14]    # Dependent variable

# random_state pins the partition so results are repeatable across runs;
# stratify keeps the class ratio of Y the same in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [None]:
# Oversampling of minor class target variable
from imblearn.over_sampling import SMOTE

# Seeded so the synthetic samples are the same on every run
smote = SMOTE(random_state=42)
print("Before oversampling, count of class '1': {}".format(sum(Y_train == 1)))
print("Before oversampling, count of class '0': {}".format(sum(Y_train == 0)))

In [None]:
# Oversample the training split.
# fit_sample was removed from imbalanced-learn; fit_resample is the current
# name for the same operation.
X_train_over, Y_train_over = smote.fit_resample(X_train, Y_train.ravel())

In [None]:
# Class balance of the training split after oversampling
count_class_1 = sum(Y_train_over == 1)
count_class_0 = sum(Y_train_over == 0)
print("After oversampling, count of class '1': {}".format(count_class_1))
print("After oversampling, count of class '0': {}".format(count_class_0))

In [None]:
# Class balance of the test split before oversampling
count_class_1 = sum(Y_test == 1)
count_class_0 = sum(Y_test == 0)
print("Before oversampling, count of class '1': {}".format(count_class_1))
print("Before oversampling, count of class '0': {}".format(count_class_0))

In [None]:
# Oversample the test split.
# fit_sample was removed from imbalanced-learn; fit_resample is the current
# name for the same operation.
# NOTE(review): every model below is scored on this resampled test set, i.e.
# partly on synthetic rows. Evaluating on the untouched X_test / Y_test would
# reflect real-world class balance — confirm which is wanted.
X_test_over, Y_test_over = smote.fit_resample(X_test, Y_test.ravel())

In [None]:
# Class balance of the test split after oversampling
count_class_1 = sum(Y_test_over == 1)
count_class_0 = sum(Y_test_over == 0)
print("After oversampling, count of class '1': {}".format(count_class_1))
print("After oversampling, count of class '0': {}".format(count_class_0))

# Decision Tree

In [None]:
#Import Libraries
from sklearn import tree
from sklearn.metrics import accuracy_score

In [None]:
# Fit an entropy-criterion decision tree on the oversampled training data.
# random_state pins the tree's internal randomness so repeated runs build the
# same tree.
clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=42).fit(X_train_over, Y_train_over)

In [None]:
# Predict new test cases
# Predictions are made on the oversampled test features.
y_pred = clf.predict(X_test_over)

In [None]:
# Create dot file to visualize tree ( http://webgraphviz.com/)
#dotfile= open("pt.dot", 'w')
#df = tree.export_graphviz(clf, out_file= dotfile, feature_names= data.columns)

In [None]:
# Checking the accuracy
accuracy_score(Y_test_over, y_pred)*100

In [None]:
# Build the Confusion Matrix
# Arguments are (actual labels, predicted labels).
from sklearn.metrics import confusion_matrix
CM = confusion_matrix(Y_test_over, y_pred)

In [None]:
CM

In [None]:
# Build Confusion Matrix (rows = actual class, columns = predicted class)
CM = pd.crosstab(Y_test_over, y_pred)

# Store TP,TN,FP,FN values
TN = CM.iloc[0,0]
FN = CM.iloc[1,0]
TP = CM.iloc[1,1]
FP = CM.iloc[0,1]

# A notebook cell only displays its final expression, so every metric is
# printed explicitly instead of being left as a bare expression.
# All values are percentages.
print("Accuracy   : {:.2f}".format(((TN + TP) * 100) / (TN + TP + FN + FP)))
print("FNR        : {:.2f}".format((FN * 100) / (FN + TP)))
print("Recall     : {:.2f}".format((TP * 100) / (TP + FN)))
print("Specificity: {:.2f}".format((TN * 100) / (TN + FP)))

# Random Forest

In [None]:
# Import Libraries
from sklearn.ensemble import RandomForestClassifier

# 500-tree random forest fitted on the oversampled training data.
# random_state pins the bootstrap/feature sampling so runs are repeatable.
RF_model = RandomForestClassifier(n_estimators=500, random_state=42).fit(X_train_over, Y_train_over)

In [None]:
RF_predictions = RF_model.predict(X_test_over)

In [None]:
# Build Confusion Matrix (rows = actual class, columns = predicted class)
CM = pd.crosstab(Y_test_over, RF_predictions)

# Store TP,TN,FP,FN values
TN = CM.iloc[0,0]
FN = CM.iloc[1,0]
TP = CM.iloc[1,1]
FP = CM.iloc[0,1]

# Report the metrics instead of leaving them commented out; a notebook cell
# only displays its final expression, so each one is printed.
# All values are percentages.
print("Accuracy   : {:.2f}".format(((TN + TP) * 100) / (TN + FN + TP + FP)))
print("FNR        : {:.2f}".format((FN * 100) / (FN + TP)))
print("Recall     : {:.2f}".format((TP * 100) / (TP + FN)))
print("Specificity: {:.2f}".format((TN * 100) / (TN + FP)))

# Logistic Regression

In [None]:
# Replace Target Variable with 0 and 1
# NOTE(review): 'Churn' has already passed through the category-code
# conversion and the KNN imputation above, so these labels may no longer
# occur in the column — confirm whether this step still does anything.
churn_codes = {'No': 0, 'Yes': 1}
data['Churn'] = data['Churn'].replace(churn_codes)

In [None]:
data_logit= pd.DataFrame(data['Churn'])

In [None]:
data_logit.shape

In [None]:
# Continuous predictors used by the logistic regression
cnames = [
    "account.length",
    "number.vmail.messages",
    "total.day.minutes",
    "total.day.calls",
    "total.eve.minutes",
    "total.eve.calls",
    "total.night.minutes",
    "total.night.calls",
    "total.intl.minutes",
    "total.intl.calls",
    "number.customer.service.calls",
]

In [None]:
# Add continuous variable
data_logit = data_logit.join(data[cnames])

In [None]:
# Create dummies for categorical variables
cat_names = ["international.plan", "voice.mail.plan"]

for i in cat_names:
    # drop_first=True keeps k-1 indicator columns per variable. With all k
    # kept, each variable's indicators sum to 1, so the two sets are perfectly
    # collinear and the logit design matrix is singular.
    # dtype=float keeps the design matrix purely numeric (recent pandas
    # returns bool indicator columns by default).
    temp = pd.get_dummies(data[i], prefix=i, drop_first=True, dtype=float)
    data_logit = data_logit.join(temp)

In [None]:
data_logit.shape

In [None]:
# Random row split: each row goes to train with probability 0.8.
# A seeded generator makes the split identical on every run.
rng = np.random.RandomState(42)
sample_index = rng.rand(len(data_logit)) < 0.8

# .copy() makes both parts independent frames, so columns added to `test`
# later do not write into a slice of data_logit (SettingWithCopyWarning).
train = data_logit[sample_index].copy()
test = data_logit[~sample_index].copy()

In [None]:
# Select columns indexes for independent variables
# Skips column 0 ('Churn', the column data_logit was created from) and takes
# the columns that follow, up to position 16; a slice that runs past the last
# column is simply truncated.
train_cols = train.columns[1:17]

In [None]:
train_cols

In [None]:
# Build logistic regression model
# Target is train['Churn']; predictors are the columns in train_cols.
# NOTE(review): no constant column is added before fitting; statsmodels'
# Logit does not add an intercept on its own — confirm whether one is wanted.
import statsmodels.api as sm

logit = sm.Logit(train['Churn'], train[train_cols]).fit()

In [None]:
logit.summary()

In [None]:
# Predict Test Data
# Stores the model's prediction for each test row in 'Actual_prob'.
# NOTE(review): despite the name, this column holds model output, not actual
# values; it is thresholded at 0.5 in a later cell.
test['Actual_prob'] = logit.predict(test[train_cols])

In [None]:
test.head()

In [None]:
# Turn the predicted value into a class label: 0 when it is below 0.5,
# otherwise 1
test['Actualval'] = np.where(test['Actual_prob'] < 0.5, 0, 1)

In [None]:
# Build Confusion Matrix (rows = actual class, columns = predicted class)
CM = pd.crosstab(test['Churn'], test['Actualval'])

# Store TP,TN,FP,FN values
TN = CM.iloc[0,0]
FN = CM.iloc[1,0]
TP = CM.iloc[1,1]
FP = CM.iloc[0,1]

# Report the metrics instead of leaving them commented out; a notebook cell
# only displays its final expression, so each one is printed.
# All values are percentages.
print("Accuracy   : {:.2f}".format(((TN + TP) * 100) / (TN + FN + TP + FP)))
print("FNR        : {:.2f}".format((FN * 100) / (FN + TP)))
print("Recall     : {:.2f}".format((TP * 100) / (TP + FN)))
print("Specificity: {:.2f}".format((TN * 100) / (TN + FP)))

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with 11 neighbours, fitted on the
# oversampled training data.
KNN_model = KNeighborsClassifier(n_neighbors = 11).fit(X_train_over, Y_train_over)

In [None]:
# Predict the test cases
KNN_pred = KNN_model.predict(X_test_over)

In [None]:
# Build Confusion Matrix (rows = actual class, columns = predicted class)
CM = pd.crosstab(Y_test_over, KNN_pred)

# Store TP,TN,FP,FN values
TN = CM.iloc[0,0]
FN = CM.iloc[1,0]
TP = CM.iloc[1,1]
FP = CM.iloc[0,1]

# Report all four metrics (previously only accuracy was shown and the rest
# were commented out). A notebook cell only displays its final expression,
# so each one is printed. All values are percentages.
print("Accuracy   : {:.2f}".format(((TN + TP) * 100) / (TN + FN + TP + FP)))
print("FNR        : {:.2f}".format((FN * 100) / (FN + TP)))
print("Recall     : {:.2f}".format((TP * 100) / (TP + FN)))
print("Specificity: {:.2f}".format((TN * 100) / (TN + FP)))

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Build Naive Bayes model
# Gaussian Naive Bayes with default settings, fitted on the oversampled
# training data.
NB_model = GaussianNB().fit(X_train_over, Y_train_over)

In [None]:
# Predict the test cases
NB_pred = NB_model.predict(X_test_over)

In [None]:
# Build Confusion Matrix (rows = actual class, columns = predicted class)
CM = pd.crosstab(Y_test_over, NB_pred)

# Store TP,TN,FP,FN values
TN = CM.iloc[0,0]
FN = CM.iloc[1,0]
TP = CM.iloc[1,1]
FP = CM.iloc[0,1]

# Report the metrics instead of leaving them commented out; a notebook cell
# only displays its final expression, so each one is printed.
# All values are percentages.
print("Accuracy   : {:.2f}".format(((TN + TP) * 100) / (TN + FN + TP + FP)))
print("FNR        : {:.2f}".format((FN * 100) / (FN + TP)))
print("Recall     : {:.2f}".format((TP * 100) / (TP + FN)))
print("Specificity: {:.2f}".format((TN * 100) / (TN + FP)))