In [None]:
# Importing required packages.

import numpy as np                  # Mathetimatical Operations
import pandas as pd                 # Data manipulation
import seaborn as sns
import matplotlib.pyplot as plt     # Used for plotting graphs.
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn import preprocessing

In [None]:
# Lift pandas' display limits so neither columns nor rows are truncated
# when a frame is rendered.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [None]:
# Load the churn dataset into `df` and preview the first five rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook
# only runs where this exact file exists.
csv_path = 'F:/LBSIM/FRP/Dataset/Churn Prediction in Telecom Industry/430-AmitGupta.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Report how many columns there are of each dtype before encoding.
print("------  Data type Count  ----- \n",df.dtypes.value_counts())

# Collect the bool/object columns and replace each with integer codes.
column_dtypes = dict(df.dtypes)
cate = [name for name, dtype in column_dtypes.items() if dtype in ['bool', 'object']]
le = preprocessing.LabelEncoder()
for column in cate:
    # The single encoder is re-fitted per column, so afterwards `le` only
    # remembers the classes of the last column processed.
    df[column] = le.fit(df[column]).transform(df[column])

# Dtype counts again, now that the categorical columns are numeric.
df.dtypes.value_counts()

In [None]:
# Univariate overview of the dataset and of the churn target.
print('Total number of observations in the dataset are:', df.shape[0])  # shape[0] is the number of rows
df.info()  # column names, non-null counts and dtypes

# Bar chart of how many rows fall in each churn class.
# Series.value_counts is used instead of the deprecated top-level pd.value_counts.
count_classes = df['churn'].value_counts(sort=True)
count_classes.plot(kind='bar')
plt.title("Churn Rate")
plt.xticks(range(2))
plt.xlabel("Class")
plt.ylabel("Frequency");

# Churn variable analysis.
# Assumes the encoded labels are 0 = did not churn and 1 = churned, as the
# printed messages below state -- TODO confirm against the raw data.
not_churned_count = count_classes[0]
churned_count = count_classes[1]
print('Count of number of customers who didnt churn:', not_churned_count)
print('Count of number of customers who churned:', churned_count)

# Churned customers as a percentage of non-churned customers. The previous
# code divided the other way round and never scaled by 100, so the figure
# printed with a '%' sign was really the not-churned/churned ratio.
percent_churn = "{0:.2f}".format(churned_count / not_churned_count * 100)
print('As per the data, the percentage of people who have churned in proportion to the people who didnt churn is ',percent_churn,'%')

# Separate copy used for the correlation matrix: 'state' and 'churn' are
# removed from the copy only, `df` itself is left untouched.
df_temp = df.copy()
df_temp.info()
df_temp = df_temp.drop(columns=["state", "churn"])

In [None]:
# Pairwise correlation between the features, drawn as an annotated heatmap.
corr = df_temp.corr()
f, ax = plt.subplots(figsize=(12, 10))
hm = sns.heatmap(
    corr.round(2),
    annot=True,
    fmt='.2f',
    cmap="Reds",
    linewidths=.05,
    ax=ax,
)
t = f.suptitle('Variable Correlation Heatmap', fontsize=14)
f.subplots_adjust(top=0.93)  # leave head-room for the suptitle
# Insights noted by the author -- highly correlated pairs:
# 1. total_day_minutes and total_day_charge - 1.0
# 2. total_eve_minutes and total_eve_charge - 1.0
# 3. total_night_minutes and total_night_charge - 1.0
# 4. total_intl_minutes and total_intl_charge - 1.0
# 5. voice_mail_plan_yes and number_vmail_messages - 0.96

In [None]:
# Rank the features by importance with an extra-trees ensemble.
from sklearn.ensemble import ExtraTreesClassifier

# Independent columns: positions 1..19 (position 0 is skipped). The target is
# dropped by name as a safeguard: if the frame has only 20 columns the slice
# would otherwise include 'churn' and leak the answer into the features.
X = df.iloc[:, 1:20].drop(columns='churn', errors='ignore')
y = df['churn']  # target column

# Fixed seed so the importance ranking is reproducible between runs
# (42 matches the seed used for the train/test split).
model = ExtraTreesClassifier(random_state=42)
model.fit(X, y)
print(model.feature_importances_)  # built-in attribute of tree-based classifiers

# Horizontal bar chart of the importances for easier reading.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(19).plot(kind='barh')
plt.show()
# Feature importance gives a score per feature: the higher the score, the more
# relevant the feature is to the output variable.
# Combined with the correlation matrix, the less important member of each
# highly correlated pair can be removed.

In [None]:
# Removing number_vmail_messages, total_day_charge, total_eve_charge,
# total_night_charge and total_intl_charge from the working frame.
# (The earlier comment listed the *_minutes columns, but the *_charge columns
# are the ones actually dropped; the *_minutes columns are still used later.)
redundant_columns = ["number_vmail_messages","total_day_charge","total_eve_charge","total_night_charge","total_intl_charge"]
columns_to_drop = [name for name in redundant_columns if name in df.columns]
if columns_to_drop:
    # Snapshot and drop only while there is something left to remove, so that
    # re-running this cell neither raises KeyError nor overwrites the
    # untouched copy with an already-reduced frame.
    df_original = df.copy()
    df_original.info()
    df = df.drop(columns=columns_to_drop)
df.info()

In [None]:
# Null value detection
def missing_data(df):
    """Summarise the null values of every column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, ordered by descending null count, with
        two columns: 'Total' (number of nulls) and 'Percent' (nulls as a
        percentage of the row count, 0-100).
    """
    total = df.isnull().sum().sort_values(ascending=False)
    row_count = len(df)
    if row_count:
        # Scale to 0-100 so the values match the 'Percent' column label.
        percent = total / row_count * 100
    else:
        # An empty frame has no nulls; avoid a 0/0 division producing NaN.
        percent = total.astype(float)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

# Show the null report explicitly: a bare expression that is not the last
# line of a cell is evaluated and then thrown away. `display` is the helper
# IPython makes available in every notebook cell.
# Author's observation: no missing values in the original data.
# NOTE(review): the earlier remark about nulls in "binned variables" cannot
# refer to `df` -- no binned columns are added to this frame in this cell or
# above; verify which frame was meant.
display(missing_data(df))
df.columns

In [None]:
# Univariate analysis for numeric variables.
# Summary statistics, shown in three column slices. Each table is passed to
# `display` (provided by IPython in notebook cells) because only the last
# bare expression of a cell is rendered; previously these were discarded.
for first_col, last_col in [(1, 6), (6, 11), (11, 16)]:
    display(df.iloc[:, first_col:last_col].describe())

# Univariate analysis for categorical variables: record counts per value.
for categorical_column in ['state', 'international_plan_yes', 'voice_mail_plan_yes']:
    display(df[categorical_column].value_counts())

# Histogram (no density curve) of international minutes.
# NOTE(review): seaborn documents distplot as deprecated in favour of
# histplot; kept here so the notebook still runs on older seaborn releases.
sns.distplot(df['total_intl_minutes'], kde=False)

In [None]:
#  Feature Engineering
# Work on a copy so the binned helper columns never reach the modelling frame.
df_bivariate = df.copy()
df_bivariate.info()

# Interval creation: (source column, bin edges, labels). Each source column
# gets a companion '<column>_binned' categorical column.
# The call-count bins now start at 0 instead of 1, and include_lowest=True is
# passed to pd.cut, so the smallest values (0 or 1 calls, 0.0 minutes) land
# in the first "Upto ..." bucket. pd.cut intervals are right-closed, so those
# rows previously fell outside every bin and became NaN.
binning_plan = [
    ('total_day_calls', [0,30,45,60,75,90,105,120,135,150,165],
     ['Upto 30', '30-45','45-60','60-75','75-90','90-105','105-120','120-135','135-150','150-165']),
    ('total_eve_calls', [0,30,45,60,75,90,105,120,135,150,170],
     ['Upto 30', '30-45','45-60','60-75','75-90','90-105','105-120','120-135','135-150','150-170']),
    ('total_night_calls', [0,45,60,75,90,105,120,135,150,165,180],
     ['Upto 45','45-60','60-75','75-90','90-105','105-120','120-135','135-150','150-165','165-180']),
    ('total_day_minutes', [0,50,100,150,200,250,300,351],
     ['Upto 50', '50-100', '100-150','150-200','200-250','250-300','300-350']),
    ('total_night_minutes', [0,50,100,150,200,250,300,350,400],
     ['Upto 50', '50-100', '100-150','150-200','200-250','250-300','300-350','350-400']),
    ('total_eve_minutes', [0,50,100,150,200,250,300,350,400],
     ['Upto 50', '50-100', '100-150','150-200','200-250','250-300','300-350','350-400']),
    ('total_intl_minutes', [0,5,10,15,20],
     ["Upto 5", '5-10', '10-15','15-20']),
]

for source_column, bin_edges, bin_labels in binning_plan:
    binned_column = source_column + '_binned'
    df_bivariate[binned_column] = pd.cut(
        x=df_bivariate[source_column],
        bins=bin_edges,
        labels=bin_labels,
        include_lowest=True,
    )
    # Show the bucket sizes; a bare value_counts() inside a cell is discarded.
    # `display` is provided by IPython in notebook cells.
    display(df_bivariate[binned_column].value_counts())

In [None]:
# Bivariate Analysis
# Cross tabs of each feature against the churn variable.
churn = df_bivariate["churn"]

# No 'total_intl_calls_binned' column is created by the binning cell, so fall
# back to the raw call counts unless a binned version has been added.
if "total_intl_calls_binned" in df_bivariate.columns:
    intl_calls_column = "total_intl_calls_binned"
else:
    intl_calls_column = "total_intl_calls"

crosstab_columns = [
    "total_day_calls_binned",
    "total_eve_calls_binned",
    "total_night_calls_binned",
    intl_calls_column,
    "total_day_minutes_binned",
    "total_eve_minutes_binned",
    "total_night_minutes_binned",
    "total_intl_minutes_binned",
    "account_length",
    "area_code",
    "international_plan_yes",
    "voice_mail_plan_yes",
    "number_vmail_messages",
    "customer_service_calls",
]

for column in crosstab_columns:
    # number_vmail_messages was dropped from `df` before df_bivariate was
    # copied from it; read such columns from the untouched df_original copy
    # instead of raising KeyError.
    source_frame = df_bivariate if column in df_bivariate.columns else df_original
    # `display` (provided by IPython) renders every table; bare expressions
    # that are not the last line of a cell are silently discarded.
    display(pd.crosstab(source_frame[column], churn))

In [None]:
# Model building: separate the response variable from the feature variables.
temp2 = ['churn']            # name(s) of the response column
y = df['churn']              # response
X = df.drop(columns=temp2)   # every remaining column is a feature
X.info()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.feature_selection import RFE  # Recursive Feature Elimination
import statsmodels.api as sm

logreg = LogisticRegression()

# Recursive Feature Elimination (taking all features and then reducing).
# n_features_to_select is passed by keyword: recent scikit-learn releases
# reject it as a positional argument.
rfe = RFE(logreg, n_features_to_select=15)
rfe = rfe.fit(X, y)
print(rfe.support_)   # boolean mask of the selected features
print(rfe.ranking_)   # rank 1 = selected

# Baseline statsmodels logit on the full feature set.
# NOTE(review): no constant column is added to X, so the model is fitted
# without an intercept -- confirm this is intended.
logit_model = sm.Logit(y, X)
result = logit_model.fit()
print(result.summary2())

# Backward elimination: remove one variable at a time, in this order, and
# refit after each removal so the summaries can be compared step by step.
# (Presumably the order was chosen from the printed summaries -- verify.)
elimination_order = ['state', 'account_length', 'total_day_calls',
                     'total_eve_calls', 'total_night_calls', 'total_night_minutes']
for variable in elimination_order:
    if variable not in X.columns:
        # X is reassigned by this loop; on a re-run the variable is already
        # gone, so skip it rather than stop the notebook with KeyError.
        print('Skipping', variable, '- not present in X')
        continue
    cols = [variable]
    X = X.drop(cols, axis=1)
    logit_model = sm.Logit(y, X)
    result = logit_model.fit()
    print(result.summary2())

In [None]:
# Recursive Feature Elimination (taking top 10 features and then reducing).
# The feature count was missing from the call; 10 is used to match the
# description above, and is passed by keyword because recent scikit-learn
# releases reject it as a positional argument.
import statsmodels.api as sm

rfe = RFE(logreg, n_features_to_select=10)
rfe = rfe.fit(X, y)
print(rfe.support_)
print(rfe.ranking_)
print(X.columns)  # column names, to read alongside the mask and ranking above

# NOTE(review): X is shared with, and already reduced by, the previous
# elimination cell. Columns removed there are skipped here instead of raising
# KeyError; confirm which feature set is meant to reach the models.
cols = ['international_plan_yes', 'voice_mail_plan_yes','total_intl_minutes','total_intl_calls','customer_service_calls']
X = X.drop([name for name in cols if name in X.columns], axis=1)
logit_model = sm.Logit(y, X)
result = logit_model.fit()
print(result.summary2())

# Backward elimination: remove one variable at a time, in this order, and
# refit after each removal.
elimination_order = ['state', 'account_length', 'total_day_calls',
                     'total_night_minutes', 'total_eve_calls', 'total_night_calls']
for variable in elimination_order:
    if variable not in X.columns:
        print('Skipping', variable, '- not present in X')
        continue
    cols = [variable]
    X = X.drop(cols, axis=1)
    logit_model = sm.Logit(y, X)
    result = logit_model.fit()
    print(result.summary2())

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Feature scaling: the scaler learns mean/variance from the training rows
# only and is then applied to both partitions.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Logistic Regression: fit on the training split and evaluate on the test split.
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

# Stored under its own name so the imported confusion_matrix function is not
# overwritten by its result.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

print(classification_report(y_test, y_pred))

# ROC curve for Logistic Regression.
# Both the curve and the area are computed from the predicted probability of
# the positive class. The area used to be computed from hard 0/1 predictions,
# so the value in the legend did not describe the curve that was drawn.
churn_probability = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, churn_probability)
fpr, tpr, thresholds = roc_curve(y_test, churn_probability)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Churn Rate Prediction')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')  # written to the current working directory
plt.show()

In [None]:
# Support Vector Classifier with default hyper-parameters: train, predict on
# the held-out rows and print the per-class report.
svc = SVC().fit(X_train, y_train)
pred_svc = svc.predict(X_test)
svc_report = classification_report(y_test, pred_svc)
print(svc_report)

In [None]:
# XGBoost classifier with default hyper-parameters.
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import XGBRegressor

model = XGBClassifier()
model.fit(X_train, y_train)

# Predict the held-out rows and round each prediction.
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

# Share of test rows predicted correctly, printed as a percentage.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# LightGBM classifier with default hyper-parameters.
import lightgbm as lgb
# Imported here so this cell no longer depends on the XGBoost cell having been
# run first (that cell was the only place accuracy_score was imported).
from sklearn.metrics import accuracy_score

model = lgb.LGBMClassifier()
model.fit(X_train, y_train)

# Make predictions for the test data and round each one.
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# Share of test rows predicted correctly, printed as a percentage.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))