# Telecom Churn Case Study

## Step 1: Importing and Merging Data

In [1]:
# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [None]:
# Importing all datasets
churn_data = pd.read_csv("churn_data.csv")
churn_data.head()

In [None]:
churn_data.shape

In [None]:
customer_data = pd.read_csv("customer_data.csv")
customer_data.head()

In [None]:
customer_data.shape

In [None]:
internet_data= pd.read_csv('internet_data.csv')
internet_data.head()

In [None]:
internet_data.shape

#### If you have analysis the data the first column which is customerID tops 5 rows are sane in all the data set so out next step will merger the data and create single data set

## Combining all data files into one consolidated dataframe

In [None]:
# Merging on 'customerID'
df_1 = pd.merge(churn_data, customer_data, how='inner', on='customerID')

In [None]:
# Final dataframe with all predictor variables
telecom = pd.merge(df_1, internet_data, how='inner', on='customerID')

## Step 2: Inspecting the Dataframe

In [None]:
telecom.head()

In [None]:
telecom.shape

In [None]:
telecom.describe()

## Step 3: Data Preparation

In [None]:
# Let's see the type of each column
telecom.info()

In [None]:
# List of variables to map

varlist =  ['PhoneService', 'PaperlessBilling', 'Churn', 'Partner', 'Dependents']

# Defining the map function
def binary_map(x):
    return x.map({'Yes': 1, "No": 0})

# Applying the function to the housing list
telecom[varlist] = telecom[varlist].apply(binary_map)

In [None]:
telecom.head()

### For categorical variable with multiple levels, create dummy features (on-hot encoded)

In [None]:
# Creating a dummy variable for some of the categorical variables and dropping the first one.
dummy1 = pd.get_dummies(telecom[['Contract', 'PaymentMethod', 'gender', 'InternetService']], drop_first=True)

# Adding the results to the master dataframe
telecom = pd.concat([telecom, dummy1], axis=1)

In [None]:
telecom.head()

In [None]:
# Creating dummy variables for the remaining categorical variables and dropping the level with big names.

# Creating dummy variables for the variable 'MultipleLines'
ml = pd.get_dummies(telecom['MultipleLines'], prefix='MultipleLines')
# Dropping MultipleLines_No phone service column
ml1 = ml.drop(['MultipleLines_No phone service'], 1)
#Adding the results to the master dataframe
telecom = pd.concat([telecom,ml1], axis=1)

# Creating dummy variables for the variable 'OnlineSecurity'.
os = pd.get_dummies(telecom['OnlineSecurity'], prefix='OnlineSecurity')
os1 = os.drop(['OnlineSecurity_No internet service'], 1)
# Adding the results to the master dataframe
telecom = pd.concat([telecom,os1], axis=1)

# Creating dummy variables for the variable 'OnlineBackup'.
ob = pd.get_dummies(telecom['OnlineBackup'], prefix='OnlineBackup')
ob1 = ob.drop(['OnlineBackup_No internet service'], 1)
# Adding the results to the master dataframe
telecom = pd.concat([telecom,ob1], axis=1)

# Creating dummy variables for the variable 'DeviceProtection'. 
dp = pd.get_dummies(telecom['DeviceProtection'], prefix='DeviceProtection')
dp1 = dp.drop(['DeviceProtection_No internet service'], 1)
# Adding the results to the master dataframe
telecom = pd.concat([telecom,dp1], axis=1)

# Creating dummy variables for the variable 'TechSupport'. 
ts = pd.get_dummies(telecom['TechSupport'], prefix='TechSupport')
ts1 = ts.drop(['TechSupport_No internet service'], 1)
# Adding the results to the master dataframe
telecom = pd.concat([telecom,ts1], axis=1)

# Creating dummy variables for the variable 'StreamingTV'.
st =pd.get_dummies(telecom['StreamingTV'], prefix='StreamingTV')
st1 = st.drop(['StreamingTV_No internet service'], 1)
# Adding the results to the master dataframe
telecom = pd.concat([telecom,st1], axis=1)

# Creating dummy variables for the variable 'StreamingMovies'. 
sm = pd.get_dummies(telecom['StreamingMovies'], prefix='StreamingMovies')
sm1 = sm.drop(['StreamingMovies_No internet service'], 1)
# Adding the results to the master dataframe
telecom = pd.concat([telecom,sm1], axis=1)

In [None]:
telecom.info()

In [None]:
telecom.head()

In [None]:
# We have created dummies for the below variables, so we can drop them
telecom = telecom.drop(['Contract','PaymentMethod','gender','MultipleLines','InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies'], 1)

In [None]:
telecom.head()

In [None]:
# Convert the 'TotalCharges' column from string to float
telecom['TotalCharges'] = pd.to_numeric(telecom['TotalCharges'], errors='coerce')

In [None]:
telecom.info()

#### Checking Outliers

In [None]:
# Checking for outliers in the continuous variables
num_telecom = telecom[['tenure','MonthlyCharges','SeniorCitizen','TotalCharges']]

In [None]:
# Checking outliers at 25%, 50%, 75%, 90%, 95% and 99%
num_telecom.describe(percentiles=[.25, .5, .75, .90, .95, .99])

From the distribution shown above, you can see that there no outliers in the dataset. the number should be increasing.

#### Checking for Missing Values and Inputing Them

In [None]:
# Adding the up the missing values
telecom.isnull().sum()

it means that is 11/7043=0.00156183444 i.e 0.1% best is to remove these observation form analysis

In [None]:
# Checking the prcentage of missing value
round(100*(telecom.isnull().sum()/len(telecom.index)),2)

In [None]:
# Removing NaN TotalCharges rows
telecom = telecom[~np.isnan(telecom['TotalCharges'])]

Now we don't have missing value.

## Step 4: Test-Train Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Putting feature variable to X
X = telecom.drop(['Churn','customerID'], axis=1)

X.head()

In [None]:
# Putting response variable to y
y = telecom['Churn']

y.head()

In [None]:
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

## Step 5: Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

X_train[['tenure','MonthlyCharges','TotalCharges']] = scaler.fit_transform(X_train[['tenure','MonthlyCharges','TotalCharges']])

X_train.head()

In [None]:
### Checking the Churn Rate
churn = (sum(telecom['Churn'])/len(telecom['Churn'].index))*100
churn

We have almost 27% churn rate

## Step 6: Looking at Correlations

In [None]:
# Importing matplotlib and seaborn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Let's see the correlation matrix 
plt.figure(figsize = (20,10))        # Size of the figure
sns.heatmap(telecom.corr(),annot = True)
plt.show()

#### Dropping highly correlated dummy variables

In [None]:

X_test = X_test.drop(['MultipleLines_No','OnlineSecurity_No','OnlineBackup_No','DeviceProtection_No','TechSupport_No',
                       'StreamingTV_No','StreamingMovies_No'], 1)
X_train = X_train.drop(['MultipleLines_No','OnlineSecurity_No','OnlineBackup_No','DeviceProtection_No','TechSupport_No',
                         'StreamingTV_No','StreamingMovies_No'], 1)

In [None]:
X_test.columns

#### Checking the Correlation Matrix

After dropping the highly correlated variable now let's check the correlation matrix

In [None]:
plt.figure(figsize=(20,10))
sns.heatmap(X_train.corr(),annot=True)
plt.show()

## Step 7: Model Building

Let's start by splitting our data into a training set and a test set.

### Running Your First Training Model

In [None]:
import statsmodels.api as sm

In [None]:
# Logistic regression model
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

## Step 8: Feature Selection Using RFE

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [None]:
from sklearn.feature_selection import RFE
rfe = RFE(estimator=logreg, n_features_to_select=15)            # running RFE with 15 variables as output
rfe = rfe.fit(X_train, y_train)
      # running RFE with 13 variables as output


In [None]:
rfe.support_

In [None]:
list(zip(X_train.columns,rfe.support_, rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_]
col

In [None]:
X_train.columns[~rfe.support_]

#### Assessing the model with StatsModels

In [None]:
X_train_sm = sm.add_constant(X_train[col])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# Getting the predicted values on the train set
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

#### Creating a dataframe with the actual churn flag and the predicted probabilities

In [None]:
y_train_pred_final = pd.DataFrame({'Churn':y_train.values, 'Churn_Prob':y_train_pred})
y_train_pred_final['CustID'] = y_train.index
y_train_pred_final.head()

#### Creating new column 'predicted with 1 if churn_Prob>0.5 else 0

In [None]:
y_train_pred_final['predicted'] = y_train_pred_final.Churn_Prob.map(lambda x: 1 if x > 0.5 else 0)

# Let's see the head
y_train_pred_final.head()

In [None]:
from sklearn import metrics

In [None]:
# Confusion matrix 
confusion = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.predicted )
print(confusion)

In [None]:
# Predicted     not_churn    churn
# Actual
# not_churn        3270      365
# churn            579       708  

In [None]:
# Let's check the overall accuracy
print(metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.predicted))

#### Checking VIFs

In [None]:
# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
vif = pd.DataFrame()
vif["Feature"] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif["VIF"]=round(vif['VIF'],2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

#### There are a few variables with high VIF. It's best to drop these variables as they aren't helping much with prediction and unnecessarily making the model complex. The variable 'MonthlyCharges' has the highest VIF. So let's start by dropping that.

In [None]:
col


In [None]:
col = col.drop(['MonthlyCharges'],1)

In [None]:
col

In [None]:
# Let's re-un the model using the selected Variable
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(y_train, X_train_sm, famliy = sm.families.Binomial())
res = logm3.fit()
res.summary()

In [None]:
y_train_pred = res.predict(X_train_sm).values.reshape(-1)

In [None]:
y_train_pred[:10]

In [None]:
y_train_pred_final['Churn_Prob'] = y_train_pred

In [None]:
# Creating new column 'predicted' with 1 if Churn_Prob > 0.5 else 0
y_train_pred_final['predicted'] = y_train_pred_final.Churn_Prob.map(lambda x: 1 if x>0.5 else 0)
y_train_pred_final.head()

In [None]:
# Let's Check the overall accuracy
print(metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.predicted))

So overall the accuracy has dropped much

#### Let Check the VIF again

In [None]:
vif = pd.DataFrame()
vif["Features"] = X_train[col].columns
vif["VIF"] = [variance_inflation_factor(X_train[col].values, 1) for i in range(X_train[col].shape[1])]
vif["VIF"] = round(vif["VIF"],2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

All variables have a good value of VIF. So we need not drop any more variables and we can proceed with making predictions using this model only

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Churn,y_train_pred_final.predicted)
confusion

In [None]:
# Actual/Predicted     not_churn    churn
        # not_churn        3294     341
        # churn            627       660  

In [None]:
# Let's check the overall accuracy
metrics.accuracy_score(y_train_pred_final.Churn,y_train_pred_final.predicted)

## Metrics beyond simply Accuracy

In [None]:
TP = confusion[1,1] # true positive
TN = confusion[0,0] # true negative
FP = confusion[0,1] # false positive
FN = confusion[1,0]

In [None]:
# Let's see the sentivity of logisitic regression model.
TP/float(TP+FN)

In [None]:
# Let's us calculate specificity
TN/ float(TN+FP)

In [None]:
# Calculate the fale positve rate - predicting churn when customer does not have churned
print(FP/ float(TN+FP))

In [None]:
# Positive predictive value
print(TP/float(TP+FP))

In [None]:
# Negative predicitve value
print(TN/ float(TN+FN))

## Step 9: Plotting yhe ROC Curve

An ROC Curve demonstrates several things:
- It shows the tradeoff between sentivity and specificity (any increase in