# Importing Required Libraries

In [None]:
# Importing helpful packages to load in

import numpy as np   # linear algebra
import pandas as pd   # data processing, CSV file I/O

#For plotting
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
# (`%matplotlib inline` is already issued above, so it is not repeated here.)
InteractiveShell.ast_node_interactivity = "all"

# Widen pandas' display limits so the wide churn frame is not truncated
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 300)

# Utility function: line separator
def print_dashes_and_ln():
    """Print a rule of 100 dashes followed by an empty line."""
    rule = '-' * 100
    print(rule, '\n')
    
# Formatter to display all float format in 2 decimal format
pd.options.display.float_format = '{:.2f}'.format

# Loading data from csv file

In [None]:
# loading the telecom churn data from 'telecom_churn_data.csv' into `rawchurn`

rawchurn = pd.read_csv('telecom_churn_data.csv' , sep = ',',engine = 'python')

# 1. Data Understanding

In [None]:
# Snapshot of Chrun Data
print(rawchurn.head(10)); print_dashes_and_ln();
print(rawchurn.shape); print_dashes_and_ln();
print('This dataset has ' + str(rawchurn.shape[0]) + ' rows, and ' + str(rawchurn.shape[1]) + ' columns'); print_dashes_and_ln();

In [None]:
rawchurn.info(verbose = 1)

In [None]:
rawchurn.describe(include = 'all')

# 2. Data Manipulation

To understand the data better, we first group the columns into ID columns, date columns, categorical columns and numeric columns.

In [None]:
id_columns = ['mobile_number', 'circle_id']

date_columns = [
             'date_of_last_rech_data_6',
             'date_of_last_rech_data_7',
             'date_of_last_rech_data_8',
             'date_of_last_rech_data_9',
             'date_of_last_rech_6',
             'date_of_last_rech_7',
             'date_of_last_rech_8',
             'date_of_last_rech_9',
             'last_date_of_month_6',
             'last_date_of_month_7',
             'last_date_of_month_8',
             'last_date_of_month_9'  
              ]

category_columns = [
             'fb_user_6',
             'fb_user_7',
             'fb_user_8',
             'fb_user_9',
             'night_pck_user_6',
             'night_pck_user_7',
             'night_pck_user_8',
             'night_pck_user_9'             
                  ]

numeric_columns = [column for column in rawchurn.columns if column not in id_columns + date_columns + category_columns]


In [None]:
# print the number of columns in each list
print(("id_Columns:{}").format(len(id_columns))); print_dashes_and_ln();
print(("date_columns:{}").format(len(date_columns))); print_dashes_and_ln();
print(("category_columns:{}").format(len(category_columns))); print_dashes_and_ln();
print(("numeric_columns:{}").format(len(numeric_columns))); print_dashes_and_ln();

In [None]:
print(("Total no. of Columns:{}").format(len(id_columns) +len(date_columns) + len(category_columns) + len(numeric_columns)))

We have now classified all the variables present in the churn dataset. Next, we will check for missing values.

In [None]:
# There are 99999 rows and 226 columns. Many columns contain NaN. Let's identify those columns and get rid of them
rawchurn.isna().sum()

In [None]:
# We will first check all the recharge columns
recharge_columns = [
                  'av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8', 'av_rech_amt_data_9',
                  'max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8', 'max_rech_data_9',
                  'total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8', 'total_rech_data_9'
                 ]

rawchurn[recharge_columns].describe(include='all')

In [None]:
# We can see that the minimum value of most columns is zero, and in some cases it is one, so we can replace the missing values with zero. A missing value means that the user has not recharged at all.

rawchurn[recharge_columns] = rawchurn[recharge_columns].apply(lambda x: x.fillna(0))


In [None]:
# Checking there is no missing values present
rawchurn[recharge_columns].isna().sum()

In [None]:
# Now we will drop date and id columns
print("Shape before dropping: ", rawchurn.shape); print_dashes_and_ln();
rawchurn = rawchurn.drop(id_columns + date_columns, axis=1)
print("Shape after dropping: ", rawchurn.shape); print_dashes_and_ln();

In [None]:
# For categorical columns, we will replace missing values with some thing and we will call this as new category. Lets choose -1.
rawchurn[category_columns] = rawchurn[category_columns].apply(lambda x: x.fillna(-1))

In [None]:
# Checking there is no missing values present
rawchurn[category_columns].isna().sum()

In [None]:
# We will now drop variables which has more than 70% missing data

initial_columns = rawchurn.shape[1]

THRESHOLD = 0.7

include_columns = list(rawchurn.apply(lambda column: True if column.isnull().sum()/rawchurn.shape[0] < THRESHOLD else False))

drop_missing = pd.DataFrame({'features':rawchurn.columns , 'include': include_columns})
drop_missing.loc[drop_missing.include == True,:]

In [None]:
# dropping the columns now: keep only the columns flagged True in `include_columns`
rawchurn = rawchurn.loc[:, include_columns]

# columns removed = column count before the filter minus column count after it
# (the subtraction was previously reversed, which reported a negative count)
dropped_cols = initial_columns - rawchurn.shape[1]
print("columns dropped.{0}".format(dropped_cols)); print_dashes_and_ln();

In [None]:
rawchurn.isna().sum()

In [None]:
# Filling all NA's with Zero for Now
rawchurn.fillna(0, inplace=True)
sum(rawchurn.isnull().sum()>0)

# 3. Data Preparation

In [None]:
# Function to Return Monthwise Columns List. Returns arrays of columns belonging to 6,7,8,9 month separately.
# Also returns an array of columns that are not month specific as common columns.

def returnColumnsByMonth(df):
    """Partition the column names of `df` by month.

    A column is assigned to the first month (6, 7, 8, then 9) whose marker
    appears anywhere in its name: "_6"/"jun_", "_7"/"jul_", "_8"/"aug_",
    "_9"/"sep_". Columns matching no marker are collected as common columns.

    Returns:
        Five lists, in order: month-6, month-7, month-8, month-9 and common
        column names, each preserving the column order of `df`.
    """
    month_markers = (("_6", "jun_"), ("_7", "jul_"), ("_8", "aug_"), ("_9", "sep_"))
    monthly = [[] for _ in month_markers]
    shared = []
    for name in df.columns:
        for bucket, (suffix, prefix) in zip(monthly, month_markers):
            if suffix in name or prefix in name:
                bucket.append(name)
                break
        else:
            # no month marker matched
            shared.append(name)
    return monthly[0], monthly[1], monthly[2], monthly[3], shared

In [None]:
# Get Columns Monthwise & Basic Understanding of Columns
column_Month_6, column_Month_7, column_Month_8, column_Month_9, column_Common = returnColumnsByMonth(rawchurn)

print("Month 6 Columns Count ==> {}".format(len(column_Month_6)))
print("Month 7 Columns Count ==> {}".format(len(column_Month_7)))
print("Month 8 Columns Count ==> {}".format(len(column_Month_8)))
print("Month 9 Columns Count ==> {}".format(len(column_Month_9)))
print("Common Columns Count ==> {}".format(len(column_Common)))

In [None]:
# All Months are having same type of columns So lets see the columns in general
print ("\nMonth based Columns:\n \t\t==> {}".format(np.array(column_Month_6)))
print ("\nCommon Columns:\n \t\t==> {}".format(np.array(column_Common)))

In [None]:
# Derive Columns Total_Recharge_Amount from 6th and 7th Month total_rech_amt
rawchurn['Total_Recharge_Amount'] = rawchurn['total_rech_amt_6'] + rawchurn['total_rech_amt_7']

# As per Upgrad guideline, we have to look at 70th percentile of the average recharge amount in the first two months (the good phase)
# Here the two-month sum is used rather than the average; the sum is exactly twice the average,
# so customers are ranked identically.
# Read the percentile with quantile() instead of a positional index into describe(),
# whose row order is an implementation detail.
recharge_70th_percentile = rawchurn['Total_Recharge_Amount'].quantile(0.7)
print(rawchurn['Total_Recharge_Amount'].describe(percentiles = [0.7])); print_dashes_and_ln();
print("\n70% of Total Recharge Amount of first 2 months are {}".format(recharge_70th_percentile)); print_dashes_and_ln();

In [None]:
# Filter High Value Customers: keep rows whose two-month recharge total is strictly above the 70th percentile
# The threshold is derived from the data instead of being hard-coded, so the cell stays
# correct if the input file changes.
high_value_threshold = rawchurn['Total_Recharge_Amount'].quantile(0.7)
rawchurn = rawchurn[rawchurn['Total_Recharge_Amount'] > high_value_threshold].reset_index(drop=True)
print("\nTotal High Value Customer Count ==> {}".format(rawchurn.shape)); print_dashes_and_ln();
# the helper column is only needed for the filter
rawchurn = rawchurn.drop(columns=['Total_Recharge_Amount'])

In [None]:
# Tag churners and remove attributes of the churn phase
# calculate total incoming and outgoing minutes of usage
rawchurn['total_calls_mou_9'] = rawchurn.total_ic_mou_9 + rawchurn.total_og_mou_9
rawchurn['total_internet_usage_9'] =  rawchurn.vol_2g_mb_9 + rawchurn.vol_3g_mb_9

In [None]:
# tag the churned customers (churn=1, else 0)
# A customer is tagged as churned when both the month-9 call minutes and the month-9
# internet volume are exactly zero. Vectorised comparison instead of a row-wise apply.
no_calls_in_month_9 = rawchurn['total_calls_mou_9'] == 0
no_internet_in_month_9 = rawchurn['total_internet_usage_9'] == 0
rawchurn['churn'] = (no_calls_in_month_9 & no_internet_in_month_9).astype('int64')

In [None]:
# delete derived variables
rawchurn = rawchurn.drop(['total_calls_mou_9', 'total_internet_usage_9'], axis=1)

In [None]:
# change data type to category
rawchurn.churn = rawchurn.churn.astype("category")

# print churn ratio
print("Churn Ratio:"); print_dashes_and_ln();
print(rawchurn.churn.value_counts()*100/rawchurn.shape[0]); print_dashes_and_ln();

# Churn is 8.64%, which indicates an imbalanced dataset.

In [None]:
# Remove columns with '9'
# The regex keeps only the columns whose name does NOT end in the character '9'.
# NOTE(review): `sep_vbc_3g` does not end in '9', so it survives this filter and is
# still read by a later plotting cell. It looks like a September (churn-phase)
# attribute — confirm, and if so drop it before modelling to avoid target leakage.
rawchurn = rawchurn.filter(regex='[^9]$', axis=1)
rawchurn.shape

In [None]:
# update numeric_columns and category_columns list
# Keep only the categorical columns that still exist in the frame. Looking for names
# ending in '9' at this point finds nothing, because those columns were already removed,
# so the stale month-9 names were never filtered out.
category_columns = [col for col in category_columns if col in rawchurn.columns]
category_columns.append('churn')
numeric_columns = [col for col in rawchurn.columns if col not in category_columns]

In [None]:
# Lets look also at difference between the 8th month and the previous months

# For each metric: <metric>_diff = month-8 value minus the mean of the month-6 and month-7 values
diff_metrics = [
    'arpu', 'onnet_mou', 'offnet_mou', 'roam_ic_mou', 'roam_og_mou',
    'loc_og_mou', 'std_og_mou', 'isd_og_mou', 'spl_og_mou', 'total_og_mou',
    'loc_ic_mou', 'std_ic_mou', 'isd_ic_mou', 'spl_ic_mou', 'total_ic_mou',
    'total_rech_num', 'total_rech_amt', 'max_rech_amt',
    'total_rech_data', 'max_rech_data', 'av_rech_amt_data',
    'vol_2g_mb', 'vol_3g_mb',
]

for metric in diff_metrics:
    good_phase_mean = (rawchurn[metric + '_6'] + rawchurn[metric + '_7'])/2
    rawchurn[metric + '_diff'] = rawchurn[metric + '_8'] - good_phase_mean


# Visualise Data

In [None]:
# change columns types
category_columns1 = [
             'fb_user_6',
             'fb_user_7',
             'fb_user_8',
             'night_pck_user_6',
             'night_pck_user_7',
             'night_pck_user_8',             
                  ]
rawchurn[numeric_columns] = rawchurn[numeric_columns].apply(pd.to_numeric)
rawchurn[category_columns1] = rawchurn[category_columns1].apply(lambda column: column.astype("category"), axis=0)

In [None]:
# create plotting functions
def data_type(variable):
    """Classify a series by its dtype.

    Returns 'numerical' for int64/float64 data, 'categorical' for the pandas
    'category' dtype, and None for anything else.
    """
    kind = variable.dtype
    if any(kind == numeric for numeric in (np.int64, np.float64)):
        return 'numerical'
    if kind == 'category':
        return 'categorical'
    return None
    
def univariate(variable, stats=True):
    """Plot the distribution of a single variable.

    Numerical variables get a distribution plot; categorical variables get a
    count plot. Anything else prints an error message and plots nothing.

    Args:
        variable: series to plot; its kind is decided by `data_type`.
        stats: when truthy, also print `describe()` (numerical) or
            `value_counts()` (categorical).
    """
    kind = data_type(variable)

    if kind == 'numerical':
        # NOTE(review): distplot is deprecated in recent seaborn releases; kept so the
        # notebook still runs on older seaborn versions.
        sns.distplot(variable)
        if stats:
            print(variable.describe())

    elif kind == 'categorical':
        # Pass the series by keyword: seaborn >= 0.12 binds the first positional
        # argument to `data`, not `x`.
        sns.countplot(x=variable)
        if stats:
            print(variable.value_counts())

    else:
        print("Invalid variable passed: either pass a numeric variable or a categorical vairable.")
        
def bivariate(var1, var2):
    """Plot the relationship between two variables.

    Two numerical variables get a regression plot; one categorical and one
    numerical variable (in either order) get a box plot. Any other
    combination plots nothing.

    Args:
        var1: series drawn on the x axis.
        var2: series drawn on the y axis.
    """
    kinds = (data_type(var1), data_type(var2))
    # x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
    # so positional x, y raises a TypeError there.
    if kinds == ('numerical', 'numerical'):
        sns.regplot(x=var1, y=var2)
    elif kinds in (('categorical', 'numerical'), ('numerical', 'categorical')):
        sns.boxplot(x=var1, y=var2)

# EDA For all the features

In [None]:
univariate(rawchurn.arpu_6)

In [None]:
univariate(rawchurn.loc_og_t2o_mou)

In [None]:
univariate(rawchurn.std_og_t2o_mou)

In [None]:
univariate(rawchurn.onnet_mou_8)

In [None]:
# Bivariate EDA
bivariate(rawchurn.churn, rawchurn.aon)

In [None]:
bivariate(rawchurn.sep_vbc_3g, rawchurn.churn)

In [None]:
bivariate(rawchurn.spl_og_mou_8, rawchurn.churn)

In [None]:
pd.crosstab(rawchurn.churn, rawchurn.night_pck_user_8, normalize='columns')*100

In [None]:
pd.crosstab(rawchurn.churn, rawchurn.sachet_3g_8)

In [None]:
X1 = rawchurn.groupby('churn')['aon'].agg(['mean']).reset_index()
p = sns.barplot(x='churn', y='mean', data=X1)
p.set_xticklabels(['Not-Churn', 'Churn'],rotation=30)
p.set_ylabel('Average Age in Network')
plt.title('Average Age in Network between Churn and Not-Churn subscriber')
plt.show()

Churned subscribers have a lower average AON (age on network) than non-churned subscribers. Hence, subscribers with a high AON are less likely to churn.

In [None]:
# Capping the outliers with k-sigma technique
def cap_outliers(array, k=3):
    """Clip values to the range mean ± k standard deviations.

    Args:
        array: pandas Series or NumPy array of numeric values.
        k: number of standard deviations on each side of the mean that is
            kept; values beyond it are replaced by the nearest limit.

    Returns:
        A new object of the same kind with the capped values. The input is
        left untouched (the previous version overwrote it in place).
    """
    center = array.mean()
    spread = array.std()
    lower_limit = center - k*spread
    upper_limit = center + k*spread
    # clip() returns a copy, so the caller's data is never mutated
    return array.clip(lower_limit, upper_limit)
rawchurn[numeric_columns] = rawchurn[numeric_columns].apply(cap_outliers, axis=0)

Correlation matrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

plt.figure(figsize = (40,20))      
sns.heatmap(rawchurn.corr(),annot = True)

 Due to a large number of variables, we cannot visualize the correlation matrix properly. We will address this after PCA.

# Data Modelling


# # Data standardization and preparation

In [None]:
# Importing important libraries

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn import metrics

In [None]:
# Converting Churn datatype into numeric
rawchurn['churn'] = pd.to_numeric(rawchurn['churn'])

In [None]:
# divide data into train and test
X = rawchurn.drop("churn", axis = 1)
y = rawchurn.churn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 100, stratify = y)

In [None]:
# print no.of features and shapes of train and test sets
print("Number of Features ==> {}".format(len(X.columns))); print_dashes_and_ln();
X_train.shape
y_train.shape
X_test.shape
y_test.shape

In [None]:
train = pd.concat([X_train, y_train], axis=1)

# aggregate the categorical variables
train.groupby('night_pck_user_6').churn.mean()
train.groupby('night_pck_user_7').churn.mean()
train.groupby('night_pck_user_8').churn.mean()
train.groupby('fb_user_6').churn.mean()
train.groupby('fb_user_7').churn.mean()
train.groupby('fb_user_8').churn.mean()

In [None]:
# replace categories with aggregated values in each categorical column
# Each level is replaced by the mean churn rate of that level in the TRAINING split.
# The encoding is computed here rather than pasted in from an earlier run, so it always
# matches the current data; the test split reuses the training encoding.
overall_churn_rate = y_train.mean()
for column in ['night_pck_user_6', 'night_pck_user_7', 'night_pck_user_8',
               'fb_user_6', 'fb_user_7', 'fb_user_8']:
    train_levels = X_train[column].astype(float)
    test_levels = X_test[column].astype(float)
    churn_rate_by_level = y_train.groupby(train_levels).mean()
    X_train[column] = train_levels.map(churn_rate_by_level)
    # a level never seen in training falls back to the overall training churn rate
    X_test[column] = test_levels.map(churn_rate_by_level).fillna(overall_churn_rate)

In [None]:
# checking data type of categorical columns
X_train[[col for col in category_columns1 if col not in ['churn']]].info()

# Using PCA as dimensionality reduction technique which is mentioned in Upgrad problem statement

In [None]:
pca = Pipeline([('scaler', StandardScaler()), ('pca', PCA())])

In [None]:
# fit_transform() already fits the scaler + PCA pipeline, so a separate fit() call
# beforehand only repeats the same work
churn_pca = pca.fit_transform(X_train)

In [None]:
# extract pca model from pipeline
pca = pca.named_steps['pca']

# look at explainded variance of PCA components
print(pd.Series(np.round(pca.explained_variance_ratio_.cumsum(), 4)*100)); print_dashes_and_ln();

In [None]:
# plot feature variance
features = range(pca.n_components_)
cumulative_variance = np.round(np.cumsum(pca.explained_variance_ratio_)*100, decimals=4)
plt.figure(figsize=(175/20,100/20)) # 100 elements on y-axis; 175 elements on x-axis; 20 is normalising factor
plt.plot(cumulative_variance)

# As we can see from above graph and table that 60 variables explain 90% variance and 80 variables explain 95% variance

In [None]:
# PCA and Logistic Regression
# create pipeline
PCA_VARS = 60
steps = [('scaler', StandardScaler()),
         ("pca", PCA(n_components=PCA_VARS)),
         ("logistic", LogisticRegression(class_weight='balanced'))
        ]
pipeline = Pipeline(steps)

In [None]:
# fit model
pipeline.fit(X_train, y_train)

# check score on train data
pipeline.score(X_train, y_train)

In [None]:
# Checking on Test Data
y_pred = pipeline.predict(X_test)

# create confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
print(cm); print_dashes_and_ln();

print("Accuracy Score ==> {}".format(round(accuracy_score(y_test,y_pred),2))); print_dashes_and_ln();
print("AUC Score ==> {}".format(round(roc_auc_score(y_test,y_pred),2))); print_dashes_and_ln();

# scikit-learn lays out a binary confusion matrix as [[TN, FP], [FN, TP]] with churn (1)
# as the positive class; the names now follow that layout (printed values are unchanged)
TN, FP, FN, TP = cm.ravel()
print("Not-Churn Accuracy Rate:(Specificity) ==> {}".format(round(TN/(TN+FP),3))); print_dashes_and_ln();
print("Churn Accuracy Rate:(Sensitivity) ==> {}".format(round(TP/(TP+FN),3))); print_dashes_and_ln();

# check area under curve, using the predicted probability of the churn class
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
print("AUC Area: ", round(roc_auc_score(y_test, y_pred_prob),3)); print_dashes_and_ln();


In [None]:
# Hyperparameter tuning PCA and Logistic Regression
# Checking class imbalance: share of each class in the training labels
y_train.value_counts(normalize=True)


In [None]:
# PCA
pca = PCA()

# logistic regression - the class weight is used to handle class imbalance - it adjusts the cost function
# solver='liblinear' supports both the 'l1' and 'l2' penalties searched below; the default
# solver of recent scikit-learn releases rejects 'l1', so half of the grid would fail to fit
logistic = LogisticRegression(class_weight={0:0.1, 1: 0.9}, solver='liblinear')

# create pipeline
steps = [("scaler", StandardScaler()), 
         ("pca", pca),
         ("logistic", logistic)
        ]

# compile pipeline
pca_logistic = Pipeline(steps)

# hyperparameter space
params = {'pca__n_components': [60, 80], 'logistic__C': [0.1, 0.5, 1, 2, 3, 4, 5, 10], 'logistic__penalty': ['l1', 'l2']}

# create 5 folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)

# create gridsearch object
model = GridSearchCV(estimator=pca_logistic, cv=folds, param_grid=params, scoring='roc_auc', n_jobs=-1, verbose=1)

In [None]:
# fit model
model.fit(X_train, y_train)

In [None]:
# cross validation results
pd.DataFrame(model.cv_results_)

In [None]:
# print best hyperparameters
print("Best AUC: ", model.best_score_); print_dashes_and_ln();
print("Best hyperparameters: ", model.best_params_); print_dashes_and_ln();

In [None]:
# predict churn on test data
y_pred = model.predict(X_test)

# create confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
print(cm); print_dashes_and_ln();

print("Accuracy Score ==> {}".format(round(accuracy_score(y_test,y_pred),2))); print_dashes_and_ln();
print("AUC Score ==> {}".format(round(roc_auc_score(y_test,y_pred),2))); print_dashes_and_ln();

# scikit-learn lays out a binary confusion matrix as [[TN, FP], [FN, TP]] with churn (1)
# as the positive class; the names now follow that layout (printed values are unchanged)
TN, FP, FN, TP = cm.ravel()
print("Not-Churn Accuracy Rate:(Specificity) ==> {}".format(round(TN/(TN+FP),3))); print_dashes_and_ln();
print("Churn Accuracy Rate:(Sensitivity) ==> {}".format(round(TP/(TP+FN),3))); print_dashes_and_ln();

# check area under curve, using the predicted probability of the churn class
y_pred_prob = model.predict_proba(X_test)[:, 1]
print("AUC Area: ", round(roc_auc_score(y_test, y_pred_prob),3)); print_dashes_and_ln();
    

In [None]:
# Now we will check this with Random Forest

# random forest - the class weight is used to handle class imbalance - it adjusts the cost function
# random_state makes the search reproducible across runs
forest = RandomForestClassifier(class_weight={0:0.1, 1: 0.9}, n_jobs = -1, random_state = 4)

# hyperparameter space
# 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is no longer accepted by
# recent scikit-learn releases
params = {"criterion": ['gini', 'entropy'], "max_features": ['sqrt', 0.4]}

# create 5 folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)

# create gridsearch object
model = GridSearchCV(estimator=forest, cv=folds, param_grid=params, scoring='roc_auc', n_jobs=-1, verbose=1)

In [None]:
# fit model
model.fit(X_train, y_train)

In [None]:
# print best hyperparameters
print("Best AUC: ", model.best_score_); print_dashes_and_ln();
print("Best hyperparameters: ", model.best_params_); print_dashes_and_ln();

In [None]:
# predict churn on test data
y_pred = model.predict(X_test)

# create confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
print(cm); print_dashes_and_ln();

print("Accuracy Score ==> {}".format(round(accuracy_score(y_test,y_pred),2))); print_dashes_and_ln();
print("AUC Score ==> {}".format(round(roc_auc_score(y_test,y_pred),2))); print_dashes_and_ln();

# scikit-learn lays out a binary confusion matrix as [[TN, FP], [FN, TP]] with churn (1)
# as the positive class; the names now follow that layout (printed values are unchanged)
TN, FP, FN, TP = cm.ravel()
print("Not-Churn Accuracy Rate:(Specificity) ==> {}".format(round(TN/(TN+FP),3))); print_dashes_and_ln();
print("Churn Accuracy Rate:(Sensitivity) ==> {}".format(round(TP/(TP+FN),3))); print_dashes_and_ln();

# check area under curve, using the predicted probability of the churn class
y_pred_prob = model.predict_proba(X_test)[:, 1]
print("AUC Area: ", round(roc_auc_score(y_test, y_pred_prob),3)); print_dashes_and_ln();


# Sensitivity for this model is around 46%, which is very low. We are going with PCA along with the logistic model.

In [None]:
# run a random forest model on train data
max_features = int(round(np.sqrt(X_train.shape[1])))   
# number of variables to consider to split each node
print(max_features); print_dashes_and_ln();
rf_model = RandomForestClassifier(n_estimators=100, max_features=max_features, class_weight={0:0.1, 1: 0.9}, oob_score=True, random_state=4, verbose=1)

In [None]:
# fit model
rf_model.fit(X_train, y_train)

In [None]:
# OOB score
rf_model.oob_score_

In [None]:
# predict churn on test data
# Evaluate `rf_model`, the forest fitted just above. `model` still refers to the earlier
# grid-search object, so using it here would re-report the previous results.
y_pred = rf_model.predict(X_test)

# create confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
print(cm); print_dashes_and_ln();

print("Accuracy Score ==> {}".format(round(accuracy_score(y_test,y_pred),2))); print_dashes_and_ln();
print("AUC Score ==> {}".format(round(roc_auc_score(y_test,y_pred),2))); print_dashes_and_ln();

# scikit-learn lays out a binary confusion matrix as [[TN, FP], [FN, TP]] with churn (1)
# as the positive class
TN, FP, FN, TP = cm.ravel()
print("Not-Churn Accuracy Rate:(Specificity) ==> {}".format(round(TN/(TN+FP),3))); print_dashes_and_ln();
print("Churn Accuracy Rate:(Sensitivity) ==> {}".format(round(TP/(TP+FN),3))); print_dashes_and_ln();

# check area under curve, using the predicted probability of the churn class
y_pred_prob = rf_model.predict_proba(X_test)[:, 1]
print("AUC Area: ", round(roc_auc_score(y_test, y_pred_prob),3)); print_dashes_and_ln();

In [None]:
# Feature Importance
# predictors: take the names from the matrix the forest was fitted on, so names and
# importances are guaranteed to line up one-to-one
features = X_train.columns

# feature_importance
importance = rf_model.feature_importances_

# create dataframe
feature_importance = pd.DataFrame({'variables': features, 'importance_percentage': importance*100})
feature_importance = feature_importance[['variables', 'importance_percentage']]

# sort features, most important first
feature_importance = feature_importance.sort_values('importance_percentage', ascending=False).reset_index(drop=True)
print("Sum of importance=", feature_importance.importance_percentage.sum()); print_dashes_and_ln();
feature_importance

In [None]:
# Extarcting top 25 Features

top_n = 25
top_features = feature_importance.variables[0:top_n]

In [None]:
# heat map
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

plt.figure(figsize = (15,15))      
sns.heatmap(data=X_train[top_features].corr(),annot = True)

In [None]:
top_features = ['total_ic_mou_8', 'total_rech_amt_diff', 'total_og_mou_8', 'arpu_8', 'roam_ic_mou_8', 'roam_og_mou_8', 
                'std_ic_mou_8', 'std_og_mou_diff']
X_train = X_train[top_features]
X_test = X_test[top_features]

In [None]:
# logistic regression
# solver='liblinear' supports both the 'l1' and 'l2' penalties searched below; the default
# solver of recent scikit-learn releases rejects 'l1', so half of the grid would fail to fit
steps = [('scaler', StandardScaler()), 
         ("logistic", LogisticRegression(class_weight={0:0.1, 1:0.9}, solver='liblinear'))
        ]

# compile pipeline
logistic = Pipeline(steps)

# hyperparameter space
params = {'logistic__C': [0.1, 0.5, 1, 2, 3, 4, 5, 10], 'logistic__penalty': ['l1', 'l2']}

# create 5 folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)

# create gridsearch object
model = GridSearchCV(estimator=logistic, cv=folds, param_grid=params, scoring='roc_auc', n_jobs=-1, verbose=1)

In [None]:
# fit model
model.fit(X_train, y_train)

In [None]:
# print best hyperparameters
print("Best AUC: ", model.best_score_); print_dashes_and_ln();
print("Best hyperparameters: ", model.best_params_); print_dashes_and_ln();

In [None]:
# predict churn on test data
y_pred = model.predict(X_test)

# create confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
print(cm); print_dashes_and_ln();

print("Accuracy Score ==> {}".format(round(accuracy_score(y_test,y_pred),2))); print_dashes_and_ln();
print("AUC Score ==> {}".format(round(roc_auc_score(y_test,y_pred),2))); print_dashes_and_ln();

# scikit-learn lays out a binary confusion matrix as [[TN, FP], [FN, TP]] with churn (1)
# as the positive class; the names now follow that layout (printed values are unchanged)
TN, FP, FN, TP = cm.ravel()
print("Not-Churn Accuracy Rate:(Specificity) ==> {}".format(round(TN/(TN+FP),3))); print_dashes_and_ln();
print("Churn Accuracy Rate:(Sensitivity) ==> {}".format(round(TP/(TP+FN),3))); print_dashes_and_ln();

# check area under curve, using the predicted probability of the churn class
y_pred_prob = model.predict_proba(X_test)[:, 1]
print("AUC Area: ", round(roc_auc_score(y_test, y_pred_prob),3))

In [None]:
logistic_model = model.best_estimator_.named_steps['logistic']
# intercept
intercept_df = pd.DataFrame(logistic_model.intercept_.reshape((1,1)), columns = ['intercept'])

In [None]:
# coefficients: one column per feature the logistic model was trained on
# (the earlier hand-built list relied on a hard-coded reshape to 8 features and was never
# used afterwards, so it has been removed)
logistic_features = list(X_train.columns)
coefficients_df = pd.DataFrame(logistic_model.coef_, columns=logistic_features)

In [None]:
# concatenate dataframes
coefficients = pd.concat([intercept_df, coefficients_df], axis=1)
coefficients

# Summary

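The most important indicators of churn identified above are:

1. Minutes of usage in the 8th month for outgoing and incoming calls (mostly roaming/local/STD), and
2. The recharge amount difference (8th month versus the average of the 6th and 7th months)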

# Recommendation
1. If the total usage, as measured by the total minutes of usage and the recharge amount, declines in the 7th and 8th months compared to the 6th month, then such a customer is likely to churn.
2. If the total outgoing minutes of usage fall below 220 minutes in the 8th month, we recommend that the telecom provider reach out to such customers and offer them lock-in offers to prevent their churn.
3. The telecom provider should also focus on STD and roaming rates.