In [1]:
import pandas as pd
import numpy as np
# pyplot (not the top-level matplotlib package) is what later cells use via plt.figure()/plt.plot()
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

# Read the telecom churn data from a CSV file in the working directory
telecom_df=pd.read_csv("telecom_churn_data.csv",  encoding = "ISO-8859-1")

FileNotFoundError: [Errno 2] No such file or directory: 'telecom_churn_data.csv'

In [None]:
telecom_df.head()

In [None]:
telecom_df.info()

In [None]:
# Remove columns that are not needed for modelling
# (per the original note: columns holding only 0 values, or otherwise not required)
telecom_df = telecom_df.drop(
    ['mobile_number', 'circle_id', 'loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou'],
    axis=1,
)

In [None]:
telecom_df.shape

In [None]:
# Partition the frame by dtype: numeric (int64/float64) columns vs. object columns,
# then list the column names of each part
cat_tel_df = telecom_df.select_dtypes(include=['object'])
cont_tel_df = telecom_df.select_dtypes(include=['int64', 'float64'])
for dtype_part in (cont_tel_df, cat_tel_df):
    print(dtype_part.columns)

In [None]:
# Drop every object-dtype column found above (per the original note, these are the date columns).
# The column labels are passed explicitly; handing the DataFrame itself to `columns=`
# only worked because iterating a DataFrame happens to yield its column names.
telecom_df=telecom_df.drop(columns=cat_tel_df.columns)

In [None]:
telecom_df.shape

In [None]:
# Sub-frame of the columns in which at least half of the rows are null
null_fraction = telecom_df.isna().mean()
cols = telecom_df.loc[:, null_fraction >= .5]

In [None]:
cols.columns

In [None]:
# Drop a fixed set of sparse feature families (per the original note: >= 50% null)
# for each of the months 6, 7, 8 and 9
sparse_prefixes = ['max_rech_data', 'count_rech_2g', 'count_rech_3g', 'av_rech_amt_data',
                   'arpu_3g', 'arpu_2g', 'night_pck_user', 'fb_user']
sparse_cols = ['{}_{}'.format(prefix, month)
               for prefix in sparse_prefixes
               for month in (6, 7, 8, 9)]
telecom_df = telecom_df.drop(columns=sparse_cols)

In [None]:
telecom_df.shape

In [None]:
# Sub-frame of the columns that still contain at least one null
has_null = telecom_df.isna().any()
cols = telecom_df.loc[:, has_null]

In [None]:
cols

In [None]:
# Keep only rows that have at least 53 non-null values (dropna's `thresh` counts non-NA cells).
# NOTE(review): the original note mentions "50%"; whether 53 is half of the current column
# count is not visible here -- confirm against telecom_df.shape.
telecom_df = telecom_df.dropna(thresh=53)


In [None]:
telecom_df.shape

In [None]:
telecom_df_1=telecom_df[cols.columns]

In [None]:
cols.columns

In [None]:
# Fill every remaining null with 0
telecom_df = telecom_df.fillna(0)

In [None]:
# Re-check after the fill: sub-frame of the columns that still contain a null
still_null = telecom_df.isna().any()
cols = telecom_df.loc[:, still_null]

In [None]:

# Per-month recharge total for months 6-9: total_rech_amt_<m> + total_rech_data_<m>.
# NOTE(review): presumably total_rech_data_<m> is a recharge count rather than an amount --
# confirm that adding it to an amount is intended.
for month in (6, 7, 8, 9):
    amt_col = 'total_rech_amt_{}'.format(month)
    data_col = 'total_rech_data_{}'.format(month)
    telecom_df['total_amt_{}'.format(month)] = telecom_df[amt_col] + telecom_df[data_col]

# Month-9 usage: outgoing + incoming minutes of use plus 2G and 3G data volume
telecom_df['total_usage'] = (
    telecom_df['total_og_mou_9']
    + telecom_df['total_ic_mou_9']
    + telecom_df['vol_2g_mb_9']
    + telecom_df['vol_3g_mb_9']
)

In [None]:
# Mean of the month-6 and month-7 recharge totals, averaged over all customers
combined_total = telecom_df['total_amt_6'].sum() + telecom_df['total_amt_7'].sum()
average_rec_amt = combined_total / (2 * len(telecom_df))

In [None]:
average_rec_amt

In [None]:
# Keep customers whose mean month-6/7 recharge total is at least 70% of the overall average.
# NOTE(review): the original note says "top 30% customers", but this compares against
# 0.7 * mean rather than the 70th percentile -- confirm which cut-off is intended.
telecom_df=telecom_df.loc[(telecom_df.total_amt_6+telecom_df.total_amt_7)/2 >=(average_rec_amt*.7)]

In [None]:
telecom_df.shape

In [None]:
# Collect the month-9 columns: any name containing the character '9' or the text 'sep'
sep_cols = list(filter(lambda name: '9' in name or 'sep' in name, telecom_df.columns))


In [None]:
sep_cols

In [None]:
# Remove the month-9 columns collected in sep_cols
telecom_df = telecom_df.drop(sep_cols, axis=1)


In [None]:
telecom_df.shape

In [None]:
# Collect columns whose 90th percentile is <= 0, i.e. at least 90% of rows are zero or below.
# The quantiles are computed once for the whole frame and then looked up per column;
# previously a frame-level quantile was computed and discarded, and each column's
# quantile was recomputed individually.
q90 = telecom_df.quantile(0.9)
drop_cols = [col for col in telecom_df.columns if q90[col] <= 0.0]

In [None]:
# Target variable: churn = 1 when the month-9 usage total is exactly zero, else 0.
# The helper column total_usage is removed once the target exists.
no_usage = telecom_df['total_usage'] == 0.0
telecom_df['churn'] = no_usage.astype('int64')
telecom_df = telecom_df.drop('total_usage', axis=1)

In [None]:
# Fraction of customers flagged as churned
churned_rows = telecom_df[telecom_df['churn'] == 1]
len(churned_rows) / len(telecom_df)

In [None]:
drop_cols

In [None]:
# Remove the mostly-zero columns collected in drop_cols
telecom_df = telecom_df.drop(drop_cols, axis=1)

In [None]:
telecom_df.shape

In [None]:
#checking outliers of 1st 20 columns
telecom_df.iloc[:,0:20].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])

In [None]:
# Sequentially drop rows above the 99th percentile of each listed column (first 20 columns).
# Each percentile is taken on the frame as already trimmed by the preceding columns,
# so the order of the list matters.
outlier_cols = [
    'arpu_6', 'arpu_7', 'arpu_8',
    'onnet_mou_6', 'onnet_mou_7', 'onnet_mou_8',
    'offnet_mou_6', 'offnet_mou_7', 'offnet_mou_8',
    'roam_ic_mou_6', 'roam_ic_mou_7', 'roam_ic_mou_8',
    'roam_og_mou_6', 'roam_og_mou_7', 'roam_og_mou_8',
    'loc_og_t2t_mou_6', 'loc_og_t2t_mou_7', 'loc_og_t2t_mou_8',
    'loc_og_t2m_mou_6', 'loc_og_t2m_mou_7',
]
for outlier_col in outlier_cols:
    cap = telecom_df[outlier_col].quantile(0.99)
    telecom_df = telecom_df.loc[telecom_df[outlier_col] <= cap]

In [None]:
telecom_df.iloc[:,0:20].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])

In [None]:
#checking outliers of 20th-40th columns
telecom_df.iloc[:,20:40].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])

In [None]:
# Sequentially drop rows above the 99th percentile of each listed column (columns 20-40);
# every percentile is computed on the already-trimmed frame
outlier_cols = [
    'loc_og_t2m_mou_8',
    'loc_og_t2f_mou_6', 'loc_og_t2f_mou_7', 'loc_og_t2f_mou_8',
    'loc_og_t2c_mou_6', 'loc_og_t2c_mou_7',
]
for outlier_col in outlier_cols:
    cap = telecom_df[outlier_col].quantile(0.99)
    telecom_df = telecom_df.loc[telecom_df[outlier_col] <= cap]

In [None]:
telecom_df.iloc[:,0:20].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])

In [None]:
telecom_df.iloc[:,20:40].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])

In [None]:
# Sequentially drop rows above the 99th percentile of each listed column (columns 20-40);
# every percentile is computed on the already-trimmed frame
outlier_cols = ['loc_og_t2c_mou_8', 'loc_og_mou_6', 'loc_og_mou_7', 'std_og_mou_6']
for outlier_col in outlier_cols:
    cap = telecom_df[outlier_col].quantile(0.99)
    telecom_df = telecom_df.loc[telecom_df[outlier_col] <= cap]

# Drop columns that (per the original note) hold mostly 0 values
mostly_zero_og = ['std_og_t2f_mou_6', 'std_og_t2f_mou_7', 'std_og_t2f_mou_8', 'og_others_6']
telecom_df = telecom_df.drop(mostly_zero_og, axis=1)


In [None]:
#checking outliers of 20th-40th columns
telecom_df.iloc[:,20:40].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])

In [None]:
# Sequentially drop rows above the 99th percentile of each listed column (columns 20-40);
# every percentile is computed on the already-trimmed frame
for outlier_col in ('std_og_mou_7', 'std_og_mou_8', 'spl_og_mou_6'):
    cap = telecom_df[outlier_col].quantile(0.99)
    telecom_df = telecom_df.loc[telecom_df[outlier_col] <= cap]


In [None]:

telecom_df.iloc[:,20:40].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])

In [None]:
#checking outliers of 40th-60th columns
telecom_df.iloc[:,40:60].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])

In [None]:
# Sequentially drop rows above the 99th percentile of each listed column (columns 40-60);
# every percentile is computed on the already-trimmed frame
outlier_cols = [
    'spl_og_mou_7', 'spl_og_mou_8',
    'total_og_mou_6', 'total_og_mou_8',
    'loc_ic_t2t_mou_6', 'loc_ic_t2t_mou_7', 'loc_ic_t2t_mou_8',
    'loc_ic_t2m_mou_6', 'loc_ic_t2m_mou_7', 'loc_ic_t2m_mou_8',
]
for outlier_col in outlier_cols:
    cap = telecom_df[outlier_col].quantile(0.99)
    telecom_df = telecom_df.loc[telecom_df[outlier_col] <= cap]

# Re-inspect the distribution of columns 40-60 after trimming
telecom_df.iloc[:, 40:60].describe(percentiles=[0.5, 0.75, 0.9, 0.95, 0.99])

In [None]:
# Sequentially drop rows above the 99th percentile of each listed column (columns 40-60);
# every percentile is computed on the already-trimmed frame
outlier_cols = [
    'loc_ic_t2f_mou_6', 'loc_ic_t2f_mou_7', 'loc_ic_t2f_mou_8',
    'loc_ic_mou_6', 'loc_ic_mou_7', 'loc_ic_mou_8',
    'std_ic_t2t_mou_6', 'std_ic_t2t_mou_7', 'std_ic_t2t_mou_8',
]
for outlier_col in outlier_cols:
    cap = telecom_df[outlier_col].quantile(0.99)
    telecom_df = telecom_df.loc[telecom_df[outlier_col] <= cap]


In [None]:
#checking outliers of 40th-60th columns
telecom_df.iloc[:,40:60].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])
telecom_df.shape

In [None]:
#checking outliers of 60th-80th columns
telecom_df.iloc[:,60:80].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])


In [None]:
# Sequentially drop rows above the 99th percentile of each listed column (columns 60-80);
# every percentile is computed on the already-trimmed frame
outlier_cols = [
    'std_ic_t2m_mou_6', 'std_ic_t2m_mou_7', 'std_ic_t2m_mou_8',
    'std_ic_t2f_mou_6', 'std_ic_t2f_mou_7', 'std_ic_t2f_mou_8',
]
for outlier_col in outlier_cols:
    cap = telecom_df[outlier_col].quantile(0.99)
    telecom_df = telecom_df.loc[telecom_df[outlier_col] <= cap]

In [None]:
telecom_df.iloc[:,40:60].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])

In [None]:
#checking outliers of 60th-80th columns
telecom_df.iloc[:,60:80].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])

In [None]:
# Sequentially drop rows above the 99th percentile of the total incoming minutes columns;
# every percentile is computed on the already-trimmed frame
for outlier_col in ('total_ic_mou_6', 'total_ic_mou_7', 'total_ic_mou_8'):
    cap = telecom_df[outlier_col].quantile(0.99)
    telecom_df = telecom_df.loc[telecom_df[outlier_col] <= cap]


In [None]:
telecom_df.iloc[:,60:80].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])

In [None]:
# Drop incoming-call columns that (per the original note) hold mostly 0 values,
# then re-inspect columns 60-80
mostly_zero_ic = ['spl_ic_mou_6', 'isd_ic_mou_7', 'isd_ic_mou_8',
                  'ic_others_6', 'ic_others_7', 'ic_others_8']
telecom_df = telecom_df.drop(mostly_zero_ic, axis=1)
telecom_df.iloc[:, 60:80].describe(percentiles=[0.5, 0.75, 0.9, 0.95, 0.99])

In [None]:
# Drop one more column that (per the original note) holds mostly 0 values,
# then re-inspect columns 60-80
telecom_df = telecom_df.drop(['isd_ic_mou_6'], axis=1)
telecom_df.iloc[:, 60:80].describe(percentiles=[0.5, 0.75, 0.9, 0.95, 0.99])

In [None]:
# Sequentially drop rows above the 99th percentile of each listed recharge column (columns 60-80);
# every percentile is computed on the already-trimmed frame
outlier_cols = [
    'total_rech_num_6', 'total_rech_num_7', 'total_rech_num_8',
    'total_rech_amt_6', 'total_rech_amt_7', 'total_rech_amt_8',
    'max_rech_amt_6', 'max_rech_amt_7',
]
for outlier_col in outlier_cols:
    cap = telecom_df[outlier_col].quantile(0.99)
    telecom_df = telecom_df.loc[telecom_df[outlier_col] <= cap]

# Re-inspect the distribution of columns 60-80 after trimming
telecom_df.iloc[:, 60:80].describe(percentiles=[0.5, 0.75, 0.9, 0.95, 0.99])

In [None]:
#checking outliers of 80th-100th columns
telecom_df.iloc[:,80:100].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])

In [None]:
# Sequentially drop rows above the 99th percentile of each listed column (columns 80-100);
# every percentile is computed on the already-trimmed frame
outlier_cols = [
    'max_rech_amt_8', 'last_day_rch_amt_8',
    'vol_2g_mb_6', 'vol_2g_mb_7', 'vol_2g_mb_8',
    'vol_3g_mb_6', 'vol_3g_mb_7', 'vol_3g_mb_8',
    'monthly_2g_6', 'monthly_2g_7',
    'sachet_2g_6', 'sachet_2g_7', 'sachet_2g_8',
    'aug_vbc_3g',
]
for outlier_col in outlier_cols:
    cap = telecom_df[outlier_col].quantile(0.99)
    telecom_df = telecom_df.loc[telecom_df[outlier_col] <= cap]

# Re-inspect the distribution of columns 80-100 after trimming
telecom_df.iloc[:, 80:100].describe(percentiles=[0.5, 0.75, 0.9, 0.95, 0.99])

In [None]:

telecom_df.iloc[:,80:100].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])

In [None]:
# Drop data-pack and data-recharge columns that (per the original note) hold mostly 0 values,
# then inspect everything from column 80 onward
mostly_zero_data = ['monthly_2g_6', 'monthly_2g_7',
                    'sachet_2g_6', 'sachet_2g_7', 'sachet_2g_8',
                    'total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8']
telecom_df = telecom_df.drop(mostly_zero_data, axis=1)

telecom_df.iloc[:, 80:].describe(percentiles=[0.5, 0.75, 0.9, 0.95, 0.99])

In [None]:
# Sequentially drop rows above the 99th percentile of the June and July vbc_3g columns;
# the July percentile is computed after the June trim
for outlier_col in ('jun_vbc_3g', 'jul_vbc_3g'):
    cap = telecom_df[outlier_col].quantile(0.99)
    telecom_df = telecom_df.loc[telecom_df[outlier_col] <= cap]

In [None]:
#checking outliers from 80th columns
telecom_df.iloc[:,80:].describe(percentiles=[0.5,0.75,0.9,0.95,0.99])

In [None]:
# Fraction of customers flagged as churned after all the trimming above
churned_rows = telecom_df[telecom_df['churn'] == 1]
len(churned_rows) / len(telecom_df)

In [None]:
telecom_df.shape

In [None]:
from sklearn import linear_model
import sklearn
from sklearn.model_selection import train_test_split
# 70/30 train/test partition of the cleaned frame, with a fixed seed for repeatability
df_train, df_test = train_test_split(
    telecom_df,
    test_size=0.3,
    train_size=0.7,
    random_state=100,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the churn target from the training features.
# pop() removes the column from df_train, so X_train is df_train without 'churn'.
Y_train = df_train.pop('churn')
X_train = df_train
# Scaler instance; it is fitted on the training features in a later cell
scaler = StandardScaler()
for preview in (Y_train, X_train):
    print(preview.head())

In [None]:
# Fit the scaler on the training features and overwrite every column of X_train
# with its standardized values; the fitted scaler is applied to the test set later.
X_train[X_train.columns]=scaler.fit_transform(X_train[X_train.columns])
X_train.head()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Separate the churn target from the test features (pop() removes it from df_test)
Y_test = df_test.pop('churn')
X_test = df_test
# Balanced logistic-regression estimator; it is handed to RFE further down
lm = LogisticRegression(class_weight='balanced')

In [None]:
# Standardize the test features with the scaler already fitted on the training set
# (transform only -- the scaler is not refitted on test data)
X_test[X_test.columns]=scaler.transform(X_test[X_test.columns])


### Perform PCA on the data to transform the dataset so that all features are utilized

In [None]:
from sklearn.decomposition import PCA
# Fit a PCA with all components on the scaled training features and show the component vectors
pca = PCA(random_state=123).fit(X_train)
pca.components_

In [None]:
pca.explained_variance_ratio_

In [None]:
var_cumu = np.cumsum(pca.explained_variance_ratio_)

### Scree Plot for no of components VS total variance explained

In [None]:
import matplotlib.pyplot as plt

# Scree plot: cumulative explained variance against the number of components, with
# guide lines at 35 components (red) and at 90% variance (green).
# var_cumu[i] is the variance explained by the first i + 1 components, so the curve is
# plotted against 1..n; plotting against the default 0-based index shifted it by one
# relative to the guide line at x=35.
component_counts = np.arange(1, len(var_cumu) + 1)
fig = plt.figure(figsize=[12,8])
plt.vlines(x=35, ymax=1, ymin=0, colors="r", linestyles="--")
plt.hlines(y=0.90, xmax=35, xmin=0, colors="g", linestyles="--")
plt.plot(component_counts, var_cumu)
plt.xlabel("Number of components")
plt.ylabel("Cumulative variance explained")
plt.show()

#### From the above scree plot, it is observed that 90% of the variance is explained by 35 components, so we take n_components=35.

In [None]:
# Project the training features onto 35 components with incremental PCA
from sklearn.decomposition import IncrementalPCA
pca_final = IncrementalPCA(n_components=35)
# fit on the training features, then transform those same features
df_train_pca = pca_final.fit(X_train).transform(X_train)
df_train_pca.shape

In [None]:
# Heatmap of the pairwise correlations between the PCA components
# (after transposing, each row of the matrix is one component)
corrmat = np.corrcoef(df_train_pca.T)
plt.figure(figsize=[35, 35])
sns.heatmap(corrmat, annot=True)

In [None]:
# Performing PCA on testing dataset
df_test_pca = pca_final.transform(X_test)

df_test_pca.shape

### Build Logistic Regression model on the dataset after performing PCA on them

In [None]:
# Balanced logistic regression on the 35 incremental-PCA components;
# the cell's output is the test-set ROC AUC formatted to two significant digits
learner_pca = LogisticRegression(class_weight='balanced')
model_pca = learner_pca.fit(df_train_pca, Y_train)
pred_probs_test = model_pca.predict_proba(df_test_pca)
auc_pca = metrics.roc_auc_score(Y_test, pred_probs_test[:,1])
"{:2.2}".format(auc_pca)

In [None]:
# PCA that keeps as many components as are needed to explain 90% of the variance
pca_again = PCA(n_components=0.90)
df_train_pca2 = pca_again.fit_transform(X_train)
df_train_pca2.shape

In [None]:
# Project the test features with the 90%-variance PCA, and fit a balanced
# logistic regression on the corresponding training components
df_test_pca2 = pca_again.transform(X_test)
learner_pca2 = LogisticRegression(class_weight='balanced')
model_pca2 = learner_pca2.fit(df_train_pca2, Y_train)
df_test_pca2.shape

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

# Score the test set with the PCA(0.90) logistic-regression model.
pred_probs_test2 = model_pca2.predict_proba(df_test_pca2)[:,1]
# The formatted AUC used to be a bare mid-cell expression, so it was never shown; print it.
print("ROC AUC:", "{:2.2}".format(metrics.roc_auc_score(Y_test, pred_probs_test2)))

# Test-set labels, churn probabilities and the 0/1 prediction at a 0.5 cut-off
# (the frame keeps its historical name even though it holds test-set predictions)
y_train_pred_final = pd.DataFrame({'Churn':Y_test.values,'Churn_Prob':pred_probs_test2})
y_train_pred_final['predicted'] = y_train_pred_final.Churn_Prob.map(lambda x: 1 if x > 0.5 else 0)

print(classification_report(y_train_pred_final.Churn, y_train_pred_final.predicted))

confusion = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.predicted)
print(confusion)
print("accuracy:", metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.predicted), "\n")

# confusion_matrix rows are actual classes and columns are predicted classes:
# [[TN, FP], [FN, TP]]. TP / (FN + TP) is the sensitivity (recall of the churn class);
# it was previously printed under the label "Specificity".
sensitivity = confusion[1,1]/(confusion[1,0]+confusion[1,1])
# Specificity is the true-negative rate: TN / (TN + FP)
specificity1 = confusion[0,0]/(confusion[0,0]+confusion[0,1])
print('Sensitivity : ', sensitivity)
print('Specificity : ', specificity1)

### Build SVM model using linear kernel on the dataset after performing PCA on them

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix

# Linear-kernel SVM with balanced class weights on the PCA(0.90) training components
model_linear = SVC(class_weight='balanced',kernel='linear')
model_linear.fit(df_train_pca2, Y_train)

# predict on the test components
y_pred = model_linear.predict(df_test_pca2)

# accuracy
print("accuracy:", metrics.accuracy_score(y_true=Y_test, y_pred=y_pred), "\n")

# per-class precision / recall / f1
print(classification_report(Y_test,y_pred))

confusion = metrics.confusion_matrix(y_true=Y_test, y_pred=y_pred)
print(confusion)

# confusion_matrix rows are actual classes and columns are predicted classes:
# [[TN, FP], [FN, TP]]. TP / (FN + TP) is the sensitivity (recall of the churn class);
# it was previously printed under the label "Specificity".
sensitivity = confusion[1,1]/(confusion[1,0]+confusion[1,1])
# Specificity is the true-negative rate: TN / (TN + FP)
specificity1 = confusion[0,0]/(confusion[0,0]+confusion[0,1])
print('Sensitivity : ', sensitivity)
print('Specificity : ', specificity1)

### Build SVM model using Radial Basis Function (RBF) kernel on the dataset after performing PCA

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix

# RBF-kernel SVM with balanced class weights on the PCA(0.90) training components.
# Bound to its own name so that it no longer overwrites the linear model.
model_rbf = SVC(class_weight='balanced',kernel='rbf')
model_rbf.fit(df_train_pca2, Y_train)

# predict on the test components
y_pred = model_rbf.predict(df_test_pca2)

# accuracy
print("accuracy:", metrics.accuracy_score(y_true=Y_test, y_pred=y_pred), "\n")

# per-class precision / recall / f1
print(classification_report(Y_test,y_pred))

confusion = metrics.confusion_matrix(y_true=Y_test, y_pred=y_pred)
print(confusion)

# confusion_matrix rows are actual classes and columns are predicted classes:
# [[TN, FP], [FN, TP]]. TP / (FN + TP) is the sensitivity (recall of the churn class);
# it was previously printed under the label "Specificity".
sensitivity = confusion[1,1]/(confusion[1,0]+confusion[1,1])
# Specificity is the true-negative rate: TN / (TN + FP)
specificity1 = confusion[0,0]/(confusion[0,0]+confusion[0,1])
print('Sensitivity : ', sensitivity)
print('Specificity : ', specificity1)

### Tuning hyperparameters to build Random Forest model

In [None]:
# Grid-search max_depth for a random forest on the PCA(0.90) training components

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validation
n_folds = 5

# candidate depths: 2, 7, 12, 17
parameters = {'max_depth': range(2, 20, 5)}

# wrap a default forest in the search; rf ends up bound to the fitted search object
rf = GridSearchCV(
    RandomForestClassifier(),
    parameters,
    scoring="accuracy",
    cv=n_folds,
    return_train_score=True,
)
rf.fit(df_train_pca2, Y_train)

# cross-validation results of the search
scores = rf.cv_results_
pd.DataFrame(scores).head()

# mean training vs. cross-validated accuracy for each max_depth
plt.figure()
for score_key, curve_label in (("mean_train_score", "training accuracy"),
                               ("mean_test_score", "test accuracy")):
    plt.plot(scores["param_max_depth"], scores[score_key], label=curve_label)
plt.xlabel("max_depth")
plt.ylabel("Accuracy")
plt.legend()
plt.show()


In [None]:

# Grid-search n_estimators for a depth-4 random forest on the PCA(0.90) training components
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validation
n_folds = 5

# candidate forest sizes: 100, 500, ..., 2900
parameters = {'n_estimators': range(100, 3000, 400)}

# the base forest has max_depth fixed at 4; rf ends up bound to the fitted search object
rf = GridSearchCV(
    RandomForestClassifier(max_depth=4),
    parameters,
    scoring="accuracy",
    cv=n_folds,
    return_train_score=True,
)
rf.fit(df_train_pca2, Y_train)

# cross-validation results of the search
scores = rf.cv_results_
pd.DataFrame(scores).head()

# mean training vs. cross-validated accuracy for each n_estimators
plt.figure()
for score_key, curve_label in (("mean_train_score", "training accuracy"),
                               ("mean_test_score", "test accuracy")):
    plt.plot(scores["param_n_estimators"], scores[score_key], label=curve_label)
plt.xlabel("n_estimators")
plt.ylabel("Accuracy")
plt.legend()
plt.show()


In [None]:
# GridSearchCV to find optimal max_features
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on.
# max_features cannot meaningfully exceed the number of input features (the PCA
# component count); depending on the scikit-learn version an oversized value is
# rejected at fit time. Clamp the candidates to the available feature count and
# de-duplicate so that every grid point is valid.
n_components_available = df_train_pca2.shape[1]
parameters = {
    'max_features': sorted({min(candidate, n_components_available)
                            for candidate in (4, 8, 14, 20, 34, 40)})
}

# instantiate the model (note we are specifying a max_depth)
rf = RandomForestClassifier(max_depth=4)


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                  return_train_score=True,
                   scoring="accuracy")
rf.fit(df_train_pca2, Y_train)
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()
# plotting accuracies with max_features
plt.figure()
plt.plot(scores["param_max_features"], 
         scores["mean_train_score"], 
         label="training accuracy")
plt.plot(scores["param_max_features"], 
         scores["mean_test_score"], 
         label="test accuracy")
plt.xlabel("max_features")
plt.ylabel("Accuracy")
plt.legend()
plt.show()


In [None]:
# Grid-search min_samples_leaf for a depth-4 random forest on the PCA(0.90) training components
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validation
n_folds = 5

# candidate leaf sizes: 100, 150, ..., 350
parameters = {'min_samples_leaf': range(100, 400, 50)}

# the base forest has max_depth fixed at 4; rf ends up bound to the fitted search object
rf = GridSearchCV(
    RandomForestClassifier(max_depth=4),
    parameters,
    scoring="accuracy",
    cv=n_folds,
    return_train_score=True,
)
rf.fit(df_train_pca2, Y_train)

# cross-validation results of the search
scores = rf.cv_results_
pd.DataFrame(scores).head()

# mean training vs. cross-validated accuracy for each min_samples_leaf
plt.figure()
for score_key, curve_label in (("mean_train_score", "training accuracy"),
                               ("mean_test_score", "test accuracy")):
    plt.plot(scores["param_min_samples_leaf"], scores[score_key], label=curve_label)
plt.xlabel("min_samples_leaf")
plt.ylabel("Accuracy")
plt.legend()
plt.show()


In [None]:
# Grid-search min_samples_split for a class-balanced random forest on the PCA(0.90) training components
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validation
n_folds = 5

# candidate split sizes: 200, 250, ..., 450
parameters = {'min_samples_split': range(200, 500, 50)}

# the base forest uses balanced class weights and no depth limit;
# rf ends up bound to the fitted search object
rf = GridSearchCV(
    RandomForestClassifier(class_weight='balanced'),
    parameters,
    scoring="accuracy",
    cv=n_folds,
    return_train_score=True,
)
rf.fit(df_train_pca2, Y_train)

# cross-validation results of the search
scores = rf.cv_results_
pd.DataFrame(scores).head()

# mean training vs. cross-validated accuracy for each min_samples_split
plt.figure()
for score_key, curve_label in (("mean_train_score", "training accuracy"),
                               ("mean_test_score", "test accuracy")):
    plt.plot(scores["param_min_samples_split"], scores[score_key], label=curve_label)
plt.xlabel("min_samples_split")
plt.ylabel("Accuracy")
plt.legend()
plt.show()


In [None]:
# Joint grid search over several random-forest hyperparameters
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# NOTE(review): n_folds is set to 5 here, but the search below runs with cv=3 --
# confirm which fold count is intended
n_folds = 5

# 3 x 2 x 2 x 3 x 2 = 72 candidate combinations
param_grid = dict(
    max_depth=[4,8,10],
    min_samples_leaf=range(100, 400, 200),
    min_samples_split=range(200, 500, 200),
    n_estimators=[100,200, 300],
    max_features=[5, 10],
)

# class-balanced base forest
rf = RandomForestClassifier(class_weight='balanced')
# 3-fold search using all available cores; no scoring is given, so the estimator's default score is used
grid_search = GridSearchCV(rf, param_grid, cv=3, n_jobs=-1, verbose=1, return_train_score=True)
grid_search.fit(df_train_pca2, Y_train)
best_score, best_params = grid_search.best_score_, grid_search.best_params_
print('We can get accuracy of',best_score,'using',best_params)



### Building Random Forest model with tuned hyperparameters of the dataset after performing PCA

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Final class-balanced random forest on the PCA(0.90) training components.
# NOTE(review): several of these values (e.g. max_depth=12, n_estimators=500, max_features=20)
# are not among the candidates of the joint grid search -- confirm how they were chosen.
rf_settings = dict(
    class_weight='balanced',
    bootstrap=True,
    max_depth=12,
    min_samples_leaf=150,
    min_samples_split=100,
    max_features=20,
    n_estimators=500,
)
rfc = RandomForestClassifier(**rf_settings)
rfc.fit(df_train_pca2, Y_train)

In [None]:

predictions = rfc.predict(df_test_pca2)

In [None]:
# Test-set evaluation of the random forest: per-class report, confusion matrix, accuracy
from sklearn.metrics import classification_report,confusion_matrix
rf_report = classification_report(Y_test,predictions)
rf_confusion = confusion_matrix(Y_test,predictions)
for rf_summary in (rf_report, rf_confusion):
    print(rf_summary)
print("accuracy:", metrics.accuracy_score(y_true=Y_test, y_pred=predictions), "\n")


In [None]:
confusion = metrics.confusion_matrix(Y_test,predictions)
# confusion_matrix rows are actual classes and columns are predicted classes:
# [[TN, FP], [FN, TP]]. TP / (FN + TP) is the sensitivity (recall of the churn class);
# it was previously printed under the label "Specificity".
sensitivity = confusion[1,1]/(confusion[1,0]+confusion[1,1])
# Specificity is the true-negative rate: TN / (TN + FP)
specificity1 = confusion[0,0]/(confusion[0,0]+confusion[0,1])
print('Sensitivity : ', sensitivity)
print('Specificity : ', specificity1)

#### Perform RFE to obtain best 15 features from the training dataset

In [None]:
from sklearn.feature_selection import RFE
# Recursive feature elimination down to 15 features, using the balanced logistic regression `lm`.
# n_features_to_select is passed by keyword: recent scikit-learn releases accept it
# only as a keyword argument, and the keyword form also works on older releases.
rfe = RFE(lm, n_features_to_select=15)
rfe = rfe.fit(X_train, Y_train)
# names of the features RFE kept
col = X_train.columns[rfe.support_]

In [None]:
# Variance inflation factor of each RFE-selected feature, highest first
from statsmodels.stats.outliers_influence import variance_inflation_factor
selected = X_train[col]
vif = pd.DataFrame({
    'Features': selected.columns,
    'VIF': [variance_inflation_factor(selected.values, idx) for idx in range(selected.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove total_og_mou_8 (per the original note: VIF > 5) and recompute the VIF table
col = col.drop('total_og_mou_8')
selected = X_train[col]
vif = pd.DataFrame({
    'Features': selected.columns,
    'VIF': [variance_inflation_factor(selected.values, idx) for idx in range(selected.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove offnet_mou_8 (per the original note: VIF > 5) and recompute the VIF table
col = col.drop('offnet_mou_8')
selected = X_train[col]
vif = pd.DataFrame({
    'Features': selected.columns,
    'VIF': [variance_inflation_factor(selected.values, idx) for idx in range(selected.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove total_amt_8 (per the original note: VIF > 5) and recompute the VIF table
col = col.drop('total_amt_8')
selected = X_train[col]
vif = pd.DataFrame({
    'Features': selected.columns,
    'VIF': [variance_inflation_factor(selected.values, idx) for idx in range(selected.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Binomial GLM on the remaining selected features (plus a constant) to inspect the coefficients
import statsmodels.api as sm
X_train_new = X_train[col]
X_train_sm = sm.add_constant(X_train_new)
logm2 = sm.GLM(endog=Y_train, exog=X_train_sm, family=sm.families.Binomial())
res = logm2.fit()
res.summary()


### Build Logistic Regression model without PCA

In [None]:

# Balanced logistic regression on the selected features without PCA.
# The test features get the same constant column that was added to the training design matrix.
X_test_new = X_test[col]
X_test_sm = sm.add_constant(X_test_new)

lr = LogisticRegression(class_weight='balanced')
model_lr = lr.fit(X_train_sm, Y_train)

# churn probabilities on the test set; the cell's output is the ROC AUC
# formatted to two significant digits
pred_probs_test2 = model_lr.predict_proba(X_test_sm)[:,1]
auc_lr = metrics.roc_auc_score(Y_test, pred_probs_test2)
"{:2.2}".format(auc_lr)


In [None]:
# Test-set labels and churn probabilities, plus the 0/1 prediction at a 0.5 cut-off
# (the frame keeps its historical name even though it holds test-set predictions)
y_train_pred_final = pd.DataFrame({'Churn':Y_test.values,'Churn_Prob':pred_probs_test2})
above_cutoff = y_train_pred_final['Churn_Prob'] > 0.5
y_train_pred_final['predicted'] = above_cutoff.astype('int64')


In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.predicted )
print(confusion)

In [None]:
print(metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.predicted))


In [None]:
print(classification_report(y_train_pred_final.Churn, y_train_pred_final.predicted))
# `confusion` is the confusion matrix computed a few cells earlier; its rows are actual
# classes and its columns predicted classes: [[TN, FP], [FN, TP]].
# TP / (FN + TP) is the sensitivity (recall of the churn class); it was previously
# printed under the label "Specificity".
sensitivity = confusion[1,1]/(confusion[1,0]+confusion[1,1])
# Specificity is the true-negative rate: TN / (TN + FP)
specificity1 = confusion[0,0]/(confusion[0,0]+confusion[0,1])
print('Sensitivity : ', sensitivity)
print('Specificity : ', specificity1)

## Conclusions

##### After applying PCA, Logistic Regression is the model chosen for this dataset, since it yields the highest sensitivity.
##### Because our concern is to predict the customers who churn, and the factors affecting them,
##### the metric considered for model selection is sensitivity.

##### The top 5 feature variables after performing RFE and Logistic Regression are:
##### 1.loc_ic_mou_8
##### 2.last_day_rch_amt_8
##### 3.total_rech_num_8
##### 4.max_rech_amt_8
##### 5.std_ic_t2f_mou_8