In [None]:
import pandas as pd
import sklearn 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import metrics


# Folder that holds the three CSV extracts read below.
path=r'data/LogisticRegression/Telecom Churn/'

# Read the customer, churn and internet tables.
df_cust = pd.read_csv(f'{path}customer_data.csv')
df_curn = pd.read_csv(f'{path}churn_data.csv')
df_int = pd.read_csv(f'{path}internet_data.csv')

# Inner-join the three tables on customerID, so only ids present in all of them remain.
df_1 = df_cust.merge(df_curn, how='inner', on='customerID')
df_telecom = df_1.merge(df_int, how='inner', on='customerID')

In [None]:
# Preview the first rows of the merged frame.
df_telecom.head()

In [None]:
# Print the column dtypes and non-null counts of the merged frame.
df_telecom.info()

In [None]:
# Summary statistics of the merged frame (describe() defaults).
df_telecom.describe()

In [None]:
# Count how many rows fall under each Contract value (cast to category first).
df_telecom['Contract'].astype('category').value_counts()

In [None]:
# Preview only the float64/int64 columns; .head() keeps the cell output to a
# handful of rows instead of rendering the whole frame.
df_telecom.select_dtypes(include=['float64', 'int64']).head()

In [None]:
# Column groups used by the preprocessing cells.
bin_cols = [
    'PhoneService', 'PaperlessBilling', 'Churn', 'Partner', 'Dependents', 'MultipleLines',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
]
cat_cols = ['PaymentMethod', 'Contract', 'gender', 'InternetService']
num_cols = ['MonthlyCharges', 'TotalCharges', 'tenure', 'SeniorCitizen']

# Rows flagged 'No phone service' are folded into MultipleLines == 'No'.
df1 = df_telecom[df_telecom['MultipleLines'].eq('No phone service')]
df_telecom.loc[df1.index, 'MultipleLines'] = 'No'

# Encode 'Yes'/'No' as 1/0; any other value in these columns maps to NaN.
yes_no = {'Yes': 1, 'No': 0}
df_telecom[bin_cols] = df_telecom[bin_cols].apply(lambda col: col.map(yes_no))

df_telecom[bin_cols].head()




In [None]:
# One-hot encode the categorical columns; drop_first=True removes the first
# level of each so the dummies are not perfectly collinear.
# dtype=int pins the dummies to 0/1 integers: newer pandas releases return
# boolean columns by default, which would mix bool and numeric columns in the
# design matrix built from this frame later on.
df_dummies = pd.get_dummies(df_telecom[cat_cols], drop_first=True, dtype=int)

# Append the dummy columns next to the original ones (the originals are
# dropped in the following cell).
df_telecom = pd.concat([df_telecom, df_dummies], axis=1)
df_telecom.head()

In [None]:
# Remove the raw categorical columns listed in cat_cols.
df_telecom = df_telecom.drop(columns=cat_cols)

df_telecom.head()

In [None]:
def _to_numeric_if_possible(column):
    """Return ``column`` parsed by ``pd.to_numeric``, or unchanged if parsing fails.

    This reproduces what ``errors="ignore"`` did (hand back the input when any
    value cannot be parsed) without relying on that deprecated option.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Convert each column in num_cols to a numeric dtype where every value parses;
# columns containing unparsable values are left exactly as they were.
df_telecom[num_cols] = df_telecom[num_cols].apply(_to_numeric_if_possible)
df_telecom['tenure'].isnull().sum()

In [None]:
# Print the column dtypes and non-null counts of the frame at this stage.
df_telecom.info()

In [None]:
# Number of missing values per column.
df_telecom.isnull().sum()

In [None]:
# Columns that contain at least one missing value get their gaps filled with 0.
cols = list(df_telecom.columns[df_telecom.isnull().sum() > 0])

df_telecom[cols] = df_telecom[cols].fillna(0)

# Parse TotalCharges as numbers. Blank or otherwise unparsable entries become
# NaN and those rows are dropped. Unlike calling .strip() on every value, this
# also works when the column already holds numbers, and it leaves TotalCharges
# with a numeric dtype.
total_charges = pd.to_numeric(df_telecom['TotalCharges'], errors='coerce')
df_telecom = df_telecom.assign(TotalCharges=total_charges).dropna(subset=['TotalCharges'])

In [None]:
# Separate the feature matrix from the target ahead of the train/test split.
# customerID is excluded from the features along with the Churn target.
y = df_telecom['Churn']
x = df_telecom.drop(columns=['customerID', 'Churn'])

x.head()


In [None]:
# 70% of the rows go to training, the rest to test. random_state is fixed so
# that a kernel restart reproduces the same split (and therefore the same
# coefficients and metrics further down).
x_train,x_test,y_train,y_test= train_test_split(x,y,train_size=0.7,random_state=100)

In [None]:

scaler=StandardScaler()

cols_scale=['tenure','MonthlyCharges','TotalCharges']

# x_train was produced from x by the split; take an explicit copy before
# assigning columns so the write cannot be flagged as (or behave like) a write
# into a view of another frame.
x_train = x_train.copy()

# Fit the scaler on the training rows only and standardise the three
# continuous columns in place.
x_train[cols_scale]= scaler.fit_transform(x_train[cols_scale])

x_train.head()



In [None]:
# Share of training rows with Churn == 1, shown as a rounded percentage.
rate = y_train.mean()
round(rate * 100)

In [None]:

# Correlate only numeric/boolean columns. Non-numeric columns (for example an
# identifier stored as text) cannot be correlated, and recent pandas versions
# raise instead of silently skipping them.
numeric_view = df_telecom.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(20,20))
sns.heatmap(numeric_view.corr(),annot=True)

In [None]:
# Modeling: binomial GLM (logistic regression) on every training feature,
# with an intercept column added by add_constant.
x_train_sm = sm.add_constant(x_train)
logml = sm.GLM(y_train, x_train_sm, family=sm.families.Binomial())
m = logml.fit()
m.summary()

In [None]:

lg= LogisticRegression()

# Recursive feature elimination down to 15 features. The count has to be
# passed by keyword: n_features_to_select is keyword-only in current
# scikit-learn, so RFE(lg, 15) raises a TypeError there.
rfe=RFE(lg, n_features_to_select=15)
rfe=rfe.fit(x_train, y_train)

# Names of the columns RFE kept.
cols= x_train.columns[rfe.support_]

In [None]:

# Refit the binomial GLM using only the columns held in cols, plus an intercept.
x_train_rfe = sm.add_constant(x_train[cols])
logml = sm.GLM(y_train, x_train_rfe, family=sm.families.Binomial())
m = logml.fit()
m.summary()

In [None]:

# Predicted probabilities for the training rows from the fitted model.
y_train_pred = m.predict(sm.add_constant(x_train[cols]))

# Label a row 1 when its probability is above 0.6, otherwise 0.
y_train_pred_result = (y_train_pred > 0.6).astype('int64')

confusion = metrics.confusion_matrix(y_train, y_train_pred_result)

metrics.accuracy_score(y_train, y_train_pred_result)

# i.e. (3250+693) / (3250+693+372+607)

In [None]:
confusion

# Layout of the confusion matrix (rows = actual, columns = predicted):
#
#                 pred NOT CHURN | pred CHURN
# actual NOT CHURN   True Neg.   | False Pos.
# actual CHURN       False Neg.  | True Pos.

# Row-major unpacking follows the layout above.
TN, FP, FN, TP = confusion.ravel()

# True-positive rate and false-positive rate.
TP_rate = TP / (TP + FN)
FP_rate = FP / (FP + TN)

print (TP_rate, FP_rate)

In [None]:
def draw_roc( actual, probs ):
    """Plot the ROC curve of ``probs`` against ``actual`` and show the figure.

    The curve is computed with ``metrics.roc_curve`` (all thresholds kept) and
    labelled with the area returned by ``metrics.roc_auc_score``. A dashed
    diagonal from (0, 0) to (1, 1) is drawn for reference. Returns ``None``.
    """
    false_pos, true_pos, _ = metrics.roc_curve(actual, probs, drop_intermediate=False)
    area = metrics.roc_auc_score(actual, probs)

    plt.figure(figsize=(5, 5))
    plt.plot(false_pos, true_pos, label='ROC curve (area = %0.2f)' % area)
    plt.plot([0, 1], [0, 1], 'k--')

    # Axis ranges; the y axis leaves a little headroom above 1.
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

draw_roc(y_train,y_train_pred)

In [None]:
# Probability cut-offs 0.0, 0.1, ..., 0.9.
numbers= [n/10 for n in range(10)]

# y_train_pred arrives here as a Series of probabilities. Assigning
# y_train_pred[n] = <Series> on a Series addresses a row label rather than
# adding a column, so promote it to a DataFrame first. The name y_train_pred
# is kept because the following cell reads y_train_pred[<cutoff>].
if isinstance(y_train_pred, pd.Series):
    y_train_pred = y_train_pred.to_frame(name='Churn_Prob')

# One 0/1 column per cut-off: 1 when the probability is above the cut-off.
for n in numbers:
    y_train_pred[n] = (y_train_pred['Churn_Prob'] > n).astype('int64')

y_train_pred.head()

In [None]:





# One row per probability cut-off: accuracy, sensitivity and specificity on
# the training data.
cutoff_df = pd.DataFrame(columns=['prob', 'accuracy', 'sensi', 'speci'])

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for cutoff in num:
    matrix = metrics.confusion_matrix(y_train, y_train_pred[cutoff])
    # Row-major order of the 2x2 matrix: [0,0], [0,1], [1,0], [1,1].
    tn, fp, fn, tp = matrix.ravel()

    accuracy = (tn + tp) / (tn + fp + fn + tp)
    speci = tn / (tn + fp)
    sensi = tp / (fn + tp)

    cutoff_df.loc[cutoff] = [cutoff, accuracy, sensi, speci]
print(cutoff_df)

In [6]:

n=15

def fab(f,s, max_val):
    """Print a Fibonacci-style sequence, one term per line.

    Called with ``f == 0 and s == 0`` it prints 0 and seeds the pair as
    (1, 0); otherwise it prints ``f + s`` and advances the pair to
    ``(s, f + s)``. It then recurses while the latest term ``s`` is still below
    ``max_val``, so the first term that reaches or exceeds ``max_val`` is
    printed before the recursion stops. Returns ``None``.
    """
    if f == 0 and s == 0:
        print(0)
        f = 1
    else:
        f, s = s, f + s
        print(s)

    # Stop once the latest term is no longer below the limit.
    if not s < max_val:
        return
    fab(f, s, max_val)


fab(0,0,n)
            


0
1
1
2
3
5
8
13
21
