In [25]:
import numpy as np
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

In [27]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

df.columns = df.columns.str.lower().str.replace(' ','_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ','_')

df.totalcharges = pd.to_numeric(df.totalcharges,errors='coerce')
df.totalcharges = df.totalcharges.fillna(0)

df.churn = (df.churn == 'yes').astype(int)


In [28]:
df_full_train,df_test = train_test_split(df,test_size=0.2,random_state=1)

In [29]:
numerical = ['tenure','monthlycharges','totalcharges']

categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
       'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']



In [30]:
def train(df_train,y_train,C=1.0):
    dicts= df_train[categorical+numerical].to_dict(orient='records')

    dv= DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(X_train,y_train)

    return dv,model

In [31]:
def predict(df,dv,model):
    dicts = df[categorical+numerical].to_dict(orient='records')

    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:,1]
    return y_pred

In [32]:
C = 1
n_splits=5

In [33]:
kfold = KFold(n_splits=n_splits,shuffle=True,random_state=1)
scores = []
    
for train_idx, val_idx in kfold.split(df_full_train):
        
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]
        
    y_train = df_train.churn.values
    y_val = df_val.churn.values
        
    dv, model = train(df_train, y_train,C=C)
    y_pred = predict(df_val, dv, model)
        
    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)
print('C=%s %.3f +- %.3f' %(C,np.mean(scores),np.std(scores)))

    

C=1 0.840 +- 0.008


In [34]:
scores

[0.8424103743592561,
 0.8455854357038802,
 0.8311739915713425,
 0.8301724275756219,
 0.8522520473157416]

In [35]:
dv, model = train(df_full_train, df_full_train.churn,C=0.5)
y_pred = predict(df_test, dv, model)
y_test = df_test.churn.values
auc = roc_auc_score(y_test, y_pred)
auc

0.8589015459282612

In [38]:
import pickle 
output_file = f'model_C={C}.bin'
output_file

'model_C=1.bin'

In [None]:
# f_out = open(output_file,'wb')
# pickle.dump((dv,model),f_out)
# f_out.close()

In [39]:
with open(output_file,'wb') as f_out:
    pickle.dump((dv,model),f_out)

In [1]:
import pickle

In [2]:
model_file = 'model_C=1.bin'

In [3]:
with open(model_file,'rb') as f_in:
    dv,model = pickle.load(f_in)

In [4]:
dv,model

(DictVectorizer(sparse=False), LogisticRegression(C=0.5, max_iter=1000))

In [33]:
customer = {
 'gender':  'male',
 'seniorcitizen':  0,
 'partner': 'yes',
 'dependents': 'yes',
 'tenure': 12,
 'phoneservice': 'yes',
 'multiplelines':'no',
 'internetservice':  'no',
 'onlinesecurity':  'no',
 'onlinebackup':'no',
 'deviceprotection':  'no',
 'techsupport': 'no',
 'streamingtv': 'no',
 'streamingmovies':'no',
 'contract': 'two_year',
 'paperlessbilling': 'no',
 'paymentmethod': 'mailed_check',
 'monthlycharges': 19.7,
 'totalcharges': 258.35,
 'churn': 0}

In [34]:
X = dv.transform([customer])

In [35]:
model.predict_proba(X)[0,1]

0.09591633613943829