In [4]:
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [5]:
# Load the raw customer table.
data = "customer.csv"
df = pd.read_csv(data)

# Normalise column labels: lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(" ", "_")

# Columns stored with object dtype are treated as text and normalised the same way.
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == "object"]

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(" ", "_")

# errors="coerce" turns unparsable entries into NaN, which are then replaced by 0.
df["totalcharges"] = pd.to_numeric(df["totalcharges"], errors="coerce").fillna(0)

# Binary target: 1 where churn is "yes", 0 otherwise.
df["churn"] = (df["churn"] == "yes").astype(int)

In [6]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [7]:
# Feature columns fed to the model, grouped by kind.
numerical = [
    "tenure",
    "monthlycharges",
    "totalcharges",
]

categorical = [
    "gender",
    "seniorcitizen",
    "partner",
    "dependents",
    "phoneservice",
    "multiplelines",
    "internetservice",
    "onlinesecurity",
    "onlinebackup",
    "deviceprotection",
    "techsupport",
    "streamingtv",
    "streamingmovies",
    "contract",
    "paperlessbilling",
    "paymentmethod",
]

In [8]:
def training(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic-regression model on a training frame.

    Parameters
    ----------
    df_train : DataFrame
        Must provide every column named in the module-level ``categorical``
        and ``numerical`` lists; other columns are ignored.
    y_train : array-like
        Target values passed to ``LogisticRegression.fit``.
    C : float, default 1.0
        Forwarded unchanged to ``LogisticRegression(C=...)``.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted DictVectorizer and the fitted classifier.
    """
    feature_columns = categorical + numerical
    records = df_train[feature_columns].to_dict(orient="records")

    vectorizer = DictVectorizer(sparse=False)
    classifier = LogisticRegression(C=C, max_iter=10000)

    # The vectorizer is fitted on the training records only; the same fitted
    # instance is returned so callers can transform other frames consistently.
    classifier.fit(vectorizer.fit_transform(records), y_train)

    return vectorizer, classifier

In [9]:
def predict(df, dv, model):
    """Score a frame with an already-fitted vectorizer/model pair.

    Parameters
    ----------
    df : DataFrame
        Must provide every column named in the module-level ``categorical``
        and ``numerical`` lists.
    dv : fitted vectorizer exposing ``transform``.
    model : fitted classifier exposing ``predict_proba``.

    Returns
    -------
    The second column (index 1) of ``model.predict_proba`` for each row of ``df``.
    """
    records = df[categorical + numerical].to_dict(orient="records")
    features = dv.transform(records)

    # Keep only column 1 of the probability matrix.
    return model.predict_proba(features)[:, 1]

# K-fold cross-validation of the model on the full-train portion.
C = 0.01
n_splits = 5
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=2)
scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    # Positional split of the frame into this fold's train / validation rows.
    df_train, df_val = df_full_train.iloc[train_idx], df_full_train.iloc[val_idx]
    y_train, y_val = df_train.churn.values, df_val.churn.values

    # Fit on the fold's training rows, score its validation rows.
    dv, model = training(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

# Mean and standard deviation of the per-fold AUC values.
print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

scores

In [12]:
# Refit on the whole full-train portion and score the held-out test rows.
# Pass the notebook-level C rather than repeating the literal 0.01: C is the
# value used in cross-validation and the one embedded in the saved model's
# file name, so a hard-coded copy would silently drift if C is changed.
dv, model = training(df_full_train, df_full_train.churn.values, C=C)
y_pred = predict(df_test, dv, model)

In [13]:
# ROC AUC of the final model's predictions on the held-out test rows.
y_test = df_test["churn"].values
auc = roc_auc_score(y_test, y_pred)
auc

0.8548119319228228

### Save the model

In [14]:
import pickle

In [15]:
# The file name records the C value so differently-tuned models do not overwrite each other.
output_file = f"model_C={C}.bin"
output_file

'model_C=0.01.bin'

In [16]:
# Manual open/dump/close version of the `with` block in the next cell (kept for reference only):
#f_out=open(output_file,"wb")
#pickle.dump((dv,model),f_out)
#f_out.close()

In [17]:
# Store the vectorizer and the model together as one tuple so they are always
# loaded as a matching pair; the context manager closes the file on exit.
with open(output_file, "wb") as f_out:
    pickle.dump((dv, model), f_out)

### Load the model 

In [18]:
import pickle

In [19]:
# Path of the pickled (DictVectorizer, model) pair, written out literally here
# rather than rebuilt from C.
input_file = "model_C=0.01.bin"

In [21]:
# Restore the (vectorizer, model) tuple from disk.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open(input_file, "rb") as f_in:
    dv, model = pickle.load(f_in)

In [22]:
# Display the restored objects to confirm the round trip.
(dv, model)

(DictVectorizer(sparse=False), LogisticRegression(C=0.01, max_iter=10000))