In [11]:
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [12]:
data="customer.csv"
df=pd.read_csv(data)

df.columns=df.columns.str.lower().str.replace(" ","_")

categorical_columns=list(df.dtypes[df.dtypes=="object"].index)

for c in categorical_columns: 
    df[c]=df[c] .str.lower().str.replace(" ","_")

df.totalcharges=pd.to_numeric(df.totalcharges, errors="coerce")
df.totalcharges=df.totalcharges.fillna(0)
df.churn=(df.churn =="yes").astype(int)    

In [13]:
df_full_train,df_test=train_test_split(df,test_size=0.2,random_state=1)

In [14]:
numerical =['tenure','monthlycharges', 'totalcharges']
categorical =['gender', 'seniorcitizen', 'partner', 'dependents','phoneservice', 
            'multiplelines', 'internetservice','onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport','streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
            'paymentmethod']

In [15]:
def training(df_train,y_train, C):
    dicts = df_train[categorical + numerical].to_dict(orient="records")

    dv=DictVectorizer(sparse=False)
    X_train=dv.fit_transform(dicts)

    model=LogisticRegression(C=C,max_iter=10000)
    model.fit(X_train,y_train)

    return dv,model

In [16]:
def predict(df,dv,model):
    dicts = df[categorical + numerical].to_dict(orient="records")
    X=dv.transform(dicts)

    y_pred=model.predict_proba(X)[:,1]


    return y_pred

In [17]:
C=0.01
dv, model = training(df_full_train, df_full_train.churn.values, C)
y_pred = predict(df_test, dv, model)

In [18]:
y_test=df_test.churn.values
auc=roc_auc_score(y_test,y_pred)
auc

0.8548119319228228

### save the model

In [19]:
import pickle

In [20]:
output_file=f"model_C={C}.bin"
output_file

'model_C=0.01.bin'

In [22]:
with open(output_file,"wb") as f_out:
    pickle.dump((dv,model),f_out)

### Load the model 

In [23]:
import pickle

In [24]:
input_file='model_C=0.01.bin'

In [25]:
with open(input_file,"rb") as f_in:
    dv,model=pickle.load(f_in)

In [26]:
dv,model

(DictVectorizer(sparse=False), LogisticRegression(C=0.01, max_iter=10000))

In [43]:
customer={'gender': 'female',
 'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'no',
 'phoneservice': 'no',
 'multiplelines': 'no_phone_service',
 'internetservice': 'dls',
 'onlinesecurity': 'no',
 'onlinebackup': 'yes',
 'deviceprotection': 'yes',
 'techsupport': 'no',
 'streamingtv': 'no',
 'streamingmovies': 'no',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'electronic_check',
 'tenure': 20,
 'monthlycharges': 29.85,
 'totalcharges': 29.85}

In [44]:
X_0=dv.transform([customer])
model.predict_proba(X_0)[0,1]

np.float64(0.29053376379553675)