In [65]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold


from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

import pickle

In [66]:
churn = pd.read_csv("customer-churn.csv")
churn.columns = churn.columns.str.lower().str.replace(" ", "_")

categorical_columns = churn.dtypes[churn.dtypes == "O"].index

for c in categorical_columns:
    churn[c] = churn[c].str.lower().str.replace(" ", "_")

churn.totalcharges = pd.to_numeric(churn.totalcharges, errors="coerce")
churn["totalcharges"] = churn.totalcharges.fillna(0)

churn.churn = churn.churn.apply(lambda churn: churn == "yes" ).astype(int)

df_full_train, df_test =  train_test_split(churn, test_size=0.2, random_state=1)

In [67]:
df_train, df_val =  train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index()
df_test = df_test.reset_index()
df_val = df_val.reset_index()

y_train = df_train.churn
y_val = df_val.churn
y_test = df_test.churn

del df_train["churn"]
del df_val["churn"]
del df_test["churn"]

numerical = ["tenure", "monthlycharges", "totalcharges"]
categorical = [
    'onlinesecurity', 'deviceprotection', 'gender', 
    'partner', 'paymentmethod', 'paperlessbilling', 
    'seniorcitizen', 'techsupport', 'phoneservice', 
    'dependents', 'onlinebackup', 'contract', 
    'multiplelines', 'streamingmovies', 
    'internetservice', 'streamingtv'
]


In [68]:
def train_model(df_train, y_train, c=0.001):
    train_dicts = df_train[categorical + numerical].to_dict(orient="records")
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)

    model = LogisticRegression(max_iter=1000, C=c, solver="liblinear")
    model.fit(X_train, y_train)
    
    return dv, model

def predict(df, dv, model):
    df_dicts = df[categorical + numerical].to_dict(orient="records")

    X = dv.fit_transform(df_dicts)

    y_pred = model.predict_proba(X)[:,1]

    return y_pred



In [69]:
C=1.0
n_splits=5

In [70]:
scores = []

kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

for train_idx, val_idx in kfold.split(df_full_train):
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]
    
    dv, model = train_model(df_train, df_train.churn, c=C)
    
    y_pred = predict(df_val, dv, model)

    score = roc_auc_score(df_val.churn, y_pred)
    
    scores.append(score)

print(f"C={C}: {np.mean(scores):.2f} +/- {np.std(scores):.2f}")

C=1.0: 0.84 +/- 0.01


In [71]:
dv, model = train_model(df_full_train, df_full_train.churn, c=100)
y_pred = predict(df_test, dv, model)
score = roc_auc_score(y_test, y_pred)

## saving the model

In [72]:
output_file = f"model_C={C}.bin"
with open(output_file, "wb") as f_out:
    pickle.dump((dv,model), f_out)

## reading a model

In [73]:
sample_data = df_full_train.sample()[categorical + numerical].to_dict(orient="records")

In [74]:
sample_data

[{'onlinesecurity': 'no',
  'deviceprotection': 'yes',
  'gender': 'female',
  'partner': 'no',
  'paymentmethod': 'mailed_check',
  'paperlessbilling': 'yes',
  'seniorcitizen': 0,
  'techsupport': 'no',
  'phoneservice': 'yes',
  'dependents': 'no',
  'onlinebackup': 'yes',
  'contract': 'one_year',
  'multiplelines': 'no',
  'streamingmovies': 'no',
  'internetservice': 'dsl',
  'streamingtv': 'no',
  'tenure': 26,
  'monthlycharges': 56.05,
  'totalcharges': 1553.2}]

In [75]:
input_file = f"model_C={C}.bin"
with open(input_file, "rb") as f_in:
    dv, model = pickle.load(f_in)

In [76]:
X = dv.transform(sample_data)
X

array([[0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00,
        1.0000e+00, 0.0000e+00, 0.0000e+00, 5.6050e+01, 1.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
        1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
        1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        1.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
        0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
        1.0000e+00, 0.0000e+00, 0.0000e+00, 2.6000e+01, 1.5532e+03]])

In [77]:
model.predict_proba(X)[0,1]

0.09352991114422352

In [78]:
import requests

In [79]:
url = "http://localhost:9696/predict"

In [80]:
customer = sample_data[0]
customer

{'onlinesecurity': 'no',
 'deviceprotection': 'yes',
 'gender': 'female',
 'partner': 'no',
 'paymentmethod': 'mailed_check',
 'paperlessbilling': 'yes',
 'seniorcitizen': 0,
 'techsupport': 'no',
 'phoneservice': 'yes',
 'dependents': 'no',
 'onlinebackup': 'yes',
 'contract': 'one_year',
 'multiplelines': 'no',
 'streamingmovies': 'no',
 'internetservice': 'dsl',
 'streamingtv': 'no',
 'tenure': 26,
 'monthlycharges': 56.05,
 'totalcharges': 1553.2}

In [81]:
requests.post(url=url, json=customer).json()

{'churn': False, 'churn_probability': 0.09352991114422352}