In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

import pickle

ERROR:tornado.general:Uncaught exception in ZMQStream callback
Traceback (most recent call last):
  File "/Users/bastienwinant/Desktop/projects/ml-zoomcamp/.venv/lib/python3.9/site-packages/traitlets/traitlets.py", line 632, in get
    value = obj._trait_values[self.name]
KeyError: '_control_lock'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/bastienwinant/Desktop/projects/ml-zoomcamp/.venv/lib/python3.9/site-packages/zmq/eventloop/zmqstream.py", line 565, in _log_error
    f.result()
  File "/Users/bastienwinant/Desktop/projects/ml-zoomcamp/.venv/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 301, in dispatch_control
    async with self._control_lock:
  File "/Users/bastienwinant/Desktop/projects/ml-zoomcamp/.venv/lib/python3.9/site-packages/traitlets/traitlets.py", line 687, in __get__
    return t.cast(G, self.get(obj, cls))  # the G should encode the Optional
  File "/Users/bastienwinant/Desktop/p

## Data Preprocessing

In [2]:
url = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(url)

df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = df.select_dtypes(exclude='number').columns

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')
df.totalcharges = df.totalcharges.fillna(0)

df.churn = (df.churn == 'yes').astype(int)

In [3]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [4]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']

categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

## Model Training

In [5]:
def one_hot_encoding(df, enc=None):
	df.reset_index(drop=True, inplace=True)

	# separate numerical and categorical features
	df_numerical = df.select_dtypes('number')
	df_categorical = df.select_dtypes(exclude='number')

	# fit a new encoder on the categorical data
	if not enc:
		enc = OneHotEncoder(dtype=np.int64, sparse_output=False, handle_unknown='ignore')
		enc.fit(df_categorical)

	df_encoded = pd.DataFrame(
		data=enc.transform(df_categorical),
		columns=enc.get_feature_names_out())

	# combine encodings and numerical features
	X = pd.concat([df_numerical, df_encoded], axis=1)

	return X.values, enc

In [6]:
def train(df, y, C=1.0):
	# encode the data
	X, enc = one_hot_encoding(df)

	# train a model
	model = LogisticRegression(C=C, max_iter=10000)
	model.fit(X, y)

	# return encoder and model
	return enc, model

In [7]:
def predict(df, enc, model):
	# encode the data
	X, _ = one_hot_encoding(df, enc)

	# generate prediction probabilities
	y_pred = model.predict_proba(X)[:, 1]

	return y_pred

In [8]:
n_splits = 5
C = 1.0

In [9]:
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []

for train_idx, val_idx in kfold.split(df_full_train):
	df_train = df_full_train.iloc[train_idx]
	df_val = df_full_train.iloc[val_idx]

	y_train = df_train.churn.values
	y_val = df_val.churn.values

	df_train = df_train.drop('churn', axis=1)
	df_val = df_val.drop('churn', axis=1)

	enc, model = train(df_train, y_train, C=C)
	y_pred = predict(df_val, enc, model)

	auc = roc_auc_score(y_val, y_pred)
	scores.append(auc)

print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

C=1.0 0.842 +- 0.007


In [10]:
scores

[0.8446711306136225,
 0.8451136438882266,
 0.833241019466185,
 0.8344451461277739,
 0.851142104107182]

## Model Validation

In [11]:
y_full_train = df_full_train.churn.values
y_test = df_test.churn.values

df_full_train = df_full_train.drop('churn', axis=1)
df_test = df_test.drop('churn', axis=1)

In [12]:
enc, model = train(df_full_train, y_full_train, C=1.0)

In [13]:
y_pred = predict(df_test, enc, model)
auc = roc_auc_score(y_test, y_pred)
auc

0.85836258355271

## Model Saving

In [14]:
output_file = f'model_C={C}.bin'

In [15]:
with open(output_file, 'wb') as f_out:
    pickle.dump((enc, model), f_out)

In [16]:
input_file = 'model_C=1.0.bin'

In [17]:
with open(input_file, 'rb') as f_in:
    enc, model = pickle.load(f_in)

In [18]:
customer = {
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'yes',
    'dependents': 'no',
    'phoneservice': 'no',
    'multiplelines': 'no_phone_service',
    'internetservice': 'dsl',
    'onlinesecurity': 'no',
    'onlinebackup': 'yes',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'no',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 29.85,
    'totalcharges': 29.85
}

In [31]:
X, _ = one_hot_encoding(df_test.iloc[[40]], enc)

In [32]:
y_pred = model.predict_proba(X)[0, 1]

In [33]:
print('input:', customer)
print('output:', y_pred)

input: {'gender': 'female', 'seniorcitizen': 0, 'partner': 'yes', 'dependents': 'no', 'phoneservice': 'no', 'multiplelines': 'no_phone_service', 'internetservice': 'dsl', 'onlinesecurity': 'no', 'onlinebackup': 'yes', 'deviceprotection': 'no', 'techsupport': 'no', 'streamingtv': 'no', 'streamingmovies': 'no', 'contract': 'month-to-month', 'paperlessbilling': 'yes', 'paymentmethod': 'electronic_check', 'tenure': 1, 'monthlycharges': 29.85, 'totalcharges': 29.85}
output: 0.0036756198085913534
