In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

import pickle

## Data Preprocessing

In [2]:
url = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(url)

# Normalise the header: lower-case names, underscores instead of spaces.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same normalisation to the values of every non-numeric column.
# This runs before the numeric conversion below, so the column list is taken
# from the frame exactly as it was read.
categorical_columns = df.select_dtypes(exclude='number').columns
for c in categorical_columns:
    normalised = df[c].str.lower()
    df[c] = normalised.str.replace(' ', '_')

# Values that cannot be parsed as numbers become NaN and are then set to 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Binary target: 1 where churn is 'yes', 0 otherwise.
df['churn'] = df['churn'].eq('yes').astype(int)

# The customer identifier is not used as a feature.
df = df.drop(columns='customerid')

In [3]:
# Hold out 20% of the rows as the final test set; the fixed seed makes the split repeatable.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

## Model Training

In [4]:
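# Earlier DictVectorizer-based version of one_hot_encoding, kept for reference only.
# It is superseded by the definition in the next cell; DictVectorizer is not imported.
# def one_hot_encoding(df, dv=None):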
# 	dicts = df.to_dict(orient='records')
#
# 	if not dv:
# 		dv = DictVectorizer(sparse=False, dtype=np.int32)
# 		dv.fit(dicts)
#
# 	X = dv.transform(dicts)
#
# 	return X, dv

In [5]:
def one_hot_encoding(df, dv=None):
	"""One-hot encode the non-numeric columns of ``df`` and append the numeric ones.

	Parameters
	----------
	df : pandas.DataFrame
		Feature frame. It is left untouched; its index is ignored.
	dv : encoder, optional
		Already-fitted object exposing ``transform``. When ``None`` a new
		``OneHotEncoder`` is fitted on the non-numeric columns of ``df``.

	Returns
	-------
	X : numpy.ndarray
		The encoded non-numeric columns followed by the numeric columns.
	dv : encoder
		The encoder that was used (the one passed in, or the newly fitted one).
	"""
	df_categorical = df.select_dtypes(exclude='number')
	# Re-index a copy rather than the caller's frame: the encoded block below
	# gets a fresh 0..n-1 index and concat(axis=1) aligns rows on the index.
	df_numerical = df.select_dtypes('number').reset_index(drop=True)

	if dv is None:
		# handle_unknown='ignore' encodes a category unseen during fit as all
		# zeros instead of raising when a later frame is transformed.
		dv = OneHotEncoder(sparse_output=False, dtype=np.int32, handle_unknown='ignore')
		dv.fit(df_categorical)

	# Column labels are irrelevant here: ignore_index=True renumbers them anyway.
	df_encoded = pd.DataFrame(dv.transform(df_categorical))

	X = pd.concat([df_encoded, df_numerical], ignore_index=True, axis=1)

	return X.values, dv

In [6]:
def train(df, y, C=1.0):
	"""Fit an encoder and a logistic regression on the given features and target.

	``df`` is encoded with ``one_hot_encoding`` (which fits a new encoder),
	then a ``LogisticRegression`` with the given ``C`` and ``max_iter=10000``
	is fitted on the encoded matrix and ``y``.

	Returns the tuple ``(encoder, model)``.
	"""
	features, encoder = one_hot_encoding(df)

	classifier = LogisticRegression(C=C, max_iter=10000)
	classifier.fit(features, y)

	return encoder, classifier

In [7]:
def predict(df, dv, model):
	"""Return column 1 of ``model.predict_proba`` for every row of ``df``.

	``df`` is encoded with the already-fitted encoder ``dv`` before scoring.
	"""
	features = one_hot_encoding(df, dv)[0]

	# column 1 holds the probability of the second class in model.classes_
	probabilities = model.predict_proba(features)
	return probabilities[:, 1]

In [8]:
n_splits = 5  # number of folds handed to KFold in the cross-validation cell
C = 1.0  # value forwarded to LogisticRegression's C parameter through train()

In [9]:
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []

for train_idx, val_idx in kfold.split(df_full_train):
	# carve this fold's training and validation rows out of the full training set
	df_train, df_val = df_full_train.iloc[train_idx], df_full_train.iloc[val_idx]

	# pull out the target, then remove it from the feature frames
	y_train, y_val = df_train.churn.values, df_val.churn.values
	df_train, df_val = df_train.drop(columns='churn'), df_val.drop(columns='churn')

	# fit on the training part and score the held-out part
	dv, model = train(df_train, y_train, C=C)
	y_pred = predict(df_val, dv, model)

	auc = roc_auc_score(y_val, y_pred)
	scores.append(auc)

# mean and standard deviation of the per-fold AUC
print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

C=1.0 0.842 +- 0.007


In [10]:
# per-fold validation AUC values collected by the cross-validation loop
scores

[0.8443532117653525,
 0.8451343365117203,
 0.8332811559301626,
 0.8347529097653001,
 0.8518363088701326]

## Model Validation

In [11]:
# Extract the targets first, then strip the target column from both frames.
# NOTE: this rebinds df_full_train and df_test, so the cell cannot be re-run
# without re-running the split that created them.
y_full_train = df_full_train['churn'].values
y_test = df_test['churn'].values

df_full_train = df_full_train.drop(columns='churn')
df_test = df_test.drop(columns='churn')

In [12]:
# Train the final model with the configured C (the same value that was
# cross-validated and that is embedded in the saved file name), not a
# separately hard-coded constant that could drift from it.
dv, model = train(df_full_train, y_full_train, C=C)

In [13]:
# Score the held-out test set with the encoder and model fitted on the full training data.
y_pred = predict(df_test, dv, model)
auc = roc_auc_score(y_test, y_pred)
auc

0.8584898761740714

## Model Saving

In [14]:
# The value of C is embedded in the artefact's file name.
output_file = f'model_C={C}.bin'

In [15]:
# Persist the encoder and the model together as one tuple so they are restored as a pair.
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)

In [16]:
# Read back the file that was just written; reusing output_file keeps the
# name in step with C instead of hard-coding 'model_C=1.0.bin'.
input_file = output_file

In [17]:
# NOTE: pickle.load can execute arbitrary code while deserialising;
# only load files that come from a trusted source.
with open(input_file, 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [21]:
# Encode one test row with the restored encoder; the double brackets in
# iloc[[170]] keep the selection a one-row DataFrame rather than a Series.
# Earlier variant, kept for reference:
# X = dv.transform(df_test.iloc[[170]])
X, _ = one_hot_encoding(df_test.iloc[[170]], dv)

In [22]:
# Row 0 is the first row of X; column 1 is the second class in model.classes_.
y_pred = model.predict_proba(X)[0, 1]

In [23]:
# Show the probability computed for the single encoded row.
print('prediction probabilities:', y_pred)

prediction probabilities: 0.7102842349018084
