## CHURN PREDICTION

## Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

import pickle

## Data Preparation

In [2]:
# Load the raw customer-churn table from a CSV located in the working directory.
df = pd.read_csv('telco-customer-churn.csv')

In [3]:
# Lowercase every column name so later cells can refer to them uniformly
# (e.g. `totalcharges`, `churn`).
df.columns = df.columns.str.lower()

In [4]:
# Names of all object-dtype (string) columns, in frame order. This is a dtype
# filter, not the model's feature list: it also picks up the id column and
# the target.
is_object = (df.dtypes == 'object').to_numpy()
cat_cols = df.columns[is_object].tolist()
cat_cols

['customerid',
 'gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'totalcharges',
 'churn']

In [5]:
# Normalize the values of every string column: lowercase first, then replace
# spaces with underscores.
for c in cat_cols:
    lowered = df[c].str.lower()
    df[c] = lowered.str.replace(' ', '_')

In [6]:
# `totalcharges` was read as an object column, so it holds some non-numeric
# text; errors='coerce' turns anything unparseable into NaN.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

In [7]:
# Replace missing totals (the NaNs left by the numeric coercion) with 0.
# NOTE(review): confirm 0 is the intended fill value for those rows.
df['totalcharges'] = df['totalcharges'].fillna(0)

In [8]:
# Encode the target as 1 where the (already lowercased) value is 'yes', else 0.
# Guarded so that re-running this cell is a no-op: once the column is
# integer-coded, comparing it to 'yes' again would silently set every label to 0.
if not pd.api.types.is_integer_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)

In [9]:
# Numeric input columns used by train() and predict().
num_features = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

In [10]:
# Categorical input columns used by train() and predict().
# NOTE(review): `seniorcitizen` is not among the object-dtype columns listed
# earlier, so it is presumably stored as a number rather than a string —
# verify that this is how it should reach the vectorizer.
cat_features = [
    'gender', 'seniorcitizen', 'partner', 'dependents',
    'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity',
    'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
    'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod',
]

## Model Training

In [11]:
# Hold out 20% of the rows as the final test set; the fixed seed keeps the
# split reproducible across runs.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [12]:
# Split the non-test rows 75/25 into train and validation frames.
# NOTE(review): both names are reassigned inside the K-fold loop further down
# before they are ever read, so this split appears to be unused — confirm
# whether the cell can be dropped.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [13]:
# Target vectors as NumPy arrays. The frames keep their `churn` column;
# train() and predict() select only the feature columns, so the label is not
# passed to the model as an input.
y_full_train = df_full_train['churn'].to_numpy()
y_test = df_test['churn'].to_numpy()

In [14]:
# Sanity check: each frame should have as many rows as its target vector.
for frame, target in ((df_full_train, y_full_train), (df_test, y_test)):
    print(frame.shape, target.shape)

(5634, 21) (5634,)
(1409, 21) (1409,)


In [15]:
# NOTE(review): this module-level vectorizer is never read — train() builds
# its own instance and the cross-validation cell rebinds `dv` — so this cell
# looks redundant; confirm before removing.
dv = DictVectorizer(sparse=False)

In [16]:
def train(df_train, y_train, C=1.0):
    """Fit a feature vectorizer and a logistic-regression model.

    Parameters
    ----------
    df_train : DataFrame containing every column named in `cat_features`
        and `num_features`; other columns are ignored.
    y_train : target labels aligned row-for-row with `df_train`.
    C : value forwarded to `LogisticRegression(C=...)`.

    Returns
    -------
    (DictVectorizer, LogisticRegression)
        The fitted vectorizer and the fitted model, in that order.
    """
    feature_columns = cat_features + num_features
    records = df_train[feature_columns].to_dict(orient='records')

    # A fresh vectorizer per call: its vocabulary is learned only from the
    # rows passed in here.
    vectorizer = DictVectorizer(sparse=False)
    features = vectorizer.fit_transform(records)

    # NOTE(review): max_iter=3500 is presumably raised so the solver converges
    # on these inputs — confirm.
    classifier = LogisticRegression(C=C, max_iter=3500).fit(features, y_train)

    return vectorizer, classifier

In [17]:
def predict(df, dv, model):
    """Score every row of `df` with an already-fitted vectorizer and model.

    Parameters
    ----------
    df : DataFrame containing every column named in `cat_features` and
        `num_features`; other columns are ignored.
    dv : fitted vectorizer exposing `transform`.
    model : fitted classifier exposing `predict_proba`.

    Returns
    -------
    1-D array holding column 1 of `predict_proba` for each row (the
    positive-class probability when the labels are 0/1).
    """
    records = df[cat_features + num_features].to_dict(orient='records')
    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]

In [18]:
# Run configuration: `version` names the saved model file; `n_splits` and `C`
# drive the cross-validation below.
version = 1.0
n_splits = 5
C = 1.0

# Shuffled K-fold with a fixed seed so the fold assignment is reproducible.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
scores = []

# enumerate() supplies the fold number, replacing a hand-maintained counter.
for fold, (train_idx, val_idx) in enumerate(kfold.split(df_full_train)):
    # split() yields positional indices, hence .iloc.
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    y_train = df_train.churn.values
    y_val = df_val.churn.values

    # Refit vectorizer and model from scratch on this fold's training part.
    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

    print(f'auc on fold {fold} is {auc}')

# Summarize as mean +- standard deviation of the per-fold AUCs.
print('validation results:')
print(f'C={C} {np.mean(scores):.3f} +- {np.std(scores):.3f}')

auc on fold 0 is 0.8349916208579917
auc on fold 1 is 0.8460807956623164
auc on fold 2 is 0.8384364181010142
auc on fold 3 is 0.848852051120482
auc on fold 4 is 0.8348773175059657
validation results:
C=1.0 0.841 +- 0.006


## Evaluation

In [19]:
# Refit on the whole non-test portion and score once on the held-out test set.
# Uses the `C` set in the cross-validation cell rather than repeating the
# literal, so the final model cannot drift from the validated setting.
dv, model = train(df_full_train, y_full_train, C=C)
y_pred = predict(df_test, dv, model)

auc = roc_auc_score(y_test, y_pred)
auc

0.8623663399132568

## Model Saving

In [20]:
# File name for the saved model, derived from `version` (set in the
# cross-validation cell); with version = 1.0 this is 'model_1.0.bin'.
output_file = f'model_{version}.bin'

In [21]:
# Persist the vectorizer and the model together as one (dv, model) tuple so
# they are always loaded as a matching pair.
with open(output_file, 'wb') as f_out:
    pickle.dump((dv,model), f_out)

In [22]:
# Load from the path that was just written, so the save/load round-trip keeps
# following `version` instead of a separately hard-coded file name.
saved_model = output_file

In [23]:
# Reload the (dv, model) tuple, rebinding both names to the unpickled objects.
# NOTE: pickle.load can execute arbitrary code — only load files produced by
# this notebook or another trusted source.
with open(saved_model, 'rb') as f_in:
    dv,model = pickle.load(f_in)

## Prediction

In [24]:
# Example customer record used to try out the reloaded model.
# String values are written already lowercased with underscores in place of
# spaces, matching the normalization applied to the string columns of the
# training data.
customer = {
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'yes',
    'dependents': 'no',
    'phoneservice': 'no',
    'multiplelines': 'no_phone_service',
    'internetservice': 'dsl',
    'onlinesecurity': 'no',
    'onlinebackup': 'yes',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'no',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    # NOTE(review): totalcharges equals monthlycharges although tenure is 5 —
    # confirm these sample values are intended.
    'tenure': 5,
    'monthlycharges': 29.85,
    'totalcharges': 29.85
}

In [25]:
def predict_cust(customer, dv, model):
    """Return the model's column-1 probability for a single customer.

    Parameters
    ----------
    customer : dict mapping feature names to values for one customer.
    dv : fitted vectorizer exposing `transform`.
    model : fitted classifier exposing `predict_proba`.

    Returns
    -------
    The scalar at row 0, column 1 of `predict_proba` (the positive-class
    probability when the labels are 0/1).
    """
    # The vectorizer works on a sequence of records, so wrap the one dict.
    features = dv.transform([customer])
    churn_probability = model.predict_proba(features)[0, 1]
    return churn_probability

In [26]:
# Score the example customer with the reloaded vectorizer/model pair and
# print the resulting probability.
prediction = predict_cust(customer, dv, model)
print(prediction)

0.6037115135903369
