In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Collecting kaggle (from opendatasets)
  Downloading kaggle-1.5.16.tar.gz (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.6/83.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting click (from opendatasets)
  Obtaining dependency information for click from https://files.pythonhosted.org/packages/00/2e/d53fa4befbf2cfa713304affc7ca780ce4fc1fd8710527771b58311a3229/click-8.1.7-py3-none-any.whl.metadata
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting python-slugify (from kaggle->opendatasets)
  Using cached python_slugify-8.0.1-py2.py3-none-any.whl (9.7 kB)
Collecting text-unidecode>=1.3 (from python-slugify->kaggle->opendatasets)
  Using cached text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
Downloading click-8.1.7-py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import opendatasets as od

In [4]:
od.download('https://www.kaggle.com/datasets/blastchar/telco-customer-churn')

Skipping, found downloaded files in "./telco-customer-churn" (use force=True to force download)


In [5]:
df = pd.read_csv('telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [6]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)




In [7]:
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [8]:
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')
    
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')
df.totalcharges = df.totalcharges.fillna(0)

df.churn = (df.churn == 'yes').astype(int)


In [9]:
df_full_train, df_test = train_test_split(df,  test_size=0.2, random_state=1)

In [10]:
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [11]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [12]:
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']

In [13]:
def train (df_train, y_train, C=1.0):
    dicts = df_train[categorical + numerical].to_dict(orient='records')
    
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)
    
    model = LogisticRegression(C = C, max_iter=1000)
    model.fit(X_train, y_train)
    
    return dv, model


In [14]:
def predict(df, dv,model):
    dicts = df[categorical+numerical].to_dict(orient = 'records')
    
    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:,1]
    
    return y_pred



In [15]:
C = 1.0
n_splits = 5

In [16]:
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    df_train =df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]
    
    y_train = df_train.churn.values
    y_val = df_val.churn.values
    
    dv,model = train(df_train, y_train, C=C)
    y_pred =predict(df_val, dv, model)
    
    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)
    
    print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

C=1.0 0.844 +- 0.000
C=1.0 0.844 +- 0.001
C=1.0 0.841 +- 0.006
C=1.0 0.838 +- 0.007
C=1.0 0.841 +- 0.008


In [17]:
scores

[0.8438508214866044,
 0.8450763971659383,
 0.8327513546056594,
 0.8301724275756219,
 0.8521461516739357]

In [18]:
dv, model = train(df_full_train, df_full_train.churn.values, C=1.0)
y_pred = predict(df_test, dv, model)


y_test = df_test.churn.values
auc= roc_auc_score(y_test, y_pred)
auc


0.8572386167896259

Save the model

In [19]:
dv

In [21]:
numerical+categorical

['tenure',
 'monthlycharges',
 'totalcharges',
 'gender',
 'seniorcitizen',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']

In [22]:
customer = {'customer_id': '8879-zkjof','tenure': 41,
 'monthlycharges': 79.85,
 'totalcharges': 3320.75,
 'gender': 'female',
 'seniorcitizen':0,
 'partner': 'no',
 'dependents': 'no',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'dsl',
 'onlinesecurity': 'yes',
 'onlinebackup': 'no',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'one_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'bank_transfer_(automatic)'}

In [23]:
df = pd.DataFrame([customer])

In [28]:
y_pred = predict(df,dv, model)

In [30]:
y_pred[0]

0.0622429554128498

In [31]:
def predict_single(customer, dv,model):
    X = dv.transform([customer])
    y_pred = model.predict_proba(X)[:,1]
    return y_pred[0]
    

In [32]:
predict_single(customer, dv,model)

0.0622429554128498

In [33]:
import pickle

In [34]:
output_file = f'model_C={C}.bin'

output_file

'model_C=1.0.bin'

In [35]:
f_out = open(output_file, 'wb')

pickle.dump((dv,model), f_out)

f_out.close()

In [36]:
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)
    #do stuff
    
#do stuffff

In [37]:
model_file = 'model_C=1.0.bin'

In [38]:
with open(model_file, 'rb') as f_in:
    (dv,model) = pickle.load(f_in)
    #do stuff
    
#do stuffff