In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Collecting kaggle (from opendatasets)
  Downloading kaggle-1.5.16.tar.gz (83 kB)
     ---------------------------------------- 0.0/83.6 kB ? eta -:--:--
     -------------- ------------------------- 30.7/83.6 kB 1.3 MB/s eta 0:00:01
     -------------------------------------  81.9/83.6 kB 762.6 kB/s eta 0:00:01
     -------------------------------------- 83.6/83.6 kB 520.2 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py): started
  Building wheel for kaggle (setup.py): finished with status 'done'
  Created wheel for kaggle: filename=kaggle-1.5.16-py3-none-any.whl size=110697 sha256=cb5d6b70eca55a1a785ae59b476cb48cc3cfd532a9f4f66dfa53d7235f1a84d8
  Stored in directory: c:\users\work\appdata\local\pip\cache\wheels\6a\2b\d0\457dd27de499e9423caf738e743c4a

In [3]:
import opendatasets as od

In [4]:
# Fetch the Telco customer-churn dataset from Kaggle; per the recorded output
# the archive is saved under ./telco-customer-churn.
# NOTE(review): presumably needs Kaggle API credentials on first use -- confirm.
od.download('https://www.kaggle.com/datasets/blastchar/telco-customer-churn')

Downloading telco-customer-churn.zip to .\telco-customer-churn


100%|████████████████████████████████████████████████████████████████████████████████| 172k/172k [00:00<00:00, 486kB/s]







In [10]:
# Load the downloaded CSV into the working DataFrame `df`.
df = pd.read_csv('telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [11]:
# Normalise the header: lower-case names, underscores instead of spaces.
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

# Columns whose dtype is the generic `object` are the ones treated as text.
categorical_columns = [
    column for column, dtype in df.dtypes.items() if dtype == 'object'
]




In [12]:
# Display the column names after normalisation.
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [13]:
# Give the text values the same shape as the header:
# lower-case, with underscores in place of spaces.
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

# Entries that cannot be parsed as numbers become NaN and are then set to 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Binary target: 1 where churn equals 'yes', 0 otherwise.
df['churn'] = df['churn'].eq('yes').astype(int)


In [14]:
# Hold out 20% of the rows as the test set; random_state=1 fixes the shuffle.
df_full_train, df_test = train_test_split(df,  test_size=0.2, random_state=1)

In [21]:
# Display the available column names.
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [22]:
# Numeric feature columns.
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

In [23]:
# Feature columns handled as categories.
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

In [24]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic-regression model on the training data.

    The feature columns are taken from the notebook-level ``categorical`` and
    ``numerical`` lists.

    Parameters
    ----------
    df_train : frame holding the feature columns.
    y_train : target values handed to ``LogisticRegression.fit``.
    C : value forwarded to ``LogisticRegression(C=...)``.

    Returns
    -------
    tuple
        ``(vectorizer, model)`` -- the fitted DictVectorizer and the fitted
        LogisticRegression.
    """
    feature_columns = categorical + numerical
    records = df_train[feature_columns].to_dict(orient='records')

    vectorizer = DictVectorizer(sparse=False)
    features = vectorizer.fit_transform(records)

    classifier = LogisticRegression(C=C, max_iter=1000)
    classifier.fit(features, y_train)

    return vectorizer, classifier


In [25]:
def predict(df, dv, model):
    """Score every row of ``df`` with a fitted vectorizer/model pair.

    The feature columns are taken from the notebook-level ``categorical`` and
    ``numerical`` lists.

    Parameters
    ----------
    df : frame holding the feature columns.
    dv : fitted vectorizer; only its ``transform`` method is used.
    model : fitted classifier; only its ``predict_proba`` method is used.

    Returns
    -------
    The second column (index 1) of ``model.predict_proba``.
    """
    records = df[categorical + numerical].to_dict(orient='records')
    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]



In [26]:
# Cross-validation settings.
n_splits = 5  # number of folds handed to KFold
C = 1.0       # value passed as the `C` argument of train()

In [27]:
# K-fold cross-validation over the full training portion.
scores = []
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

for train_idx, val_idx in kfold.split(df_full_train):
    # Positional split of the frame into this fold's train / validation parts.
    df_train, df_val = df_full_train.iloc[train_idx], df_full_train.iloc[val_idx]
    y_train, y_val = df_train.churn.values, df_val.churn.values

    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

    # Printed inside the loop, so each line shows the running mean / std of
    # the folds evaluated so far; the last line is the overall result.
    print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

C=1.0 0.842 +- 0.000
C=1.0 0.844 +- 0.002
C=1.0 0.840 +- 0.005
C=1.0 0.838 +- 0.006
C=1.0 0.841 +- 0.008


In [28]:
# Per-fold validation AUC values collected by the cross-validation loop.
scores

[0.8420453564223532,
 0.8455854357038802,
 0.8331848284166163,
 0.8301724275756219,
 0.8522402811333187]

In [30]:
# Retrain on the complete training portion and score once on the held-out test set.
# Pass the configured C instead of a literal so the fitted model always matches
# the value used in cross-validation and embedded in the saved file name.
dv, model = train(df_full_train, df_full_train.churn.values, C=C)
y_pred = predict(df_test, dv, model)

y_test = df_test.churn.values
auc = roc_auc_score(y_test, y_pred)
auc


0.8572386167896259

Save the model

In [31]:
import pickle

In [33]:
# File name for the saved model; it embeds the current value of C.
output_file = f'model_C={C}.bin'

output_file

'model_C=1.0.bin'

In [36]:
# Serialise the (vectorizer, model) pair to disk with manual file handling.
# try/finally guarantees the handle is closed even if pickling raises.
f_out = open(output_file, 'wb')
try:
    pickle.dump((dv, model), f_out)
finally:
    f_out.close()

In [37]:
# Write the (dv, model) pair to output_file using a context manager.
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)
    # f_out is still open here, inside the with-block
    
# f_out has been closed by the with statement once the block is left