In [55]:
import pickle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mutual_info_score, roc_auc_score
from sklearn.model_selection import KFold, train_test_split

In [56]:
# Load the raw Telco customer-churn data and take a first look at it.
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.info()
# Only the last bare expression of a cell is rendered, so the intermediate
# value counts have to go through display() to show up at all.
display(df.SeniorCitizen.value_counts())
df.Partner.value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


No     3641
Yes    3402
Name: Partner, dtype: int64

In [57]:
# Parse TotalCharges as numbers; values that cannot be parsed become NaN.
total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")
# Show the customers whose TotalCharges could not be parsed.
df.loc[total_charges.isnull(), ["customerID", "TotalCharges"]]

Unnamed: 0,customerID,TotalCharges
488,4472-LVYGI,
753,3115-CZMZD,
936,5709-LVOEQ,
1082,4367-NUYAO,
1340,1371-DWPAZ,
3331,7644-OMVMY,
3826,3213-VVOLG,
4380,2520-SGTTA,
5218,2923-ARZLG,
6670,4075-WKNIU,


In [58]:
# errors="coerce" converts values that are not valid numbers (such as blank
# strings) to NaN; those NaNs are then replaced with 0.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0)

In [59]:
# Normalise column names: lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(" ", "_")
# Names of the columns stored with the generic "object" dtype (text columns).
string_columns = [name for name, dtype in df.dtypes.items() if dtype == "object"]

In [60]:
# Apply the same normalisation to the values of every text column.
for col in string_columns:
    df[col] = (
        df[col]
        .str.lower()
        .str.replace(" ", "_")
    )

In [61]:
# Encode the target as 1 (churned) / 0 (did not churn).
# Caution: this cell is not idempotent. Once churn holds integers, running it
# again compares them with "yes" and turns every value into 0.
df["churn"] = df["churn"].eq("yes").astype(int)

`df` is the full dataset; it is split into `df_train_full` (80%) and `df_test` (20%).

In [62]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

The full training set is then split into a training set (`df_train`) and a validation set (`df_val`).

In [63]:
# Carve 33% of the full training set out as the validation set.
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state = 11)

Exploratory Data Analysis

In [64]:
# Missing values per column. display() is required because only the last bare
# expression of a cell is rendered.
display(df_train_full.isnull().sum())
# Class balance of the target.
df_train_full.churn.value_counts()

0    4113
1    1521
Name: churn, dtype: int64

In [65]:
# Overall churn rate of the full training set, rounded to three decimals.
# The diff/risk computations below use this rounded value.
global_mean = round(df_train_full.churn.mean(), 3)

In [66]:
# Columns treated as categorical features.
categorical = [
    "gender",
    "seniorcitizen",
    "partner",
    "dependents",
    "phoneservice",
    "multiplelines",
    "internetservice",
    "onlinesecurity",
    "onlinebackup",
    "deviceprotection",
    "techsupport",
    "streamingtv",
    "streamingmovies",
    "contract",
    "paperlessbilling",
    "paymentmethod",
]

# Columns treated as numerical features.
numerical = ["tenure", "monthlycharges", "totalcharges"]
# Number of distinct values of each categorical column.
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [67]:
# Churn rate within each gender: churned customers of the group / all customers of the group.
female_mean = df_train_full.loc[df_train_full["gender"] == "female", "churn"].mean()
male_mean = df_train_full.loc[df_train_full["gender"] == "male", "churn"].mean()

In [68]:
# Churn rate of customers with and without a partner.
partner_yes = df_train_full.loc[df_train_full["partner"] == "yes", "churn"].mean()
partner_no = df_train_full.loc[df_train_full["partner"] == "no", "churn"].mean()

In [69]:
# Per-gender churn rate, its difference from the global rate ("diff") and its
# ratio to the global rate ("risk").
df_group = (
    df_train_full.groupby("gender")["churn"]
    .agg(["mean"])
    .assign(
        diff=lambda g: g["mean"] - global_mean,
        risk=lambda g: g["mean"] / global_mean,
    )
)

Risk close to 1: this group has about the same churn rate as the global average, so it is not especially risky. <br>
Risk of 0.5: this group is half as likely to churn as the global average.<br>
Risk above 1: this group churns more than the global average, i.e. it is more likely to churn.

In [70]:
# Print the churn rate / diff / risk table for every categorical feature.
for col in categorical:
    df_group = (
        df_train_full.groupby(col)["churn"]
        .agg(["mean"])
        .assign(
            diff=lambda g: g["mean"] - global_mean,
            risk=lambda g: g["mean"] / global_mean,
        )
    )
    print(df_group)

            mean      diff      risk
gender                              
female  0.276824  0.006824  1.025274
male    0.263214 -0.006786  0.974865
                   mean      diff      risk
seniorcitizen                              
0              0.242270 -0.027730  0.897297
1              0.413377  0.143377  1.531027
             mean      diff      risk
partner                              
no       0.329809  0.059809  1.221515
yes      0.205033 -0.064967  0.759383
                mean      diff      risk
dependents                              
no          0.313760  0.043760  1.162074
yes         0.165666 -0.104334  0.613579
                  mean      diff      risk
phoneservice                              
no            0.241316 -0.028684  0.893764
yes           0.273049  0.003049  1.011292
                      mean      diff      risk
multiplelines                                 
no                0.257407 -0.012593  0.953361
no_phone_service  0.241316 -0.028684  0.893764


In [71]:
def calculate_mi(series):
    """Return the mutual information between ``series`` and the churn target.

    The target is read from the notebook-level ``df_train_full`` frame, so
    ``series`` is expected to be one of its columns.
    """
    target = df_train_full["churn"]
    return mutual_info_score(series, target)

Higher mutual information means a higher degree of dependence between the feature and churn.

In [72]:
# Mutual information of every categorical feature with churn, highest first,
# as a one-column frame named "MI".
df_mi = (
    df_train_full[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name="MI")
)

For numerical features, the correlation with churn is used instead of mutual information.

In [73]:
# Correlation of each numerical feature with the churn target.
df_train_full[numerical].corrwith(df_train_full.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

In [74]:
# orient="records" produces a list with one {column: value} dict per row.
# Only the feature columns are selected; the target column (churn) is left out.
train_dict = df_train[categorical + numerical].to_dict(orient="records")

In [75]:
# Learn the feature mapping from the training records and turn them into a
# dense feature matrix in one step.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
# Names of the resulting matrix columns.
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

In [76]:
# Target values of the training rows.
y_train = df_train["churn"].values

A logistic regression model is similar to a linear regression model, but whereas linear regression predicts a numeric value (e.g. the price of a car), here the target is binary (churn = 1, no churn = 0).<br>
The sigmoid function maps any real value to a number between zero and one.

In [77]:
# Fit the logistic regression model on the training features and targets.
model = LogisticRegression(solver="liblinear", random_state=1)
model.fit(X_train, y_train)

Check how well the model works using the validation data set.

In [78]:
# Validation targets.
y_val = df_val["churn"].values
# Encode the validation rows with the vectorizer fitted on the training data.
val_dict = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dict)

In [79]:
# predict_proba returns one column per class; column 1 is the probability of churn (class 1).
y_pred = model.predict_proba(X_val)[:, 1]

In [80]:
# Hard decision: predict churn when the probability is at least 0.5.
churn = y_pred >= 0.5
# Accuracy: share of validation customers whose predicted label matches the truth.
np.mean(y_val == churn)

0.8016129032258065

In [81]:
# Bias term of the fitted model. display() is required because only the last
# bare expression of a cell is rendered.
display(model.intercept_[0])
# One weight per feature column of the training matrix.
model.coef_[0]

array([ 5.63352013e-01, -8.59109588e-02, -5.99429690e-01, -3.02784218e-02,
       -9.17102141e-02,  9.99297896e-02, -1.15869912e-01, -1.06048514e-01,
       -2.73675450e-02, -9.46210909e-02, -3.23345457e-01,  3.17226733e-01,
       -1.15869912e-01,  7.84207767e-04, -1.68099103e-01,  1.27132808e-01,
       -8.10223411e-02,  1.35702678e-01, -1.15869912e-01, -1.41821402e-01,
        2.57855836e-01, -1.15869912e-01, -2.63974560e-01, -2.12616482e-01,
        9.06278461e-02, -4.80213556e-02, -7.39672803e-02, -2.66756128e-02,
       -1.36243725e-01,  1.74743449e-01, -1.33812747e-01,  1.27132808e-01,
       -2.49121444e-01,  2.97088422e-01, -8.48576290e-02, -1.15869912e-01,
        7.87389051e-02, -9.90736625e-02, -1.15869912e-01,  9.29549385e-02,
        1.78137960e-01, -1.15869912e-01, -1.84256684e-01, -6.94867353e-02,
        4.47688438e-04])

In [82]:
# Pair every feature name with its weight (rounded to three decimals).
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'contract=month-to-month': 0.563,
 'contract=one_year': -0.086,
 'contract=two_year': -0.599,
 'dependents=no': -0.03,
 'dependents=yes': -0.092,
 'deviceprotection=no': 0.1,
 'deviceprotection=no_internet_service': -0.116,
 'deviceprotection=yes': -0.106,
 'gender=female': -0.027,
 'gender=male': -0.095,
 'internetservice=dsl': -0.323,
 'internetservice=fiber_optic': 0.317,
 'internetservice=no': -0.116,
 'monthlycharges': 0.001,
 'multiplelines=no': -0.168,
 'multiplelines=no_phone_service': 0.127,
 'multiplelines=yes': -0.081,
 'onlinebackup=no': 0.136,
 'onlinebackup=no_internet_service': -0.116,
 'onlinebackup=yes': -0.142,
 'onlinesecurity=no': 0.258,
 'onlinesecurity=no_internet_service': -0.116,
 'onlinesecurity=yes': -0.264,
 'paperlessbilling=no': -0.213,
 'paperlessbilling=yes': 0.091,
 'partner=no': -0.048,
 'partner=yes': -0.074,
 'paymentmethod=bank_transfer_(automatic)': -0.027,
 'paymentmethod=credit_card_(automatic)': -0.136,
 'paymentmethod=electronic_check': 0.175,


In [83]:
# A deliberately small feature set, used to train a model that is easy to inspect.
small_subset = ["contract", "tenure", "totalcharges"]
train_dict_small = df_train[small_subset].to_dict(orient="records")
# Fit the vectorizer and encode the training records in one step.
dv_small = DictVectorizer(sparse=False)
X_small_train = dv_small.fit_transform(train_dict_small)

In [84]:
# Feature names of the small model. display() is required because only the
# last bare expression of a cell is rendered.
display(dv_small.get_feature_names_out())
model_small = LogisticRegression(solver="liblinear", random_state=1)
# Same rows as the full model, only fewer columns, so the same y_train applies.
model_small.fit(X_small_train, y_train)

Validation set for the small model.

In [85]:
# Validation targets.
y_val = df_val["churn"].values
# Encode the validation rows with the small model's vectorizer.
val_dict_small = df_val[small_subset].to_dict(orient="records")
X_val_small = dv_small.transform(val_dict_small)

In [86]:
# Column 1 of predict_proba is the probability of churn (class 1).
y_pred_small = model_small.predict_proba(X_val_small)[:, 1]
# Hard decision at a 0.5 threshold, then accuracy on the validation set.
churn_small = y_pred_small >= 0.5
np.mean(y_val == churn_small)

0.7672043010752688

In [87]:
# Bias term of the small model.
model_small.intercept_[0]

-0.577229912199359

In [88]:
# Pair every feature name of the small model with its weight (rounded to three decimals).
dict(zip(dv_small.get_feature_names_out(), model_small.coef_[0].round(3)))

{'contract=month-to-month': 0.866,
 'contract=one_year': -0.327,
 'contract=two_year': -1.117,
 'tenure': -0.094,
 'totalcharges': 0.001}

In [89]:
# Example customer record used to try out the model on a single customer.
# NOTE(review): "customerid" is not in categorical + numerical, so the
# vectorizer was not fitted on it; presumably it is ignored at transform time.
customer = {
    "customerid": "8879-zkjof",
    "gender": "female",
    "seniorcitizen": 0,
    "partner": "no",
    "dependents": "no",
    "tenure": 41,
    "phoneservice": "yes",
    "multiplelines": "no",
    "internetservice": "dsl",
    "onlinesecurity": "yes",
    "onlinebackup": "no",
    "deviceprotection": "yes",
    "techsupport": "yes",
    "streamingtv": "yes",
    "streamingmovies": "yes",
    "contract": "one_year",
    "paperlessbilling": "yes",
    "paymentmethod": "bank_transfer_(automatic)",
    "monthlycharges": 79.85,
    "totalcharges": 3320.75,
}

In [90]:
# The vectorizer expects a list of dicts, so the single customer is wrapped in a list.
X_test = dv.transform([customer])
# Probability of churn (class 1) for this customer.
model.predict_proba(X_test)[:, 1]

array([0.07332239])

In [91]:
# Second example customer; this rebinds the `customer` name used above.
customer = {
    "gender": "female",
    "seniorcitizen": 1,
    "partner": "no",
    "dependents": "no",
    "phoneservice": "yes",
    "multiplelines": "yes",
    "internetservice": "fiber_optic",
    "onlinesecurity": "no",
    "onlinebackup": "no",
    "deviceprotection": "no",
    "techsupport": "no",
    "streamingtv": "yes",
    "streamingmovies": "no",
    "contract": "month-to-month",
    "paperlessbilling": "yes",
    "paymentmethod": "electronic_check",
    "tenure": 1,
    "monthlycharges": 85.7,
    "totalcharges": 85.7,
}

In [92]:
# Encode the single customer (wrapped in a list) and read the churn probability.
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]  # row 0, column 1 = probability of churn

0.8321649550627843

In [93]:
def train(df, y, C=1.0):
    """Fit a feature vectorizer and a logistic-regression churn model.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to train on; must contain the notebook-level ``categorical``
        and ``numerical`` columns.
    y : array-like
        Target value for each row of ``df``.
    C : float, default 1.0
        Inverse regularization strength passed to ``LogisticRegression``.
        The default equals scikit-learn's own default, so ``train(df, y)``
        behaves exactly as it did without this parameter, while
        ``train(df, y, C)`` is accepted as well.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted ``DictVectorizer`` and the fitted
        ``LogisticRegression``.
    """
    # One {column: value} dict per row, restricted to the feature columns.
    records = df[categorical + numerical].to_dict(orient="records")
    dv = DictVectorizer(sparse=False)
    X = dv.fit_transform(records)
    model = LogisticRegression(solver="liblinear", C=C)
    model.fit(X, y)
    return dv, model

In [94]:
def predict(df, dv, model):
    """Return the predicted churn probability for every row of ``df``.

    ``df`` must contain the notebook-level ``categorical`` and ``numerical``
    columns; ``dv`` and ``model`` are the already fitted vectorizer and model.
    """
    records = df[categorical + numerical].to_dict(orient="records")
    features = dv.transform(records)
    # Column 1 of predict_proba holds the probability of class 1.
    return model.predict_proba(features)[:, 1]

In [95]:
# 10 shuffled folds; the fixed random_state makes the fold assignment reproducible.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
# Collects one AUC score per fold.
aucs = []

Each iteration yields two index arrays (`train_idx`, `val_idx`) into `df_train_full`; the training and validation rows within a fold do not overlap.

In [96]:
# Reset the score list in this cell so that re-running it does not append a
# second batch of fold scores to a list created in another cell.
aucs = []

# NOTE: the loop rebinds the notebook-level df_train / df_val / y_train /
# y_val / dv / model / y_pred names, replacing the values from earlier cells.
for train_idx, val_idx in kfold.split(df_train_full):
    df_train = df_train_full.iloc[train_idx]
    df_val = df_train_full.iloc[val_idx]
    y_train = df_train.churn.values
    y_val = df_val.churn.values

    # Train on this fold's training rows, score on its held-out rows.
    dv, model = train(df_train, y_train)
    y_pred = predict(df_val, dv, model)
    auc = roc_auc_score(y_val, y_pred)
    aucs.append(auc)

In [97]:
# Mean and standard deviation of the AUC across the folds.
print("auc = %0.3f ± %0.3f" % (np.mean(aucs), np.std(aucs)))

auc = 0.842 ± 0.012


In [98]:
def train(df, y, C=1.0):
    """Fit a feature vectorizer and a logistic-regression churn model.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to train on; must contain the notebook-level ``categorical``
        and ``numerical`` columns.
    y : array-like
        Target value for each row of ``df``.
    C : float, default 1.0
        Inverse regularization strength passed to ``LogisticRegression``.
        It defaults to scikit-learn's own default so that two-argument calls
        ``train(df, y)`` keep working once this definition is the active one.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted ``DictVectorizer`` and the fitted
        ``LogisticRegression``.
    """
    # One {column: value} dict per row, restricted to the feature columns.
    cat = df[categorical + numerical].to_dict(orient="records")
    dv = DictVectorizer(sparse=False)
    dv.fit(cat)
    X = dv.transform(cat)
    model = LogisticRegression(solver="liblinear", C=C)
    model.fit(X, y)
    return dv, model

In [99]:
# Number of cross-validation folds.
nfolds = 10
# Shuffled folds with a fixed random_state for a reproducible fold assignment.
kfold = KFold(n_splits=nfolds, shuffle=True, random_state=1)

In [100]:
# Cross-validate the model for several regularization strengths and report
# the mean and standard deviation of the AUC for each one.
for C in [0.001, 0.01, 0.1, 0.5, 1, 10]:
    # Fresh score list for every value of C.
    aucs = []
    for train_idx, val_idx in kfold.split(df_train_full):
        df_train = df_train_full.iloc[train_idx]
        df_val = df_train_full.iloc[val_idx]
        y_train = df_train.churn.values
        y_val = df_val.churn.values
        dv, model = train(df_train, y_train, C)
        y_pred = predict(df_val, dv, model)
        auc = roc_auc_score(y_val, y_pred)
        aucs.append(auc)
    print("C=%s, auc = %0.3f ± %0.3f" % (C, np.mean(aucs), np.std(aucs)))

C=0.001, auc = 0.826 ± 0.016
C=0.01, auc = 0.840 ± 0.012
C=0.1, auc = 0.842 ± 0.012
C=0.5, auc = 0.842 ± 0.012
C=1, auc = 0.842 ± 0.012
C=10, auc = 0.842 ± 0.012


When the `C` parameter is small, the model is more strongly regularized: the weights are smaller, so the model is more likely to behave on real data the way it does on the training data.

In [101]:
# Targets for the final model: trained on the full training set, evaluated on the test set.
y_train = df_train_full.churn.values
y_test = df_test.churn.values

In [102]:
# Train the final model on the full training set with C=0.5 and score the test set.
dv, model = train(df_train_full, y_train, C=0.5)
y_pred = predict(df_test, dv, model)

In [103]:
# AUC of the final model on the held-out test set.
auc = roc_auc_score(y_test, y_pred)
print(auc)

0.8579752889813339


In [104]:
# Example customer record scored with the final model (same values as the
# first example customer earlier in the notebook).
customer = {
    "customerid": "8879-zkjof",
    "gender": "female",
    "seniorcitizen": 0,
    "partner": "no",
    "dependents": "no",
    "tenure": 41,
    "phoneservice": "yes",
    "multiplelines": "no",
    "internetservice": "dsl",
    "onlinesecurity": "yes",
    "onlinebackup": "no",
    "deviceprotection": "yes",
    "techsupport": "yes",
    "streamingtv": "yes",
    "streamingmovies": "yes",
    "contract": "one_year",
    "paperlessbilling": "yes",
    "paymentmethod": "bank_transfer_(automatic)",
    "monthlycharges": 79.85,
    "totalcharges": 3320.75,
}

In [106]:
# Use a dedicated name for the one-row frame: binding it to `df` would
# replace the full dataset loaded at the top of the notebook.
df_customer = pd.DataFrame([customer])
y_pred = predict(df_customer, dv, model)
# Churn probability of the single customer.
y_pred[0]

0.05960552403272276

In [107]:
def predict_single(customer, dv, model):
    """Return the churn probability for a single customer.

    Parameters
    ----------
    customer : dict
        Feature name -> value mapping describing one customer.
    dv
        Fitted vectorizer exposing ``transform``.
    model
        Fitted classifier exposing ``predict_proba``.

    Returns
    -------
    The probability of class 1 (second column of ``predict_proba``) for
    this customer.
    """
    # The vectorizer works on a list of records, so wrap the single dict.
    features = dv.transform([customer])
    # Row 0 is the only customer; column 1 is the probability of class 1.
    return model.predict_proba(features)[0, 1]

In [108]:
# Score the example customer directly from its dict.
predict_single(customer, dv, model)

0.05960552403272276

In [109]:
# Persist the fitted vectorizer and model together as one pickled tuple.
# Reminder: only unpickle this file from a trusted location.
with open("churn-model.bin", "wb") as model_file:
    pickle.dump((dv, model), model_file)