# Evaluation metrics

Overview of different evaluation metrics that can be used with different models.

In [41]:
# import necessary dependencies:
import pandas as pd
import numpy as np

# Formatting output display
from IPython.display import display

# Plotting
import plotly.graph_objs as go
import plotly.offline as py

# Data validation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

# One hot encoding
from sklearn.feature_extraction import DictVectorizer

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Accuracy
from sklearn.metrics import accuracy_score


In [42]:
# Dataset details - saved directly from kaggle
df = pd.read_csv('churn_data.csv')
df.head(10)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
8,7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,...,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [43]:
df.iloc[:3]


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


## Data cleaning

Clean and preprocess the data

In [44]:
# Preprocess the column names - all lowercase
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.columns


Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [45]:
# Process the string columns:
columns_with_strings = list(df.dtypes[df.dtypes == 'object'].index)

# Correct all the lower case:
for column in columns_with_strings:
    df[column] = df[column].str.lower().str.replace(' ', '_')

df.tail(10)


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
7033,9767-fflem,male,0,no,no,38,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,credit_card_(automatic),69.5,2625.25,no
7034,0639-tsiqw,female,0,no,no,67,yes,yes,fiber_optic,yes,...,yes,no,yes,no,month-to-month,yes,credit_card_(automatic),102.95,6886.25,yes
7035,8456-qdavc,male,0,no,no,19,yes,no,fiber_optic,no,...,no,no,yes,no,month-to-month,yes,bank_transfer_(automatic),78.7,1495.1,no
7036,7750-eyxwz,female,0,no,no,12,no,no_phone_service,dsl,no,...,yes,yes,yes,yes,one_year,no,electronic_check,60.65,743.3,no
7037,2569-wgero,female,0,no,no,72,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,yes,bank_transfer_(automatic),21.15,1419.4,no
7038,6840-resvb,male,0,yes,yes,24,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,yes,mailed_check,84.8,1990.5,no
7039,2234-xaduh,female,0,yes,yes,72,yes,yes,fiber_optic,no,...,yes,no,yes,yes,one_year,yes,credit_card_(automatic),103.2,7362.9,no
7040,4801-jzazl,female,0,yes,yes,11,no,no_phone_service,dsl,yes,...,no,no,no,no,month-to-month,yes,electronic_check,29.6,346.45,no
7041,8361-ltmkd,male,1,yes,no,4,yes,yes,fiber_optic,no,...,no,no,no,no,month-to-month,yes,mailed_check,74.4,306.6,yes
7042,3186-ajiek,male,0,no,no,66,yes,no,fiber_optic,yes,...,yes,yes,yes,yes,two_year,yes,bank_transfer_(automatic),105.65,6844.5,no


In [46]:
# Total charges column is of string type, bu should be numeric
total_charges = pd.to_numeric(df['totalcharges'], errors='coerce')

# Check corresponding customer ids for which totalcharges are null
df[total_charges.isnull()][['customerid', 'totalcharges']]


Unnamed: 0,customerid,totalcharges
488,4472-lvygi,_
753,3115-czmzd,_
936,5709-lvoeq,_
1082,4367-nuyao,_
1340,1371-dwpaz,_
3331,7644-omvmy,_
3826,3213-vvolg,_
4380,2520-sgtta,_
5218,2923-arzlg,_
6670,4075-wkniu,_


In [47]:
# Fill the values using zerofill
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

df['totalcharges'] = df['totalcharges'].fillna(0)


## Processing categorical data

Categorical data can be processed by:
* Assigning values to each category (for binary - `[0, 1]`, for parameters with few categories - few integer values)
* Convert into long form table - binarization for each category - recommended if `no_of_categories` < 5
* Custom encoding


In [48]:
# Converting the churn into binary - 0 for no, 1 otherwise.
df['churn'] = df['churn'].apply(lambda val: val == 'yes').astype(int)
df['churn'].head(10)


0    0
1    0
2    1
3    0
4    1
5    1
6    0
7    0
8    1
9    0
Name: churn, dtype: int32

## Validation framework

Validation framework setup using `scikit-learn`

In [49]:
# Set up test data
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

print("Length of the training set (sans validation set): {}\nLength of the test set: {}".format(
    len(df_full_train), len(df_test)))


Length of the training set (sans validation set): 5634
Length of the test set: 1409


In [50]:
# Set up validation data
df_train, df_val = train_test_split(
    df_full_train, test_size=0.25, random_state=1)
print("Length of training data: {}\nLength of validation data: {}".format(
    len(df_train), len(df_val)))
df_train.head(10)


Length of training data: 4225
Length of validation data: 1409


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
3897,8015-ihcgw,female,0,yes,yes,72,yes,yes,fiber_optic,yes,...,yes,yes,yes,yes,two_year,yes,electronic_check,115.5,8425.15,0
1980,1960-uycnn,male,0,no,no,10,yes,yes,fiber_optic,no,...,yes,no,no,yes,month-to-month,yes,electronic_check,95.25,1021.55,0
6302,9250-wypll,female,0,no,no,5,yes,yes,fiber_optic,no,...,no,no,no,no,month-to-month,no,electronic_check,75.55,413.65,1
727,6786-obwqr,female,0,yes,yes,5,yes,no,fiber_optic,no,...,no,no,yes,no,month-to-month,yes,electronic_check,80.85,356.1,0
5104,1328-euzhc,female,0,yes,no,18,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,20.1,370.5,0
5387,8676-ooqej,male,0,no,no,4,no,no_phone_service,dsl,no,...,no,yes,no,no,month-to-month,no,electronic_check,30.5,118.4,0
459,1452-voqch,male,0,no,no,1,yes,yes,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,75.1,75.1,0
5023,6653-cbbom,female,0,no,no,1,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.3,70.3,1
6778,5893-kclgt,female,0,no,yes,72,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,yes,mailed_check,19.75,1567.0,0
1176,3992-ywpko,female,0,no,no,6,yes,yes,fiber_optic,no,...,yes,yes,yes,yes,month-to-month,yes,credit_card_(automatic),109.9,669.45,1


`test_size` was set to `0.25` in the previous cell as the split was done on `full_train`, **not on the original dataset**.
20% of 80% = 25%

In [51]:
# Reset indices:
def split_data(data):
    """
    Helper function for:
    1. Resetting index of the dataframe - For code readability
    2. Split the input from output
    """
    data = data.reset_index(drop=True)
    # Separate the output
    output = data['churn'].values
    # delete the columns
    del data['churn']
    return data, output


In [52]:
# Reset index and split input from output

df_train, y_train = split_data(df_train)
df_val, y_val = split_data(df_val)
df_test, y_test = split_data(df_test)
# df_train.head(10)


## Exploratory data analysis

* Handle missing values
* Examine the output column
* Process categorical data

In [53]:
df_full_train = df_full_train.reset_index(drop=True)
df_full_train['churn'].value_counts(normalize=True)


0    0.730032
1    0.269968
Name: churn, dtype: float64

Churn rate - 26.99% - No. of customers who sign up for the product

Also - mean of a binary parameter - %  of observations that map to `True`

In [54]:
# assign columns that are categorical
categorical_columns = [
    column for column in df_full_train.columns if df_full_train[column].nunique() <= 5]

# Remove the output column
categorical_columns.remove('churn')
categorical_columns


['gender',
 'seniorcitizen',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']

In [55]:
# Numeric columns
numeric_columns = [
    column for column in df_full_train.columns if df_full_train[column].dtype != 'object']

# Drop output column
numeric_columns.remove('churn')

numeric_columns


['seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges']

### Analyse the output

Examine the churn rate for:
1. Women
2. Men
3. People with / without partners
4. Overall

### Numerical data

**Mutual Information**

Amount of information commmon between two variables - deals with entropy of a variable. Useful for examinimg categorical data.

**Correlation**
Relation between two variables - useful for examining numerical data

In [56]:
# Correlation:
numeric_columns.append('churn')

# Generate correlation matrix
numeric_data = df_full_train[numeric_columns]
correlation_data = numeric_data.corr()
corr_matrix = correlation_data.values


In [17]:
# Set up plotting environment

# Text info to display the correlation information
text_info = np.round(corr_matrix, decimals=2).astype(str)

# Layout
Layout = go.Layout(title='Correlation heatmap of numerical data', autosize=False, width=600,
                   height=600)

# Data
Data = [go.Heatmap(x=numeric_columns, y=numeric_columns,
                   z=corr_matrix, text=text_info)]

figure = go.Figure(data=Data, layout=Layout)

py.iplot(figure)


In [18]:
# Mutual information
def mutual_info(parameter):
    """
    Returns the mutual information score between categorical column and output.
    In this case - output = df_full_train['churn']
    """
    return mutual_info_score(parameter, df_full_train['churn'])


m_score = df_full_train[categorical_columns].apply(mutual_info)
m_score.sort_values()


gender              0.000117
phoneservice        0.000229
multiplelines       0.000857
seniorcitizen       0.009410
partner             0.009968
dependents          0.012346
paperlessbilling    0.017589
streamingmovies     0.031581
streamingtv         0.031853
paymentmethod       0.043210
deviceprotection    0.043453
onlinebackup        0.046923
internetservice     0.055868
techsupport         0.061032
onlinesecurity      0.063085
contract            0.098320
dtype: float64

## One hot encoding

Encode all categorical columns using:
* Dictvectorizer
* Onehoencoder
of `scikit-learn`



In [19]:
# Check for categorical columns once again
categorical_columns


['gender',
 'seniorcitizen',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']

In [20]:
# Bug fix
numeric_columns.remove('churn')

# Variation 1: use dictvectorizer:
train_dicts = df_train[categorical_columns +
                       numeric_columns].to_dict(orient='records')

# Initialize the vectorizer:
dv = DictVectorizer()

# Encode the training data
x_train = dv.fit_transform(train_dicts)



DataFrame columns are not unique, some columns will be omitted.



In [21]:
# Encode the validation data
val_dicts = df_val[categorical_columns +
                   numeric_columns].to_dict(orient='records')

# Encode the validation data
x_val = dv.transform(val_dicts)



DataFrame columns are not unique, some columns will be omitted.



## Logistic Regression

Training the model using logistic regression

In [22]:
# Initializing model
model = LogisticRegression()
model.fit(x_train, y_train)






LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
# Checking the intercept (point at which the curve intersects)
model.intercept_


array([-0.12193489])

In [24]:
# Test and verify predictions
y_pred_val = model.predict_proba(x_val)[:, 1]

# Taking only those whose possibility of churn is greater than 0.5
churn_decision = (y_pred_val >= 0.5)

# Compare results:
df_predictions = pd.DataFrame()
df_predictions['predicted_usin_probs'] = churn_decision.astype(int)
df_predictions['predictions'] = model.predict(x_val)
df_predictions['actual'] = y_val

# Estimating accuracy:
df_predictions['correct'] = (
    df_predictions['predictions'] == df_predictions['actual'])

df_predictions


Unnamed: 0,predicted_usin_probs,predictions,actual,correct
0,0,0,0,True
1,0,0,0,True
2,0,0,0,True
3,1,1,1,True
4,0,0,0,True
...,...,...,...,...
1404,0,0,0,True
1405,0,0,1,False
1406,0,0,0,True
1407,1,1,1,True


In [25]:
# Accuracy on the validation data
df_predictions['correct'].mean()


0.8055358410220014

In [26]:
# Encode the test data
test_dicts = df_test[categorical_columns +
                     numeric_columns].to_dict(orient='records')

x_test = dv.transform(test_dicts)

# Making predictions
# Making predictions
y_test_predict = model.predict_proba(x_test)[:, 1]
churn_decision_test = (y_test_predict >= 0.5)

# Checking the accuracy of the test
(churn_decision_test == y_test).mean()



DataFrame columns are not unique, some columns will be omitted.



0.8097941802696949

## Logistic regression from scratch

In [27]:
def logistic_regression(array):
    """
    Return the value after passing through logistic function
    """
    return (1 + np.exp(-array))**(-1)


def train_logistic(X, y, lr, num_epochs=100):
    """
    Train the input data using logistic regression and gradient descent
    """
    # X = np.array(X, dtype=np.float128)
    X_dim = np.shape(X)

    # Choosing random model parameters to start with
    ß = np.random.randn(X_dim[1])

    # Train the model for epochs:
    for i in range(num_epochs):
        # Implement logistic regression
        prediction = logistic_regression(X@ß)

        # Estimating the loss:
        loss = y - prediction

        # Calculate the gradients:
        gradient = -X.T@loss

        # Update
        ß = ß - (lr*gradient)

    return ß


In [28]:
# Functions
model_parameters = train_logistic(x_train, y_train, lr=0.001)

# Accuracy on validation data
y_pred_val_scratch = logistic_regression(x_val@model_parameters)


(y_pred_val_scratch == y_val).mean()



overflow encountered in exp



0.7239176721078779

## Evaluation metrics

Accuracy measured at a threshold

In [29]:
# Generating the thresholds
thresholds = np.linspace(0, 1, 21)

scores = []

# Append the scores
for t in thresholds:
    score = accuracy_score(y_val, y_pred_val >= t)
    print('%.2f %.3f' % (t, score))
    scores.append(score)

scores



0.00 0.274
0.05 0.508
0.10 0.605
0.15 0.664
0.20 0.705
0.25 0.738
0.30 0.759
0.35 0.767
0.40 0.781
0.45 0.793
0.50 0.806
0.55 0.804
0.60 0.800
0.65 0.787
0.70 0.766
0.75 0.742
0.80 0.729
0.85 0.726
0.90 0.726
0.95 0.726
1.00 0.726


[0.2739531582682754,
 0.5081618168914124,
 0.6053938963804116,
 0.6635911994322214,
 0.7047551454932577,
 0.7381121362668559,
 0.759403832505323,
 0.7665010645848119,
 0.7806955287437899,
 0.7927608232789212,
 0.8055358410220014,
 0.8041163946061036,
 0.7998580553584103,
 0.78708303761533,
 0.765791341376863,
 0.7423704755145494,
 0.7288857345635202,
 0.7260468417317246,
 0.7260468417317246,
 0.7260468417317246,
 0.7260468417317246]

In [30]:
# Plot the data
data = [go.Scatter(x=thresholds, y=scores, mode='lines')]

layout = go.Layout(title = 'Accuracy at various thresholds for churn', xaxis_title = 'Threshold', yaxis_title = 'Accuracy')

figure = go.Figure(data=data, layout=layout)

py.iplot(figure)


Reason why the plot above is specified as accuracy for churn is that it shows the accuracy for only one class of outcome (recall the definition of logistic regression).

For the other class (not churn)

In [31]:
# Accuracy for the other class on validation data
1 - y_val.mean()

0.7260468417317246

## Confusion matrix

Used for classification algorithms - 

In [32]:
# Confusion matrix from scratch
# Setting true and false
true = (y_val == 1)
false = (y_val == 0)

# Setting the prediction
pred_positive = (y_pred_val >= 0.5)
pred_negative = (y_pred_val < 0.5)

# The confusion matrix
tp = (pred_positive & true).sum()
tn = (pred_negative & false).sum()
fp = (pred_positive & false).sum()
fn = (pred_negative & true).sum()

confusion_matrix = np.array([[tn, fp], [fn, tp]])
print(confusion_matrix)

# Percentage of values
confusion_matrix / confusion_matrix.sum()

[[923 100]
 [174 212]]


array([[0.65507452, 0.07097232],
       [0.12349184, 0.15046132]])

## Precision and recall

$$
\textup{Precision}=\frac{\textup{TP}}{\textup{TP}+\textup{FP}} \\
\textup{Recall} = \frac{\textup{TP}}{\textup{TP}+\textup{FN}}
$$



In [33]:
# Calculating precision and recall
precision = tp / (tp+fp)
recall = tp/(tp+fn)

print("Precision: {}\nRecall: {}".format(precision, recall))

Precision: 0.6794871794871795
Recall: 0.5492227979274611


In [34]:
# Calculating for all thresholds
thresholds = np.linspace(0, 1, 101)

confusion_score = []

for t in thresholds:
    pred_positive = (y_pred_val >= t)
    pred_negative = (y_pred_val < t)

    # The confusion matrix
    tp = (pred_positive & true).sum()
    tn = (pred_negative & false).sum()
    fp = (pred_positive & false).sum()
    fn = (pred_negative & true).sum()

    # Add the scores
    confusion_score.append([t, tp, fp, fn, tn])

# Convert to dataframe
confusion_scores = pd.DataFrame(confusion_score, columns=['threshold', 'true_positive', 
                                                'false_positive', 'false_negative', 'true_negative'])

confusion_scores.head()

Unnamed: 0,threshold,true_positive,false_positive,false_negative,true_negative
0,0.0,386,1023,0,0
1,0.01,385,901,1,122
2,0.02,383,808,3,215
3,0.03,382,753,4,270
4,0.04,382,714,4,309


In [35]:
# Adding precision and recall
confusion_scores['tpr'] = confusion_scores['true_positive'] / (confusion_scores['true_positive'] + confusion_scores['false_negative'])
confusion_scores['fpn'] = confusion_scores['false_positive'] / (confusion_scores['false_positive'] + confusion_scores['true_negative'])

confusion_scores.head()

Unnamed: 0,threshold,true_positive,false_positive,false_negative,true_negative,tpr,fpn
0,0.0,386,1023,0,0,1.0,1.0
1,0.01,385,901,1,122,0.997409,0.880743
2,0.02,383,808,3,215,0.992228,0.789834
3,0.03,382,753,4,270,0.989637,0.73607
4,0.04,382,714,4,309,0.989637,0.697947


In [36]:
# Plotting the true positive and false positive rates against threshold
trace1 = go.Scatter(x=confusion_scores['threshold'], y=confusion_scores['tpr'], mode='lines', name='tpr')
trace2 = go.Scatter(x=confusion_scores['threshold'], y=confusion_scores['fpn'], mode='lines', name='fpr')

data = [trace1, trace2]

layout = go.Layout(title='Scores plot against threshold', xaxis_title = 'Threshold', yaxis_title = 'Positive rates')

figure = go.Figure(data=data, layout=layout)

py.iplot(figure)


## ROC Curve

In [37]:
# ROC Curve from scratch
trace1 = go.Scatter(x=[0, 1], y=[0, 1], mode='lines+markers', line=dict(dash='dash'), name='Reference')
trace2 = go.Scatter(x=confusion_scores['fpn'], y=confusion_scores['tpr'], name='Model')

data = [trace1, trace2]

layout = go.Layout(title='ROC Curve', xaxis_title='False positive rates', yaxis_title='True positive rates', width=600, height=500)

figure = go.Figure(data=data, layout=layout)

py.iplot(figure)

In [38]:
# Verifying with sklearn
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_val, y_test)

# Plot the curve
race1 = go.Scatter(x=[0, 1], y=[0, 1], mode='lines+markers',
                   line=dict(dash='dash'), name='Reference')
trace2 = go.Scatter(
    x=confusion_scores['fpn'], y=confusion_scores['tpr'], name='Model')

data = [trace1, trace2]

layout = go.Layout(title='ROC Curve', xaxis_title='False positive rates',
                   yaxis_title='True positive rates', width=600, height=500)

figure = go.Figure(data=data, layout=layout)

py.iplot(figure)



In [39]:
# AUC score
from sklearn.metrics import auc, roc_auc_score

print(auc(tpr, fpr))
print(roc_auc_score(y_val, y_pred_val))

0.5023830145006812
0.8465981898206534


## K-Fold cross validation

In [65]:
numeric_columns.remove('churn')
numeric_columns

['seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges']

In [72]:
# Defining the training module
def train(train_data, target, C):
    """
    Helper function for training
    """
    dicts = train_data[categorical_columns+numeric_columns].to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    x_train = dv.fit_transform(dicts)

    model = LogisticRegression(C=C)
    model.fit(x_train, y_train)

    return dv, model

In [66]:
dv, model = train(df_train, y_train)


DataFrame columns are not unique, some columns will be omitted.





In [76]:
# Prediction functions
def predict(test_data, dv, model):
    """
    Predict values
    """
    dicts = test_data[categorical_columns+numeric_columns].to_dict(orient='records')
    x_test = dv.transform(dicts)
    prediction = model.predict_proba(x_test)[:, 1]

    return prediction

## Regularization for the model

It can be noted from the k-fold scores to decide which value would be best

In [69]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.62.3-py2.py3-none-any.whl (76 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.62.3


In [70]:
# Validation
from sklearn.model_selection import KFold

# Displaying the proces
from tqdm.auto import tqdm

In [80]:
# Choosing the regularization value
reg_value = [0.001, 0.01, 0.1, 0.5, 1, 5, 10]

# Appending the scores


# No. of splits
n_splits = 5

# K-Fold validation
for value in tqdm(reg_value):
    k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    scores = []

    # Validation for 1 split
    for train_idx, val_idx in k_fold.split(df_full_train):

        # Split into training and validation
        train_set = df_full_train.iloc[train_idx]
        val_set = df_full_train.iloc[val_idx]

        # Seperate the target and input
        y_train = train_set.churn.values
        y_val = val_set.churn.values

        # Train the model
        dv, model = train(train_set, y_train, C=value)
        
        # Make predictions
        y_pred = predict(val_set, dv, model)

        # Calculate auc scores
        auc = roc_auc_score(y_val, y_pred)

        # Append the scores
        scores.append(auc)
    
    # Give the score for 1 val
    print("C = {}: score = {:.2f}±{:.2f}".format(value, np.mean(scores), np.std(scores)))






DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.



C = 0.001: score = 0.83±0.01





DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.



C = 0.01: score = 0.84±0.01





DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.



C = 0.1: score = 0.84±0.01





DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.



C = 0.5: score = 0.84±0.01





DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.



C = 1: score = 0.84±0.01





DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.



C = 5: score = 0.84±0.01





DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.


DataFrame columns are not unique, some columns will be omitted.




DataFrame columns are not unique, some columns will be omitted.

100%|██████████| 7/7 [00:09<00:00,  1.41s/it]

C = 10: score = 0.84±0.01





In [71]:
df_full_train.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.7,258.35,0
1,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.9,3160.55,1
2,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,0
3,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,0
4,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.4,2044.75,0
