# Evaluation metrics

Overview of different evaluation metrics that can be used with different models.

In [61]:
# import necessary dependencies:
import pandas as pd
import numpy as np

# Formatting output display
from IPython.display import display

# Plotting
import plotly.graph_objs as go
import plotly.offline as py

# Data validation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

# One hot encoding
from sklearn.feature_extraction import DictVectorizer

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Accuracy
from sklearn.metrics import accuracy_score


In [2]:
# Dataset details - saved directly from kaggle
df = pd.read_csv('churn_data.csv')
df.head(10)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
8,7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,...,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [3]:
df.iloc[:3]


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


## Data cleaning

Clean and preprocess the data

In [4]:
# Types of columns
df.dtypes


customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [5]:
# Preprocess the column names - all lowercase
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.columns


Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [6]:
# Process the string columns:
columns_with_strings = list(df.dtypes[df.dtypes == 'object'].index)

# Correct all the lower case:
for column in columns_with_strings:
    df[column] = df[column].str.lower().str.replace(' ', '_')

df.tail(10)


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
7033,9767-fflem,male,0,no,no,38,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,credit_card_(automatic),69.5,2625.25,no
7034,0639-tsiqw,female,0,no,no,67,yes,yes,fiber_optic,yes,...,yes,no,yes,no,month-to-month,yes,credit_card_(automatic),102.95,6886.25,yes
7035,8456-qdavc,male,0,no,no,19,yes,no,fiber_optic,no,...,no,no,yes,no,month-to-month,yes,bank_transfer_(automatic),78.7,1495.1,no
7036,7750-eyxwz,female,0,no,no,12,no,no_phone_service,dsl,no,...,yes,yes,yes,yes,one_year,no,electronic_check,60.65,743.3,no
7037,2569-wgero,female,0,no,no,72,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,yes,bank_transfer_(automatic),21.15,1419.4,no
7038,6840-resvb,male,0,yes,yes,24,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,yes,mailed_check,84.8,1990.5,no
7039,2234-xaduh,female,0,yes,yes,72,yes,yes,fiber_optic,no,...,yes,no,yes,yes,one_year,yes,credit_card_(automatic),103.2,7362.9,no
7040,4801-jzazl,female,0,yes,yes,11,no,no_phone_service,dsl,yes,...,no,no,no,no,month-to-month,yes,electronic_check,29.6,346.45,no
7041,8361-ltmkd,male,1,yes,no,4,yes,yes,fiber_optic,no,...,no,no,no,no,month-to-month,yes,mailed_check,74.4,306.6,yes
7042,3186-ajiek,male,0,no,no,66,yes,no,fiber_optic,yes,...,yes,yes,yes,yes,two_year,yes,bank_transfer_(automatic),105.65,6844.5,no


In [7]:
# Total charges column is of string type, bu should be numeric
total_charges = pd.to_numeric(df['totalcharges'], errors='coerce')

# Check corresponding customer ids for which totalcharges are null
df[total_charges.isnull()][['customerid', 'totalcharges']]


Unnamed: 0,customerid,totalcharges
488,4472-lvygi,_
753,3115-czmzd,_
936,5709-lvoeq,_
1082,4367-nuyao,_
1340,1371-dwpaz,_
3331,7644-omvmy,_
3826,3213-vvolg,_
4380,2520-sgtta,_
5218,2923-arzlg,_
6670,4075-wkniu,_


In [8]:
# Fill the values using zerofill
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

df['totalcharges'] = df['totalcharges'].fillna(0)


## Processing categorical data

Categorical data can be processed by:
* Assigning values to each category (for binary - `[0, 1]`, for parameters with few categories - few integer values)
* Convert into long form table - binarization for each category - recommended if `no_of_categories` < 5
* Custom encoding


In [9]:
# Converting the churn into binary - 0 for no, 1 otherwise.
df['churn'] = df['churn'].apply(lambda val: val == 'yes').astype(int)
df['churn'].head(10)


0    0
1    0
2    1
3    0
4    1
5    1
6    0
7    0
8    1
9    0
Name: churn, dtype: int32

## Validation framework

Validation framework setup using `scikit-learn`

In [10]:
# Set up test data
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

print("Length of the training set (sans validation set): {}\nLength of the test set: {}".format(
    len(df_full_train), len(df_test)))


Length of the training set (sans validation set): 5634
Length of the test set: 1409


In [11]:
# Set up validation data
df_train, df_val = train_test_split(
    df_full_train, test_size=0.25, random_state=1)
print("Length of training data: {}\nLength of validation data: {}".format(
    len(df_train), len(df_val)))
df_train.head(10)


Length of training data: 4225
Length of validation data: 1409


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
3897,8015-ihcgw,female,0,yes,yes,72,yes,yes,fiber_optic,yes,...,yes,yes,yes,yes,two_year,yes,electronic_check,115.5,8425.15,0
1980,1960-uycnn,male,0,no,no,10,yes,yes,fiber_optic,no,...,yes,no,no,yes,month-to-month,yes,electronic_check,95.25,1021.55,0
6302,9250-wypll,female,0,no,no,5,yes,yes,fiber_optic,no,...,no,no,no,no,month-to-month,no,electronic_check,75.55,413.65,1
727,6786-obwqr,female,0,yes,yes,5,yes,no,fiber_optic,no,...,no,no,yes,no,month-to-month,yes,electronic_check,80.85,356.1,0
5104,1328-euzhc,female,0,yes,no,18,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,20.1,370.5,0
5387,8676-ooqej,male,0,no,no,4,no,no_phone_service,dsl,no,...,no,yes,no,no,month-to-month,no,electronic_check,30.5,118.4,0
459,1452-voqch,male,0,no,no,1,yes,yes,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,75.1,75.1,0
5023,6653-cbbom,female,0,no,no,1,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.3,70.3,1
6778,5893-kclgt,female,0,no,yes,72,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,yes,mailed_check,19.75,1567.0,0
1176,3992-ywpko,female,0,no,no,6,yes,yes,fiber_optic,no,...,yes,yes,yes,yes,month-to-month,yes,credit_card_(automatic),109.9,669.45,1


`test_size` was set to `0.25` in the previous cell as the split was done on `full_train`, **not on the original dataset**.
20% of 80% = 25%

In [12]:
# Reset indices:
def split_data(data):
    """
    Helper function for:
    1. Resetting index of the dataframe - For code readability
    2. Split the input from output
    """
    data = data.reset_index(drop=True)
    # Separate the output
    output = data['churn'].values
    # delete the columns
    del data['churn']
    return data, output


In [13]:
# Reset index and split input from output

df_train, y_train = split_data(df_train)
df_val, y_val = split_data(df_val)
df_test, y_test = split_data(df_test)
# df_train.head(10)


## Exploratory data analysis

* Handle missing values
* Examine the output column
* Process categorical data

In [14]:
df_full_train = df_full_train.reset_index(drop=True)
df_full_train['churn'].value_counts(normalize=True)


0    0.730032
1    0.269968
Name: churn, dtype: float64

Churn rate - 26.99% - No. of customers who sign up for the product

Also - mean of a binary parameter - %  of observations that map to `True`

In [15]:
# Checking categorical data
for column in df_full_train.columns:
    print("{}".format(column))
    print(df[column].unique()[:5])
    print("No. of unique values: {}".format(df[column].nunique()))


customerid
['7590-vhveg' '5575-gnvde' '3668-qpybk' '7795-cfocw' '9237-hqitu']
No. of unique values: 7043
gender
['female' 'male']
No. of unique values: 2
seniorcitizen
[0 1]
No. of unique values: 2
partner
['yes' 'no']
No. of unique values: 2
dependents
['no' 'yes']
No. of unique values: 2
tenure
[ 1 34  2 45  8]
No. of unique values: 73
phoneservice
['no' 'yes']
No. of unique values: 2
multiplelines
['no_phone_service' 'no' 'yes']
No. of unique values: 3
internetservice
['dsl' 'fiber_optic' 'no']
No. of unique values: 3
onlinesecurity
['no' 'yes' 'no_internet_service']
No. of unique values: 3
onlinebackup
['yes' 'no' 'no_internet_service']
No. of unique values: 3
deviceprotection
['no' 'yes' 'no_internet_service']
No. of unique values: 3
techsupport
['no' 'yes' 'no_internet_service']
No. of unique values: 3
streamingtv
['no' 'yes' 'no_internet_service']
No. of unique values: 3
streamingmovies
['no' 'yes' 'no_internet_service']
No. of unique values: 3
contract
['month-to-month' 'one_ye

In [16]:
# assign columns that are categorical
categorical_columns = [
    column for column in df_full_train.columns if df_full_train[column].nunique() <= 5]

# Remove the output column
categorical_columns.remove('churn')
categorical_columns


['gender',
 'seniorcitizen',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']

In [17]:
# Numeric columns
numeric_columns = [
    column for column in df_full_train.columns if df_full_train[column].dtype != 'object']

# Drop output column
numeric_columns.remove('churn')

numeric_columns


['seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges']

### Analyse the output

Examine the churn rate for:
1. Women
2. Men
3. People with / without partners
4. Overall

In [18]:
# Overall churn rate - recall its the mean of the churn column normalized
overall_churn_rate = df_full_train['churn'].mean()

# Churn rate for the genders
churn_gender = df_full_train[['gender', 'churn']].groupby('gender').mean()
churn_gender


Unnamed: 0_level_0,churn
gender,Unnamed: 1_level_1
female,0.276824
male,0.263214


In [19]:
# Churn rate for people with / without partners
churn_partner = df_full_train[['partner', 'churn']].groupby('partner').mean()
churn_partner


Unnamed: 0_level_0,churn
partner,Unnamed: 1_level_1
no,0.329809
yes,0.205033


### Significant observations

* Churn rate depends on presence of the partner - a subscriber with no partner is more likely to subscribe than a person with partner
* Churn rate is slighly higher for women than for men - more women subscribers

the risk ratio concept below will further confirm the results

### Calculating risk ratio

Risk ratio measures how customer base is likely to churn out:
* risk ratio >= 1 $\implies$ base likely to churn
* risk ratio < 1, otherwise

In [20]:
# Calculating risk ratio for both genders
churn_gender['risk'] = churn_gender['churn'] / overall_churn_rate

# Difference ratio
churn_gender['diff'] = churn_gender['churn'] - overall_churn_rate

churn_gender


Unnamed: 0_level_0,churn,risk,diff
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,1.025396,0.006856
male,0.263214,0.97498,-0.006755


In [21]:
# Calculating risk for partner case
churn_partner['risk'] = churn_partner['churn'] / overall_churn_rate

# Difference
churn_partner['diff'] = churn_partner['churn'] - overall_churn_rate
churn_partner


Unnamed: 0_level_0,churn,risk,diff
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.329809,1.221659,0.059841
yes,0.205033,0.759472,-0.064935


In [22]:
# View for all categorrical data
for column in categorical_columns:
    print("Category: {}".format(column))
    category = df_full_train[[column, 'churn']].groupby(column).mean()
    category['risk'] = category['churn'] / overall_churn_rate
    category['diff'] = category['churn'] - overall_churn_rate
    display(category)
    print()


Category: gender


Unnamed: 0_level_0,churn,risk,diff
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,1.025396,0.006856
male,0.263214,0.97498,-0.006755



Category: seniorcitizen


Unnamed: 0_level_0,churn,risk,diff
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24227,0.897403,-0.027698
1,0.413377,1.531208,0.143409



Category: partner


Unnamed: 0_level_0,churn,risk,diff
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.329809,1.221659,0.059841
yes,0.205033,0.759472,-0.064935



Category: dependents


Unnamed: 0_level_0,churn,risk,diff
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.31376,1.162212,0.043792
yes,0.165666,0.613651,-0.104302



Category: phoneservice


Unnamed: 0_level_0,churn,risk,diff
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241316,0.89387,-0.028652
yes,0.273049,1.011412,0.003081



Category: multiplelines


Unnamed: 0_level_0,churn,risk,diff
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.257407,0.953474,-0.012561
no_phone_service,0.241316,0.89387,-0.028652
yes,0.290742,1.076948,0.020773



Category: internetservice


Unnamed: 0_level_0,churn,risk,diff
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,0.712482,-0.077621
fiber_optic,0.425171,1.574895,0.155203
no,0.077805,0.288201,-0.192163



Category: onlinesecurity


Unnamed: 0_level_0,churn,risk,diff
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.420921,1.559152,0.150953
no_internet_service,0.077805,0.288201,-0.192163
yes,0.153226,0.56757,-0.116742



Category: onlinebackup


Unnamed: 0_level_0,churn,risk,diff
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.404323,1.497672,0.134355
no_internet_service,0.077805,0.288201,-0.192163
yes,0.217232,0.80466,-0.052736



Category: deviceprotection


Unnamed: 0_level_0,churn,risk,diff
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.395875,1.466379,0.125907
no_internet_service,0.077805,0.288201,-0.192163
yes,0.230412,0.85348,-0.039556



Category: techsupport


Unnamed: 0_level_0,churn,risk,diff
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.418914,1.551717,0.148946
no_internet_service,0.077805,0.288201,-0.192163
yes,0.159926,0.59239,-0.110042



Category: streamingtv


Unnamed: 0_level_0,churn,risk,diff
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.342832,1.269897,0.072864
no_internet_service,0.077805,0.288201,-0.192163
yes,0.302723,1.121328,0.032755



Category: streamingmovies


Unnamed: 0_level_0,churn,risk,diff
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338906,1.255358,0.068938
no_internet_service,0.077805,0.288201,-0.192163
yes,0.307273,1.138182,0.037305



Category: contract


Unnamed: 0_level_0,churn,risk,diff
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.431701,1.599082,0.161733
one_year,0.120573,0.446621,-0.149395
two_year,0.028274,0.10473,-0.241694



Category: paperlessbilling


Unnamed: 0_level_0,churn,risk,diff
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.172071,0.637375,-0.097897
yes,0.338151,1.25256,0.068183



Category: paymentmethod


Unnamed: 0_level_0,churn,risk,diff
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.168171,0.622928,-0.101797
credit_card_(automatic),0.164339,0.608733,-0.10563
electronic_check,0.45589,1.688682,0.185922
mailed_check,0.19387,0.718121,-0.076098





### Numerical data

**Mutual Information**

Amount of information commmon between two variables - deals with entropy of a variable. Useful for examinimg categorical data.

**Correlation**
Relation between two variables - useful for examining numerical data

In [23]:
# Correlation:
numeric_columns.append('churn')

# Generate correlation matrix
numeric_data = df_full_train[numeric_columns]
correlation_data = numeric_data.corr()
corr_matrix = correlation_data.values


In [24]:
# Set up plotting environment

# Text info to display the correlation information
text_info = np.round(corr_matrix, decimals=2).astype(str)

# Layout
Layout = go.Layout(title='Correlation heatmap of numerical data', autosize=False, width=600,
                   height=600)

# Data
Data = [go.Heatmap(x=numeric_columns, y=numeric_columns,
                   z=corr_matrix, text=text_info)]

figure = go.Figure(data=Data, layout=Layout)

py.iplot(figure)


In [25]:
# Mutual information
def mutual_info(parameter):
    """
    Returns the mutual information score between categorical column and output.
    In this case - output = df_full_train['churn']
    """
    return mutual_info_score(parameter, df_full_train['churn'])


m_score = df_full_train[categorical_columns].apply(mutual_info)
m_score.sort_values()


gender              0.000117
phoneservice        0.000229
multiplelines       0.000857
seniorcitizen       0.009410
partner             0.009968
dependents          0.012346
paperlessbilling    0.017589
streamingmovies     0.031581
streamingtv         0.031853
paymentmethod       0.043210
deviceprotection    0.043453
onlinebackup        0.046923
internetservice     0.055868
techsupport         0.061032
onlinesecurity      0.063085
contract            0.098320
dtype: float64

## One hot encoding

Encode all categorical columns using:
* Dictvectorizer
* Onehoencoder
of `scikit-learn`



In [26]:
# Check for categorical columns once again
categorical_columns


['gender',
 'seniorcitizen',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']

In [27]:
# Bug fix
numeric_columns.remove('churn')

# Variation 1: use dictvectorizer:
train_dicts = df_train[categorical_columns +
                       numeric_columns].to_dict(orient='records')

# Initialize the vectorizer:
dv = DictVectorizer()

# Encode the training data
x_train = dv.fit_transform(train_dicts)



DataFrame columns are not unique, some columns will be omitted.



In [28]:
# Encode the validation data
val_dicts = df_val[categorical_columns +
                   numeric_columns].to_dict(orient='records')

# Encode the validation data
x_val = dv.transform(val_dicts)



DataFrame columns are not unique, some columns will be omitted.



## Logistic Regression

Training the model using logistic regression

In [29]:
# Initializing model
model = LogisticRegression()
model.fit(x_train, y_train)






LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
# Checking the intercept (point at which the curve intersects)
model.intercept_


array([-0.12193489])

In [31]:
# Test and verify predictions
y_pred_val = model.predict_proba(x_val)[:, 1]

# Taking only those whose possibility of churn is greater than 0.5
churn_decision = (y_pred_val >= 0.5)

# Compare results:
df_predictions = pd.DataFrame()
df_predictions['predicted_usin_probs'] = churn_decision.astype(int)
df_predictions['predictions'] = model.predict(x_val)
df_predictions['actual'] = y_val

# Estimating accuracy:
df_predictions['correct'] = (
    df_predictions['predictions'] == df_predictions['actual'])

df_predictions


Unnamed: 0,predicted_usin_probs,predictions,actual,correct
0,0,0,0,True
1,0,0,0,True
2,0,0,0,True
3,1,1,1,True
4,0,0,0,True
...,...,...,...,...
1404,0,0,0,True
1405,0,0,1,False
1406,0,0,0,True
1407,1,1,1,True


In [32]:
# Accuracy on the validation data
df_predictions['correct'].mean()


0.8055358410220014

In [44]:
# Encode the test data
test_dicts = df_test[categorical_columns +
                     numeric_columns].to_dict(orient='records')

x_test = dv.transform(test_dicts)

# Making predictions
# Making predictions
y_test_predict = model.predict_proba(x_test)[:, 1]
churn_decision_test = (y_test_predict >= 0.5)

# Checking the accuracy of the test
(churn_decision_test == y_test).mean()



DataFrame columns are not unique, some columns will be omitted.



0.8097941802696949

## Logistic regression from scratch

In [56]:
def logistic_regression(array):
    """
    Return the value after passing through logistic function
    """
    return (1 + np.exp(-array))**(-1)


def train_logistic(X, y, lr, num_epochs=100):
    """
    Train the input data using logistic regression and gradient descent
    """
    # X = np.array(X, dtype=np.float128)
    X_dim = np.shape(X)

    # Choosing random model parameters to start with
    ß = np.random.randn(X_dim[1])

    # Train the model for epochs:
    for i in range(num_epochs):
        # Implement logistic regression
        prediction = logistic_regression(X@ß)

        # Estimating the loss:
        loss = y - prediction

        # Calculate the gradients:
        gradient = -X.T@loss

        # Update
        ß = ß - (lr*gradient)

    return ß


In [65]:
# Functions
model_parameters = train_logistic(x_train, y_train, lr=0.001)

# Accuracy on validation data
y_pred_val = logistic_regression(x_val@model_parameters)


(y_pred_val == y_val).mean()



overflow encountered in exp



0.7239176721078779

## Evaluation metrics

Accuracy measured at a threshold

In [64]:
thresholds = np.linspace(0, 1, 21)

scores = []

for t in thresholds:
    score = accuracy_score(y_val, y_pred_val >= t)
    print('%.2f %.3f' % (t, score))
    scores.append(score)

scores



0.00 0.274
0.05 0.724
0.10 0.724
0.15 0.724
0.20 0.724
0.25 0.724
0.30 0.724
0.35 0.724
0.40 0.724
0.45 0.724
0.50 0.724
0.55 0.724
0.60 0.724
0.65 0.724
0.70 0.724
0.75 0.724
0.80 0.724
0.85 0.724
0.90 0.724
0.95 0.724
1.00 0.724


[0.2739531582682754,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779,
 0.7239176721078779]

In [50]:
churn = (y_test_predict >= 0.7)
(y_test == churn).mean()

0.8097941802696949

In [63]:
# Plot the data
data = [go.Scatter(x=thresholds, y=scores, mode='lines')]

layout = go.Layout(title = 'Accuracy at various thresholds for churn', xaxis_title = 'Threshold', yaxis_title = 'Accuracy')

figure = go.Figure(data=data, layout=layout)

py.iplot(figure)


Reason why the plot above is specified as accuracy for churn is that it shows the accuracy for only one class of outcome (recall the definition of logistic regression).

For the other class (not churn)

In [66]:
# Accuracy for the other class on validation data
1 - y_val.mean()

0.7260468417317246