In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.model_selection import train_test_split
%matplotlib inline

In [3]:
df = pd.read_csv('./data/Customer-Churn.csv')

In [4]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [5]:
df.shape

(7043, 21)

In [6]:
def data_prep(df):
    
    df1 = df.copy()
    
    df1.columns = df1.columns.str.lower().str.replace(' ', '_')

    categorical_columns = list(df1.dtypes[df1.dtypes == 'object'].index)

    for col in categorical_columns:
        df1[col] = df1[col].str.lower().str.replace(' ', '_')
        
    y = np.where(df1.churn == 'yes', 1, 0)
    
    df1['totalcharges'] = pd.to_numeric(df1.totalcharges, errors='coerce')
    df1['totalcharges'] = df1['totalcharges'].fillna(0)
    df1['gender'] = np.where(df1['gender'] == 'female', 1, 0)
    df1['partner'] = np.where(df1['partner'] == 'yes', 1, 0)
    df1['dependents'] = np.where(df1['dependents'] == 'yes', 1, 0)
    df1['phoneservice'] = np.where(df1['phoneservice'] == 'yes', 1, 0)
    df1['multiplelines'] = np.where(df1['multiplelines'] == 'yes', 1, 0)
    df1['internetservicefiber'] = np.where(df1['internetservice'] == 'fiber_optic', 1, 0)
    df1['internetservicedsl'] = np.where(df1['internetservice'] == 'dsl', 1, 0)
    df1['onlinesecurity'] = np.where(df1['onlinesecurity'] == 'yes', 1, 0)
    df1['onlinebackup'] = np.where(df1['onlinebackup'] == 'yes', 1, 0)
    df1['deviceprotection'] = np.where(df1['deviceprotection'] == 'yes', 1, 0)
    df1['techsupport'] = np.where(df1['techsupport'] == 'yes', 1, 0)
    df1['streamingtv'] = np.where(df1['streamingtv'] == 'yes', 1, 0)
    df1['streamingmovies'] = np.where(df1['streamingmovies'] == 'yes', 1, 0)
    df1['paperlessbilling'] = np.where(df1['paperlessbilling'] == 'yes', 1, 0)
    df1 = pd.get_dummies(columns=['contract', 'paymentmethod'], data=df1, drop_first=True, dtype=int)

    columns_to_drop = ['churn', 'customerid', 'internetservice', 'gender', 'phoneservice', 'multiplelines']
    df1.drop(columns=columns_to_drop, inplace=True)
    X = df1.values
    X = np.column_stack([np.ones(X.shape[0]), X])
    return X, y

In [7]:
X, y = data_prep(df)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [10]:
logistic = LogisticRegression()
logistic.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
y_pred = logistic.predict_proba(X_val)[:, 1]

In [12]:
churn_decision = np.where(y_pred > 0.56, 1, 0)

In [13]:
np.sum(churn_decision == y_val) / len(y_val)

np.float64(0.7920511000709723)

In [17]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_val, churn_decision)

array([[960,  70],
       [223, 156]])

In [16]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_val, y_pred)

np.float64(0.8363398826754105)