In [1]:
import warnings
import numpy as np
import pandas as pd
from collections import Counter

# Sklearn imports
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [2]:
df = pd.read_csv("../Dados/telco_dataset.csv")

In [3]:
df.drop(['customerID'],axis=1, inplace=True)

In [4]:
df.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure              float64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

### Pré-Processamento

In [5]:
categorical_cols = [c for c in df.columns if df[c].dtype == 'object' or c == 'SeniorCitizen']
df_categorical = df[categorical_cols].copy()
for col in categorical_cols:
    if df_categorical[col].nunique() == 2:
        df_categorical[col], _ = pd.factorize(df_categorical[col])
    else:
        df_categorical = pd.get_dummies(df_categorical, columns=[col])

In [6]:
df_categorical.head(3)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling,Churn,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,0,0,0,0,0,False,True,False,...,True,False,False,True,False,False,False,False,True,False
1,1,0,1,0,1,1,0,True,False,False,...,True,False,False,False,True,False,False,False,False,True
2,1,0,1,0,1,0,1,True,False,False,...,True,False,False,True,False,False,False,False,False,True


In [7]:
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
df_std = pd.DataFrame(StandardScaler().fit_transform(df[num_cols]),columns=num_cols)
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
df_std.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-1.277445,-1.160323,-0.992611
1,0.066327,-0.259629,-0.172165
2,-1.236724,-0.36266,-0.958066
3,0.514251,-0.746535,-0.193672
4,-1.236724,0.197365,-0.938874


In [8]:
df_processed = pd.concat([df_std, df_categorical], axis=1)
metrics = ['roc_auc', 'accuracy', 'precision', 'recall']
# https://scikit-learn.org/stable/modules/model_evaluation.html

In [9]:
df_processed.head(3)

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling,Churn,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-1.277445,-1.160323,-0.992611,0,0,0,0,0,0,0,...,True,False,False,True,False,False,False,False,True,False
1,0.066327,-0.259629,-0.172165,1,0,1,0,1,1,0,...,True,False,False,False,True,False,False,False,False,True
2,-1.236724,-0.36266,-0.958066,1,0,1,0,1,0,1,...,True,False,False,True,False,False,False,False,False,True


In [10]:
df_processed.shape

(7043, 41)

### Contruindo o modelo

In [11]:
train, test = train_test_split(df_processed, test_size=0.2, random_state=50)
predictors = [c for c in train.columns if c not in ['customerID', 'Churn']]

In [12]:
clf = LogisticRegression()
clf.fit(train[predictors], train['Churn'])
y_pred = clf.predict(test[predictors]) #0.5
y_prob = clf.predict_proba(test[predictors])[:, 1]
results = list()

In [13]:
y_pred

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [14]:
for m in metrics:
    if m == 'roc_auc':
        results.append((m, roc_auc_score(test['Churn'], y_prob)))
    else:
        results.append((m, eval("{}_score".format(m))(test['Churn'], y_pred)))
results

[('roc_auc', 0.8398723008812596),
 ('accuracy', 0.7998580553584103),
 ('precision', 0.6360759493670886),
 ('recall', 0.5461956521739131)]

- A precisão mede a quantidade de vezes que o seu modelo acerta em relação ao total de vezes que ele tenta acertar.
- O recall mede a quantidade de vezes que o seu modelo acerta em relação ao total de vezes que ele deveria ter acertado.

In [15]:
# otimização com Validação Cruzada
clf_cv = LogisticRegressionCV(cv=5, random_state=10)
clf_cv.fit(train[predictors], train['Churn'])
y_pred = clf_cv.predict(test[predictors])
y_prob = clf_cv.predict_proba(test[predictors])[:, 1]
results = list()

for m in metrics:
    if m == 'roc_auc':
        results.append((m, roc_auc_score(test['Churn'], y_prob)))
    else:
        results.append((m, eval("{}_score".format(m))(test['Churn'], y_pred)))
results

[('roc_auc', 0.8398279246543876),
 ('accuracy', 0.8005677785663591),
 ('precision', 0.6389776357827476),
 ('recall', 0.5434782608695652)]

In [16]:
def logistic_regression(frame, metrics, cv):
    train, test = train_test_split(frame, test_size=0.2, random_state=50)
    predictors = [c for c in train.columns if c not in ['customerID', 'Churn']]
    if cv == False:
        clf = LogisticRegression()
    else:
        clf = LogisticRegressionCV(cv=5, random_state=0)
    clf.fit(train[predictors], train['Churn'])
    y_pred = clf.predict(test[predictors])
    y_prob = clf.predict_proba(test[predictors])[:, 1]
    results = list()

    for m in metrics:
        if m == 'roc_auc':
            results.append((m, roc_auc_score(test['Churn'], y_prob)))
        else:
            results.append((m, eval("{}_score".format(m))(test['Churn'], y_pred)))

    c = dict(zip(predictors, abs(clf.coef_[0])))
    importance = dict(Counter(c).most_common(5)).keys()

    cm = confusion_matrix(test['Churn'], y_pred)
    especificidade = cm[0][0]/(cm[1][0]+cm[0][0])
    sensibilidade = cm[1][1]/(cm[1][1]+cm[0][1])

    return results, sensibilidade, especificidade, importance

In [17]:
logistic_regression(df_processed, metrics, cv = False)

([('roc_auc', 0.8398723008812596),
  ('accuracy', 0.7998580553584103),
  ('precision', 0.6360759493670886),
  ('recall', 0.5461956521739131)],
 0.6360759493670886,
 0.8472095150960659,
 dict_keys(['tenure', 'Contract_Two year', 'TotalCharges', 'Contract_Month-to-month', 'PhoneService']))

### Testando possibilidades

Criando features

In [18]:
df_tmp = df_processed.copy()
df_tmp['charges_difference'] = df_tmp['TotalCharges'] - df_tmp['tenure']*df_tmp['MonthlyCharges']
logistic_regression(df_tmp, metrics, cv = True)

([('roc_auc', 0.8401124545796266),
  ('accuracy', 0.8041163946061036),
  ('precision', 0.6483870967741936),
  ('recall', 0.5461956521739131)],
 0.6483870967741936,
 0.8480436760691538,
 dict_keys(['tenure', 'Contract_Month-to-month', 'Contract_Two year', 'MonthlyCharges', 'InternetService_Fiber optic']))

In [19]:
# Ratio between the tenure multiplied by monthly charges and TotalCharges
df_tmp = df_processed.copy()
df_tmp['charges_ratio'] = df_tmp['tenure']*df_tmp['MonthlyCharges'] / (df_tmp['TotalCharges'] + 1)
logistic_regression(df_tmp, metrics, cv = True)

([('roc_auc', 0.8415116109092429),
  ('accuracy', 0.8019872249822569),
  ('precision', 0.6403785488958991),
  ('recall', 0.5516304347826086)],
 0.6403785488958991,
 0.8489010989010989,
 dict_keys(['tenure', 'Contract_Two year', 'Contract_Month-to-month', 'TotalCharges', 'PaperlessBilling']))

In [20]:
df_tmp = df_processed.copy()
df_tmp['charges_difference'] = df_tmp['TotalCharges'] - df_tmp['tenure']*df_tmp['MonthlyCharges']
df_tmp['charges_ratio'] = df_tmp['tenure']*df_tmp['MonthlyCharges'] / (df_tmp['TotalCharges'] + 1)
logistic_regression(df_tmp, metrics, cv = True)

([('roc_auc', 0.8416447395898593),
  ('accuracy', 0.8034066713981547),
  ('precision', 0.6444444444444445),
  ('recall', 0.5516304347826086)],
 0.6444444444444445,
 0.8491773308957953,
 dict_keys(['tenure', 'Contract_Two year', 'Contract_Month-to-month', 'MonthlyCharges', 'InternetService_Fiber optic']))

Balanceamento

In [21]:
print(
    len(df_processed[df_processed["Churn"]==0]),
    len(df_processed[df_processed["Churn"]==1]),
    df_processed.shape
)

5174 1869 (7043, 41)


In [22]:
df_processed_no = df_processed[df_processed["Churn"]==0].sample(
    n=len(df_processed[df_processed["Churn"]==1]))
df_processed_yes = df_processed[df_processed["Churn"]==1]
df_processed_balanced = pd.concat([df_processed_no, df_processed_yes])
df_processed_balanced.shape

(3738, 41)

In [23]:
logistic_regression(df_processed_balanced, metrics, cv = True)

([('roc_auc', 0.8493137949037025),
  ('accuracy', 0.7526737967914439),
  ('precision', 0.7367149758454107),
  ('recall', 0.800524934383202)],
 0.7367149758454107,
 0.7724550898203593,
 dict_keys(['MonthlyCharges', 'tenure', 'InternetService_Fiber optic', 'InternetService_DSL', 'TotalCharges']))

In [24]:
df_tmp = df_processed_balanced.copy()
df_tmp['charges_difference'] = df_tmp['TotalCharges'] - df_tmp['tenure']*df_tmp['MonthlyCharges']
df_tmp['charges_ratio'] = df_tmp['tenure']*df_tmp['MonthlyCharges'] / (df_tmp['TotalCharges'] + 1)
logistic_regression(df_tmp, metrics, cv = True)

([('roc_auc', 0.8534474743790542),
  ('accuracy', 0.7647058823529411),
  ('precision', 0.7423167848699763),
  ('recall', 0.8241469816272966)],
 0.7423167848699763,
 0.7938461538461539,
 dict_keys(['Contract_Two year', 'Contract_Month-to-month', 'tenure', 'MonthlyCharges', 'InternetService_Fiber optic']))