In [None]:
import warnings
import numpy as np
import pandas as pd
from collections import Counter

# Sklearn imports
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score
from sklearn.preprocessing import StandardScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

**Contexto dos Dados**

- Base Kaggle: https://www.kaggle.com/datasets/blastchar/telco-customer-churn
- Telco é uma companhia que provê telefone e internet residencial
- Conteúdo: Cada linha representa um consumidor, cada coluna contém um atributo deste cliente na empresa

In [None]:
# Load the Telco churn CSV and discard the customer identifier column,
# then preview the first rows.
df = pd.read_csv("../Dados/telco_dataset.csv").drop(columns=['customerID'])
df.head()

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Share of customers whose Churn label is 'Yes' (class balance check).
churned_rows = df[df['Churn'] == 'Yes']
len(churned_rows) / len(df)

## Pré-processamento

### 1. Tratamento dos Dados

In [None]:
# TotalCharges contains blank entries; treat them as a charge of 0.
# Whitespace is stripped first so that any blank variant ("", " ", "  ") is
# handled, not only the exact single-space string. pd.to_numeric then raises
# on genuinely non-numeric text instead of letting it slip through to the
# later float cast.
total_charges_text = df['TotalCharges'].astype(str).str.strip()
df['TotalCharges'] = pd.to_numeric(total_charges_text.replace("", "0"))

In [None]:
# Numeric feature columns (cast to float and standardised later on).
num = [
    "tenure",
    "MonthlyCharges",
    "TotalCharges",
]

# Categorical feature columns (cast to stripped strings and encoded later on).
categ = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

In [None]:
# Cast every numeric column to float. A failing column is reported together
# with the reason and the loop continues (best-effort, as before), but
# KeyboardInterrupt/SystemExit are no longer swallowed by a bare except.
for col in num:
    try:
        df[col] = df[col].astype("float")
    except Exception as err:
        print("{}: {}".format(col, err))

In [None]:
# Normalise every categorical column to stripped strings. A failing column is
# reported together with the reason and the loop continues (best-effort, as
# before), but KeyboardInterrupt/SystemExit are no longer swallowed.
for col in categ:
    try:
        df[col] = df[col].astype("str").str.strip()
    except Exception as err:
        print("{}: {}".format(col, err))

### 2. Transformação dos Dados

In [None]:
# Encode the categorical columns (object dtype, plus SeniorCitizen):
#   * two-valued columns  -> a single 0/1 integer column
#   * every other column  -> one-hot dummy columns
categorical_cols = [c for c in df.columns if df[c].dtype == 'object' or c == 'SeniorCitizen']
df_categorical = df[categorical_cols].copy()

binary_cols = [c for c in categorical_cols if df_categorical[c].nunique() == 2]
multi_cols = [c for c in categorical_cols if c not in binary_cols]

for col in binary_cols:
    # sort=True makes the codes follow the sorted labels ('No' -> 0, 'Yes' -> 1)
    # instead of the order of first appearance, so the meaning of e.g.
    # Churn == 1 no longer depends on which customer happens to be in row 0.
    df_categorical[col] = pd.factorize(df_categorical[col], sort=True)[0]

# One call encodes all multi-valued columns; dummy columns are appended in
# the order of multi_cols.
df_categorical = pd.get_dummies(df_categorical, columns=multi_cols)

In [None]:
# Preview the first three rows of the encoded categorical frame.
df_categorical.head(3)

In [None]:
# List the column names of the encoded categorical frame.
df_categorical.columns

In [None]:
# Summary statistics of the numeric columns as stored in df (unscaled).
df[num].describe()

In [None]:
# Distribution of the unscaled TotalCharges column (histogram + KDE overlay).
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# replacement and keeps the density-normalised y-axis.
plt.figure(figsize=(8,3))
plt.title("Distribution for {}".format("TotalCharges"))
ax = sns.histplot(df["TotalCharges"], kde=True, stat="density")

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# Standardise the numeric columns to zero mean / unit variance.
# index=df.index keeps the row labels of df, so the later column-wise concat
# with df_categorical aligns row by row even if df's index is not 0..n-1.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test
# split performed further down - the test rows influence the mean/std.
df_std = pd.DataFrame(
    StandardScaler().fit_transform(df[num]),
    columns=num,
    index=df.index,
)
df_std.describe()

In [None]:
# Check that the shape of the distribution is preserved after scaling.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# replacement and keeps the density-normalised y-axis.
plt.figure(figsize=(8,3))
plt.title("Distribution for {}".format("TotalCharges"))
ax = sns.histplot(df_std["TotalCharges"], kde=True, stat="density")

In [None]:
# Put the scaled numeric columns and the encoded categorical columns side by
# side (rows are matched on the index), then display the result.
processed_parts = [df_std, df_categorical]
df_processed = pd.concat(processed_parts, axis=1)
df_processed

In [None]:
# (rows, columns) of the combined modelling frame.
df_processed.shape

## Construindo o modelo

### 1. Modelo base

In [None]:
# Hold out 20% of the rows for testing (fixed seed), and use every column
# except the identifier and the target as a predictor.
non_feature_cols = ['customerID', 'Churn']
train, test = train_test_split(df_processed, test_size=0.2, random_state=50)
predictors = [name for name in train.columns if name not in non_feature_cols]

In [None]:
# Peek at the first three predictor names.
predictors[:3]

In [None]:
# Fit the baseline logistic regression on the training split.
X_train, y_train = train[predictors], train['Churn']
X_test = test[predictors]

clf = LogisticRegression()
clf.fit(X_train, y_train)

# Hard labels (default 0.5 cut-off) and the probability of the second class.
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

# Filled in by the metrics cell below.
results = []

In [None]:
# Column 1 of clf.predict_proba for every test row.
y_prob

In [None]:
# https://scikit-learn.org/stable/modules/model_evaluation.html
# Score the baseline model. roc_auc is computed from the predicted
# probabilities; the label-based metrics use the hard predictions.
metrics = ['roc_auc', 'accuracy', 'precision']

# Explicit name -> function table instead of eval() on a formatted string.
scorers = {'accuracy': accuracy_score, 'precision': precision_score}

# Reset here so re-running this cell does not append duplicate entries.
results = []
for m in metrics:
    if m == 'roc_auc':
        results.append((m, roc_auc_score(test['Churn'], y_prob)))
    else:
        results.append((m, scorers[m](test['Churn'], y_pred)))

In [None]:
# (metric name, value) pairs collected for the baseline model.
results

In [None]:
# Re-score the already fitted model with a lower decision threshold.
threshold = 0.4
scorers = {'accuracy': accuracy_score, 'precision': precision_score}

y_prob_new_threshold = clf.predict_proba(test[predictors])[:, 1]
y_pred_new_threshold = (y_prob_new_threshold >= threshold).astype(int)

results = list()
for m in metrics:
    if m == 'roc_auc':
        # ROC AUC is a ranking metric and must be fed the probabilities.
        # Passing the thresholded 0/1 labels (as this cell used to do)
        # collapses the curve to a single operating point and is not
        # comparable with the baseline AUC. It does not depend on threshold.
        results.append((m, roc_auc_score(test['Churn'], y_prob_new_threshold)))
    else:
        results.append((m, scorers[m](test['Churn'], y_pred_new_threshold)))
results

In [None]:
def logistic_regression(df, metrics, test_size=0.2, random_state=50):
    """Fit a logistic regression on a hold-out split of ``df`` and score it.

    Parameters
    ----------
    df : pandas.DataFrame
        Modelling frame. The ``'Churn'`` column is the target; every other
        column except ``'customerID'`` is used as a predictor.
    metrics : iterable of str
        Metric names to compute. Supported: ``'roc_auc'`` (computed from the
        predicted probabilities), ``'accuracy'`` and ``'precision'``
        (computed from the hard predictions).
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int, default 50
        Seed passed to ``train_test_split``.

    Returns
    -------
    list of (str, float)
        One ``(metric name, value)`` tuple per requested metric, in the
        order given.

    Raises
    ------
    ValueError
        If a requested metric name is not supported.
    """
    # Explicit name -> function table instead of eval() on a formatted string.
    label_scorers = {'accuracy': accuracy_score, 'precision': precision_score}

    train, test = train_test_split(df, test_size=test_size, random_state=random_state)
    predictors = [c for c in train.columns if c not in ['customerID', 'Churn']]

    clf = LogisticRegression()
    clf.fit(train[predictors], train['Churn'])
    y_pred = clf.predict(test[predictors])  # hard labels, default 0.5 cut-off
    y_prob = clf.predict_proba(test[predictors])[:, 1]

    results = []
    for m in metrics:
        if m == 'roc_auc':
            results.append((m, roc_auc_score(test['Churn'], y_prob)))
        elif m in label_scorers:
            results.append((m, label_scorers[m](test['Churn'], y_pred)))
        else:
            raise ValueError("Unsupported metric: {!r}".format(m))

    return results

In [None]:
# Run the helper on the full processed frame with the metrics listed above.
logistic_regression(df_processed, metrics)

### 2. Melhorando o modelo

In [None]:
# Row counts per class: Churn == 0 first, then Churn == 1.
n_churn_0 = len(df_processed[df_processed["Churn"]==0])
n_churn_1 = len(df_processed[df_processed["Churn"]==1])
print(n_churn_0, n_churn_1)

In [None]:
# Balance the classes by undersampling: keep every Churn == 1 row and draw
# the same number of Churn == 0 rows. The draw is seeded so that the balanced
# frame, and every metric computed from it, is reproducible across runs.
df_processed_yes = df_processed[df_processed["Churn"]==1]
df_processed_no = df_processed[df_processed["Churn"]==0].sample(
    n=len(df_processed_yes),
    random_state=50,
)

In [None]:
# Stack the sampled Churn == 0 rows on top of the Churn == 1 rows.
balanced_parts = [df_processed_no, df_processed_yes]
df_processed_balanced = pd.concat(balanced_parts)

In [None]:
# (rows, columns) of the balanced frame.
df_processed_balanced.shape

In [None]:
# Distribution of TotalCharges in the balanced frame (histogram + KDE).
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# replacement and keeps the density-normalised y-axis.
plt.figure(figsize=(8,3))
plt.title("Distribution for {}".format("TotalCharges"))
ax = sns.histplot(df_processed_balanced["TotalCharges"], kde=True, stat="density")

In [None]:
# Run the same helper on the class-balanced frame.
logistic_regression(df_processed_balanced, metrics)

In [None]:
# Engineered features comparing the billed total with tenure * monthly charge.
df_tmp = df_processed_balanced.copy()

# The numeric columns in df_processed_balanced are z-scores, so computing the
# difference/ratio on them is not meaningful, and "TotalCharges + 1" gets
# close to zero for low-spend customers (z close to -1), blowing up the ratio.
# Use the unscaled values from df instead, matched on the row index.
raw_numeric = df.loc[df_tmp.index, num]
expected_total = raw_numeric['tenure'] * raw_numeric['MonthlyCharges']
df_tmp['charges_difference'] = raw_numeric['TotalCharges'] - expected_total
# +1 guards against division by zero for customers with no charges yet.
df_tmp['charges_ratio'] = expected_total / (raw_numeric['TotalCharges'] + 1)

# Bring the new columns onto the same scale as the other numeric predictors.
engineered_cols = ['charges_difference', 'charges_ratio']
df_tmp[engineered_cols] = StandardScaler().fit_transform(df_tmp[engineered_cols])

logistic_regression(df_tmp, metrics)

### 3. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [None]:
# Same 80/20 seeded split as before, now on the frame with the engineered
# features; identifier and target are excluded from the predictors.
non_feature_cols = ['customerID', 'Churn']
train, test = train_test_split(df_tmp, test_size=0.2, random_state=50)
predictors = [name for name in train.columns if name not in non_feature_cols]

In [None]:
# Fit a decision tree on the training split. random_state is fixed because
# the tree builder shuffles features when searching for splits, so an
# unseeded tree (and its metrics and plot) can change from run to run.
clf_tree = DecisionTreeClassifier(random_state=50)
clf_tree.fit(train[predictors], train['Churn'])

# Hard labels and the probability of the second class for the test rows.
y_pred_tree = clf_tree.predict(test[predictors])
y_prob_tree = clf_tree.predict_proba(test[predictors])[:, 1]

In [None]:
# Score the decision tree. roc_auc is computed from the predicted
# probabilities; the label-based metrics use the hard predictions.
# Explicit name -> function table instead of eval() on a formatted string.
scorers = {'accuracy': accuracy_score, 'precision': precision_score}

results = list()

for m in metrics:
    if m == 'roc_auc':
        results.append((m, roc_auc_score(test['Churn'], y_prob_tree)))
    else:
        results.append((m, scorers[m](test['Churn'], y_pred_tree)))
results

In [None]:
# Draw the top of the fitted tree (levels up to depth 3) with colour-filled
# nodes labelled by predictor name.
fig = plt.figure(figsize=(20,10))
plot_options = dict(max_depth=3, feature_names=predictors, filled=True)
_ = tree.plot_tree(clf_tree, **plot_options)

In [None]:
# Write the tree figure to a PNG, relative to the current working directory.
# NOTE(review): "decistion" looks like a typo for "decision"; renaming changes
# the output file name, so confirm nothing references it before fixing.
fig.savefig("decistion_tree.png")