In [None]:
import warnings
import numpy as np
import pandas as pd
from collections import Counter

# Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score
from sklearn.preprocessing import StandardScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

**Contexto dos Dados**

- Base Kaggle: https://www.kaggle.com/datasets/blastchar/telco-customer-churn
- Telco é uma companhia que provê telefone e internet residencial
- Conteúdo: Cada linha representa um cliente, e cada coluna contém um atributo desse cliente na empresa

In [None]:
# Read the Telco churn CSV and discard the customerID column in one chained
# expression, so `df` is bound exactly once per cell run.
df = pd.read_csv("../../Dados/telco_dataset.csv").drop(columns=['customerID'])

## Pré-processamento

### 1. Tratamento dos Dados

In [None]:
# Entries of TotalCharges that are exactly a single space are replaced by 0;
# every other value is left untouched.
df['TotalCharges'] = df['TotalCharges'].replace({" ": 0})

In [None]:
# Numeric feature columns (cast to float and standardised further down).
num = [
    "tenure",
    "MonthlyCharges",
    "TotalCharges",
]
# Categorical feature columns (cleaned and encoded further down).
categ = [
    "OnlineSecurity",
    "TechSupport",
]

In [None]:
# Keep only the selected numeric and categorical features plus the target.
# .copy() makes df_reduced an independent frame: the following cells assign
# to its columns, and doing that on a bare selection of `df` raises
# SettingWithCopyWarning and may not modify the intended object.
df_reduced = df[num + categ + ["Churn"]].copy()
df_reduced.head()

In [None]:
# Cast each numeric feature column to float.
# Best-effort: a column that cannot be converted is reported (name and
# reason) and left unchanged so the remaining columns are still processed.
# Only conversion errors are caught, so KeyboardInterrupt and unrelated
# failures are no longer silently swallowed.
for col in num:
    try:
        df_reduced[col] = df_reduced[col].astype("float")
    except (ValueError, TypeError) as exc:
        print(f"{col}: {exc}")

In [None]:
# Normalise each categorical feature column: cast to str, then strip
# leading/trailing whitespace.
# Best-effort: a failing column is reported (name and reason) and skipped.
# Only data-related errors are caught, so KeyboardInterrupt and similar are
# no longer swallowed by a bare `except`.
for col in categ:
    try:
        df_reduced[col] = df_reduced[col].astype("str").str.strip()
    except (ValueError, TypeError, AttributeError) as exc:
        print(f"{col}: {exc}")

### 2. Transformação dos Dados

In [None]:
# Encode every object-typed column of df_reduced into df_categorical:
#   * exactly two distinct values -> integer codes via pd.factorize
#   * anything else               -> one-hot columns via pd.get_dummies
categorical_cols = [c for c in df_reduced.columns if df_reduced[c].dtype == 'object']
df_categorical = df_reduced[categorical_cols].copy()
for col in categorical_cols:
    if df_categorical[col].nunique() == 2:
        # sort=True assigns codes by sorted label order instead of order of
        # first appearance, so the 0/1 meaning does not depend on which row
        # happens to come first. Later cells hard-code `Churn == 1`.
        # NOTE(review): presumably the Churn labels are "No"/"Yes", giving
        # No -> 0 and Yes -> 1 — confirm against the data.
        df_categorical[col] = pd.factorize(df_categorical[col], sort=True)[0]
    else:
        df_categorical = pd.get_dummies(df_categorical, columns=[col])

In [None]:
# Display the column labels of the encoded categorical frame.
df_categorical.columns

In [None]:
# Fit a StandardScaler on the numeric columns and build a frame of the
# scaled values. `scaler` is kept because it is pickled at the end of the
# notebook.
# NOTE(review): the scaler is fitted on all rows, before the train/test
# split further down — confirm that this is acceptable for the evaluation.
scaler = StandardScaler().fit(df_reduced[num])
# Reuse df_reduced's index: without it the new frame gets a fresh RangeIndex
# and the later column-wise concat with df_categorical would only line up
# if df_reduced happened to have a default index.
df_std = pd.DataFrame(
    scaler.transform(df_reduced[num]),
    columns=num,
    index=df_reduced.index,
)

In [None]:
# Put the scaled numeric columns and the encoded categorical columns side
# by side (rows are matched on the index), then preview the result.
df_processed = pd.concat((df_std, df_categorical), axis="columns")
df_processed.head()

In [None]:
# Balance the classes by undersampling: keep every Churn == 1 row and draw
# the same number of Churn == 0 rows at random.
df_yes = df_processed[df_processed["Churn"] == 1]
# random_state makes the draw reproducible across kernel restarts; 50 is the
# seed already used for train_test_split in this notebook.
df_no = df_processed[df_processed["Churn"] == 0].sample(n=len(df_yes), random_state=50)
df_balanced = pd.concat([df_no, df_yes])

In [None]:
# Derived feature: (tenure * MonthlyCharges) / (TotalCharges + 1).
# NOTE(review): df_balanced comes from df_processed, whose numeric columns
# are the StandardScaler outputs, so the denominator can be at or near zero
# — confirm this is the intended definition.
df_balanced['charges_ratio'] = (
    df_balanced['tenure']
    .mul(df_balanced['MonthlyCharges'])
    .div(df_balanced['TotalCharges'].add(1))
)

In [None]:
# Display the (rows, columns) shape of the balanced frame.
df_balanced.shape

## Construindo o modelo

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [None]:
# Hold out 20% of the balanced rows for evaluation; random_state fixes the
# shuffle for a given input frame.
train, test = train_test_split(df_balanced, random_state=50, test_size=0.2)
# Model inputs: every column except the identifier and the target.
predictors = [name for name in train.columns if name not in {'customerID', 'Churn'}]

In [None]:
# Display the list of column names used as model inputs.
predictors

In [None]:
# Preview the first rows of the held-out test split.
test.head()

In [None]:
# Fit a decision tree on the training split and report accuracy on the
# held-out test split.
# random_state is set because the tree learner randomises feature order
# when choosing splits; without a seed the fitted tree (and the pickled
# model) can change between runs. 50 matches the seed used for the split.
clf_tree = DecisionTreeClassifier(random_state=50)
clf_tree.fit(train[predictors], train['Churn'])
y_pred_tree = clf_tree.predict(test[predictors])
print(accuracy_score(test['Churn'], y_pred_tree))

In [None]:
# Draw the top of the fitted tree (root plus two levels) on a wide figure.
# Nodes are colour-filled and labelled with the predictor names.
fig, ax = plt.subplots(figsize=(20, 10))
_ = tree.plot_tree(
    clf_tree,
    max_depth=2,
    feature_names=predictors,
    filled=True,
    ax=ax,
)

In [None]:
import pickle

In [None]:
# Persist the fitted decision tree. The `with` block closes the file handle
# (flushing the pickle to disk) even if serialisation raises; the bare
# open() call left the handle open until garbage collection.
with open("churn_tree_model.pkl", 'wb') as model_file:
    pickle.dump(clf_tree, model_file)

In [None]:
# Persist the fitted scaler. The `with` block closes the file handle
# (flushing the pickle to disk) even if serialisation raises; the bare
# open() call left the handle open until garbage collection.
with open("scaler.pkl", 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)