# Machine Learning from the trenches









**Ariel Rossanigo**

### ¿Quién soy?

* Ariel Rossanigo
* Profe de Inteligencia Artificial
* Developer, Data Scientist



### Objetivos de la charla

### Árboles de decisión

<center><img src="./decision_tree.png" height="100%"></center>


### El problema a atacar

<center><img src="./customer-churn.jpg" width="800px"></center>

Acá nos pasamos a modo Jupyter

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Leemos el archivo y miramos un poco los datos

In [None]:
# Load the dataset from the CSV file 'telecom_churn.csv'.
data = pd.read_csv('telecom_churn.csv')
# Normalize the column names: lowercase them and replace every run of spaces
# with a single underscore, so columns can be accessed as attributes
# (e.g. data.churn).
data.columns = data.columns.str.lower().str.replace('( )+', '_', regex=True)

In [None]:
# Dimensions of the dataset as (rows, columns).
data.shape

In [None]:
# Preview the first 4 rows.
data.head(4)

In [None]:
# Distribution of the target column; normalize=True yields proportions
# instead of raw counts.
data.churn.value_counts(normalize=True)

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns.
data.describe(include='all')

### Separando conjuntos en train y test

*¿Por qué?*


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state=42 fixes the seed so
# the split is reproducible.
# NOTE(review): the split is not stratified on churn -- consider whether
# stratify=data.churn is wanted given the class imbalance discussed later.
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Show the sizes of both partitions.
train.shape, test.shape

### sklearn en una celda

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Feature columns: the label slice of `train` running from
# 'number_vmail_messages' through 'customer_service_calls' (both included).
columnas = train.loc[:, 'number_vmail_messages':'customer_service_calls'].columns

# fit() returns the estimator itself, so creation and training can be chained.
predictor = DecisionTreeClassifier(random_state=42).fit(train[columnas], train.churn)

# Accuracy on the data the tree was trained on and on the held-out test set.
acc_train = metrics.accuracy_score(
    y_true=train.churn,
    y_pred=predictor.predict(train[columnas]),
)
acc_test = metrics.accuracy_score(
    y_true=test.churn,
    y_pred=predictor.predict(test[columnas]),
)

print(f"El accuracy en train es {acc_train:.2%}")
print(f"El accuracy en test es {acc_test:.2%}")

### Uso de pipelines y DataFrameMapper

In [None]:
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import Pipeline

# Two-step pipeline: the DataFrameMapper picks the `columnas` features out of
# the DataFrame (a None transformer leaves them untransformed) and hands them
# to the decision tree.
pipeline = Pipeline(steps=[
    ('features', DataFrameMapper([
        (columnas, None)
    ])),
    ('predictor', DecisionTreeClassifier(random_state=42))
])

# Previous version, kept for comparison; the pipeline is now given the whole
# DataFrame and does the column selection itself:
# predictor.fit(train[columnas], train.churn)
pipeline.fit(train, train.churn)

# Previous version, kept for comparison:
# acc_train = metrics.accuracy_score(train.churn, predictor.predict(train[columnas]))
# acc_test = metrics.accuracy_score(test.churn, predictor.predict(test[columnas]))
acc_train = metrics.accuracy_score(train.churn, pipeline.predict(train))
acc_test = metrics.accuracy_score(test.churn, pipeline.predict(test))

print(f"El accuracy en train es {acc_train:.2%}")
print(f"El accuracy en test es {acc_test:.2%}")

### Mejor escribimos una función para mostrar las métricas


In [None]:
def show_metrics(model, train, test):
    """Print classification scores and plot confusion matrices for both datasets.

    For ``train`` and then ``test``, the labels in the dataset's ``churn``
    column are compared with ``model.predict(dataset)``. Accuracy, precision,
    recall and F1 are printed on one line per dataset, and the confusion
    matrix is drawn as an annotated heatmap in its own subplot.

    Args:
        model: Fitted estimator; ``predict`` is called with the whole dataset.
        train: Training data exposing the true labels as ``churn``.
        test: Test data with the same layout as ``train``.
    """
    class_labels = ['No churn', 'Churn']
    scorers = (
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
    )
    _, axes = plt.subplots(1, 2, figsize=(15, 3))

    for (title, dataset), panel in zip([('Train', train), ('Test ', test)], axes):
        actual = dataset.churn
        predicted = model.predict(dataset)

        # Scores are evaluated in the order they are printed.
        acc, prec, rec, f1 = (score(actual, predicted) for score in scorers)
        print(f"{title} ## Accuracy: {acc:7.2%} Precision: {prec:7.2%} "
              f" Recall: {rec:7.2%} F1-score: {f1:7.2%}")

        matrix = metrics.confusion_matrix(actual, predicted)
        sns.heatmap(matrix, cbar=False, cmap='Greens', annot=True, fmt='d', ax=panel)
        panel.set_title(f'CM {title}')
        panel.set_ylabel('Real')
        panel.set_xlabel('Predicho')
        panel.set_xticklabels(class_labels)
        panel.set_yticklabels(class_labels)

In [None]:
# Print the scores and plot the confusion matrices of the current pipeline.
show_metrics(pipeline, train, test)

### 2 Problemas: Overfitting

<center><img src="./overfitting.jpg" width="500px"></center>

In [None]:
# Same features as before, but the tree is capped at max_depth=4 to limit
# how closely it can fit the training data.
pipeline = Pipeline(steps=[
    ('features', DataFrameMapper([
        (columnas, None)
    ])),
    ('predictor', DecisionTreeClassifier(max_depth=4, random_state=42))
])

# Refit and compare train vs. test metrics.
pipeline.fit(train, train.churn)
show_metrics(pipeline, train, test)

### 2 Problemas: Clases desbalanceadas

<center><img src="./unbalance.png" width="400px"></center>

In [None]:
# Same depth-limited tree, now with class_weight='balanced' so that the
# classes are weighted when fitting instead of being treated equally
# regardless of their frequency.
pipeline = Pipeline(steps=[
    ('features', DataFrameMapper([
        (columnas, None)
    ])),
    ('predictor', DecisionTreeClassifier(max_depth=4, class_weight='balanced', random_state=42))
])

# Refit and compare train vs. test metrics.
pipeline.fit(train, train.churn)
show_metrics(pipeline, train, test)

### ¿Todos entienden las métricas? 

<center><img src="./misunderstandings.jpg" width="400px"></center>

In [None]:
# One panel per dataset: rows are ranked by predicted probability (highest
# first) and the running count of actual churners is plotted against the
# rank position.
fig, axis = plt.subplots(1, 2, figsize=(15, 3))
for (name, ds), ax in zip([('Train', train), ('Test ', test)], axis):
    y_true = ds.churn
    # Column 1 of predict_proba; presumably the positive (churn) class --
    # TODO confirm the class order of the fitted predictor.
    y_prob = pipeline.predict_proba(ds)[:, 1]
    ax.set_title(name)
    df = pd.DataFrame({'y_true': y_true, 'y_prob': y_prob})
    df_sorted = df.sort_values(by='y_prob', ascending=False)
    # reset_index(drop=True) replaces the original row labels with 0..n-1 so
    # the x axis is the position in the ranking.
    df_sorted.y_true.cumsum().reset_index(drop=True).plot.line(ax=ax, grid=True)

In [None]:
def show_metrics(model, train, test, desired_recall=0.7, total_true=483):
    """Print scores, plot confusion matrices and estimate a report size.

    For ``train`` and then ``test``, precision, recall and F1 of
    ``model.predict(dataset)`` against the dataset's ``churn`` column are
    printed and the confusion matrix is drawn as an annotated heatmap.

    Afterwards, using the test dataset only, rows are ranked by the
    probability in column 1 of ``model.predict_proba`` and the precision among
    the top-ranked rows needed to reach ``desired_recall`` is measured. That
    precision is used to extrapolate how many rows a report must contain to
    find ``desired_recall`` of ``total_true`` positives.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``;
            both are called with the whole dataset.
        train: Training data exposing the true labels as ``churn``.
        test: Test data with the same layout as ``train``.
        desired_recall: Fraction of the positives that the report should find.
        total_true: Number of positives the report size is extrapolated to.

    Raises:
        ValueError: If the precision at the requested recall is zero or
            undefined, so no report size can be derived.
    """
    splits = [('Train', train), ('Test ', test)]
    class_labels = ['No churn', 'Churn']
    _, axes = plt.subplots(1, 2, figsize=(15, 3))

    for (name, ds), ax in zip(splits, axes):
        y_true = ds.churn
        y_pred = model.predict(ds)
        precision = metrics.precision_score(y_true, y_pred)
        recall = metrics.recall_score(y_true, y_pred)
        f1_score = metrics.f1_score(y_true, y_pred)

        print(f"{name} ## Precision: {precision:6.2%} Recall: {recall:6.2%} F1-score: {f1_score:6.2%}")
        sns.heatmap(metrics.confusion_matrix(y_true, y_pred),
                    cbar=False, cmap='Greens', annot=True, fmt='d', ax=ax)
        ax.set_title(f'CM {name}')
        ax.set_ylabel('Real')
        ax.set_xlabel('Predicho')
        ax.set_xticklabels(class_labels)
        ax.set_yticklabels(class_labels)

    # The report-size estimate is based on the last split (the test set), made
    # explicit here instead of relying on variables leaked from the loop.
    name, ds = splits[-1]
    y_true = ds.churn
    true_wanted = y_true.sum() * desired_recall

    # Use the model passed in, not whatever global pipeline happens to exist.
    y_prob = model.predict_proba(ds)[:, 1]
    ranked = pd.DataFrame({'y_true': y_true, 'y_prob': y_prob}).sort_values(
        by='y_prob', ascending=False)
    ranked['found_so_far'] = ranked.y_true.cumsum()

    # Precision among the top-ranked rows inspected before exceeding the target.
    prec_at_recall = ranked.loc[ranked.found_so_far <= true_wanted, 'y_true'].mean()
    if not prec_at_recall > 0:
        raise ValueError(
            f"Cannot estimate a report size: precision at recall "
            f"{desired_recall} is {prec_at_recall}"
        )

    churns_to_find = int(total_true * desired_recall)
    report_size = int(churns_to_find / prec_at_recall)
    print(f"{name} ## Report size to find {churns_to_find:d} of {total_true:d} churns: {report_size:d}")

In [None]:
# Report metrics plus the estimated report size needed to reach 70% recall.
show_metrics(pipeline, train, test, desired_recall=0.7)

### Quedaba más información...

In [None]:
# Preview the columns of `train` that are not part of the feature set `columnas`.
train[[x for x in train.columns if x not in columnas]].head()

In [None]:
from sklearn.preprocessing import FunctionTransformer

In [None]:
def yes_no_to_boolean(x):
    """Return whether ``x`` equals the string ``'Yes'``.

    The comparison is delegated to ``==``, so array-like inputs that compare
    element-wise yield an element-wise boolean result. The match is
    case-sensitive.
    """
    is_yes = (x == 'Yes')
    return is_yes

# Extended feature set: every column in `columnas` is mapped individually with
# no transformer, the two plan columns go through yes_no_to_boolean, and
# account_length is added untransformed. The tree depth is raised to 6.
pipeline = Pipeline(steps=[
    ('features', DataFrameMapper(
        [(c, None) for c in columnas] + [
        ('international_plan', FunctionTransformer(yes_no_to_boolean, validate=False)),
        ('voice_mail_plan', FunctionTransformer(yes_no_to_boolean, validate=False)),
        ('account_length', None),
    ])),
    ('predictor', DecisionTreeClassifier(max_depth=6, class_weight='balanced', random_state=42))
])


# Refit and compare train vs. test metrics.
pipeline.fit(train, train.churn)
show_metrics(pipeline, train, test)

### ¿Cómo decide?

In [None]:
from dtreeviz.trees import *

In [None]:
# Look the pipeline steps up by name: the mapper produces the feature matrix
# and the feature names, and the fitted tree is what gets drawn.
X = pipeline.named_steps['features'].transform(train)
predictor = pipeline.named_steps['predictor']
names = pipeline.named_steps['features'].transformed_names_

# Render the whole tree against the training data.
viz = dtreeviz(
    predictor,
    X,
    train.churn,
    target_name='churn',
    feature_names=names,
    class_names=['No Churn', 'Churn'],
)
viz.view()

Un solo ejemplo

In [None]:
# Look the pipeline steps up by name: the mapper produces the feature matrix
# and the feature names, and the fitted tree is what gets drawn.
X = pipeline.named_steps['features'].transform(train)
predictor = pipeline.named_steps['predictor']
names = pipeline.named_steps['features'].transformed_names_

# Same tree, additionally passing the first transformed training row as the
# single observation `X` to visualize.
viz = dtreeviz(
    predictor,
    X_train=X,
    y_train=train.churn,
    target_name='churn',
    feature_names=names,
    class_names=['No Churn', 'Churn'],
    X=X[0],
)
viz.view()

### Algo que funciona mejor... Ensembles

<center><img src="./ensemble.jpg" width="400px"></center>

In [None]:
from xgboost import XGBClassifier

In [None]:
# Same feature mapping as the previous decision-tree pipeline, with an
# XGBClassifier (max_depth=5) as the predictor.
pipeline = Pipeline(steps=[
    ('features', DataFrameMapper(
        [(c, None) for c in columnas] + [
        ('international_plan', FunctionTransformer(yes_no_to_boolean, validate=False)),
        ('voice_mail_plan', FunctionTransformer(yes_no_to_boolean, validate=False)),
        ('account_length', None),
    ])),
    ('predictor', XGBClassifier(max_depth=5, random_state=42))
])

# Per-class weights relative to the rarest class: the least frequent class
# gets weight 1 and every other class gets min_count / class_count.
c = train.churn.value_counts()
class_weight = c.min() / c
# Look up each training row's weight from its churn label.
sample_weight = train.churn.map(class_weight)

# The 'predictor__' prefix hands sample_weight to the fit of the 'predictor' step.
pipeline.fit(train, train.churn, predictor__sample_weight=sample_weight.values)
show_metrics(pipeline, train, test)

### ¡Gracias! ¿Preguntas?


<div style="float: left;"><img src="../common/imgs/man-qmark.jpg" width="300" align="middle"></div> 

<div>
<div>
  <img src="../common/imgs/gmail-1162901_960_720.png" style="width: 30px; float: left; vertical-align:middle; margin: 0px;">
  <span style="line-height:30px; vertical-align:middle; margin-left: 10px;">arielrossanigo@gmail.com</span>
</div>
<div>
  <img src="../common/imgs/twitter-312464_960_720.png" style="width: 30px; float: left; vertical-align:middle; margin: 0px;">
  <span style="line-height:30px; vertical-align:middle; margin-left: 10px;">@arielrossanigo</span>
</div>
<div>
  <img src="../common/imgs/github-154769__340.png" style="width: 30px; float: left; vertical-align:middle; margin: 0px;">
  <span style="line-height:30px; vertical-align:middle; margin-left: 10px;">https://github.com/arielrossanigo</span>
</div>
<div>
  <img src="../common/imgs/Linkedin_icon.svg" style="width: 30px; float: left; vertical-align:middle; margin: 0px;">
  <span style="line-height:30px; vertical-align:middle; margin-left: 10px;">https://www.linkedin.com/in/arielrossanigo/</span>
</div>

</div>