# Trabajo ML

## Importamos las librerías necesarias

#### Librerías básicas para el tratamiento con datos y posteriores gráficos

In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

#### Importamos funciones que iremos necesitando a lo largo del trabajo

In [2]:
from sklearn.utils import resample

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.preprocessing import LabelEncoder

# Librería necesaria para poder separar en conjuntos de training y testing
from sklearn.model_selection import train_test_split

#### Importamos también una librería que nos ayudará a codificar las variables categóricas

In [3]:
import category_encoders as ce

#### Importamos por otro lado la función tree necesaria para el posterior tree decision model

In [4]:
from sklearn import tree

#### Importamos las funciones necesarias para después poder hallar las medidas necesarias

In [5]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, roc_curve, precision_score, accuracy_score, f1_score

In [6]:
from sklearn.feature_selection import SelectFromModel, RFE

#### Mostramos a continuación la creación de dos funciones propias que se han utilizado para encapsular código.

In [7]:
# Creamos la función evaluate_model que nos servirá más adelante para poder ver los valores de 
# diferentes medidas obtenidas por el modelo
def evaluate_model(ytest, ypred, ypred_proba = None):
    if ypred_proba is not None:
        print('ROC-AUC score of the model: {}'.format(roc_auc_score(ytest, ypred_proba[:, 1])))
    print('Accuracy of the model: {}\n'.format(accuracy_score(ytest, ypred)))
    print('Classification report: \n{}\n'.format(classification_report(ytest, ypred)))
    print('Confusion matrix: \n{}\n'.format(confusion_matrix(ytest, ypred)))    

## Lectura del fichero csv

In [8]:
df_selected = pd.read_csv("../data/df_selected.csv")

In [9]:
# Echamos un vistazo rápido a la pinta que tiene el data frame
df_selected

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,...,application_type,acc_open_past_24mths,mort_acc,pub_rec_bankruptcies,tax_liens,disbursement_method,issue_month,issue_year,earliest_cr_year,credit_history
0,11575.0,11575.0,11575.0,36,7.35,359.26,A,A4,6,OWN,...,0,8.0,2.0,1.0,0.0,0,Jun,2017,1994,23
1,7200.0,7200.0,7200.0,36,24.85,285.70,E,E3,2,RENT,...,0,2.0,0.0,0.0,0.0,0,Jun,2017,2000,17
2,7500.0,7500.0,7500.0,36,7.35,232.79,A,A4,7,MORTGAGE,...,0,13.0,4.0,0.0,0.0,0,Jun,2017,2013,4
3,10000.0,10000.0,10000.0,60,16.02,243.29,C,C5,7,RENT,...,0,7.0,0.0,2.0,0.0,0,Jun,2017,2006,11
4,14000.0,14000.0,14000.0,36,16.02,492.34,C,C5,7,MORTGAGE,...,0,5.0,1.0,0.0,0.0,0,Jun,2017,2008,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442330,11575.0,11575.0,11575.0,36,15.59,404.61,D,D1,10,RENT,...,0,1.0,0.0,0.0,0.0,0,Jan,2015,1999,16
442331,12000.0,12000.0,12000.0,36,11.99,398.52,B,B5,1,MORTGAGE,...,0,11.0,1.0,0.0,0.0,0,Jan,2015,1995,20
442332,13000.0,13000.0,13000.0,60,15.99,316.07,D,D2,5,RENT,...,0,7.0,0.0,1.0,0.0,0,Jan,2015,2003,12
442333,12000.0,12000.0,12000.0,60,19.99,317.86,E,E3,1,RENT,...,0,6.0,0.0,0.0,0.0,0,Jan,2015,2003,12


#### Vamos ahora a mirar qué pinta tienen las variables del dataset. Para ello utilizaremos la función describe(). Añadiremos también un  include = "all" para obligar a que me incluya todas las columnas del dataset en el describe().

In [10]:
df_selected.describe(include = "all")

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,...,application_type,acc_open_past_24mths,mort_acc,pub_rec_bankruptcies,tax_liens,disbursement_method,issue_month,issue_year,earliest_cr_year,credit_history
count,442335.0,442335.0,442335.0,442335.0,442335.0,442335.0,442335,442335,442335.0,442335,...,442335.0,442335.0,442335.0,442335.0,442335.0,442335.0,442335,442335.0,442335.0,442335.0
unique,,,,,,,7,35,,4,...,,,,,,,12,,,
top,,,,,,,C,C1,,MORTGAGE,...,,,,,,,Oct,,,
freq,,,,,,,124885,26548,,220364,...,,,,,,,56111,,,
mean,14566.416856,14566.416856,14561.066882,42.140378,13.255884,439.93532,,,5.767755,,...,0.002258,4.795908,1.776111,0.147868,0.056475,2e-05,,2014.599414,1998.159006,16.440408
std,8500.018979,8500.018979,8496.756229,10.472109,4.471144,252.505972,,,3.722447,,...,0.04747,3.143748,2.089641,0.397284,0.419215,0.004511,,0.670304,7.464446,7.429709
min,1000.0,1000.0,900.0,36.0,5.32,14.01,,,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,,2014.0,1944.0,3.0
25%,8000.0,8000.0,8000.0,36.0,9.99,256.9,,,2.0,,...,0.0,3.0,0.0,0.0,0.0,0.0,,2014.0,1994.0,11.0
50%,12500.0,12500.0,12500.0,36.0,12.99,378.76,,,6.0,,...,0.0,4.0,1.0,0.0,0.0,0.0,,2015.0,2000.0,15.0
75%,20000.0,20000.0,20000.0,60.0,15.99,579.72,,,10.0,,...,0.0,6.0,3.0,0.0,0.0,0.0,,2015.0,2003.0,20.0


In [11]:
# filtro el df_selected en función de si la variable loan_status toma el valor 0 o 1
df_major = df_selected[df_selected.loan_status == 0]
df_minor = df_selected[df_selected.loan_status == 1]

In [12]:
df_minor_upsmapled = resample(df_minor, replace = True, n_samples = 358436, random_state = 12345)
df_minor_upsmapled

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,...,application_type,acc_open_past_24mths,mort_acc,pub_rec_bankruptcies,tax_liens,disbursement_method,issue_month,issue_year,earliest_cr_year,credit_history
400464,28250.0,28250.0,28250.0,36,15.61,987.76,D,D1,2,MORTGAGE,...,0,9.0,3.0,0.0,0.0,0,Apr,2015,1994,21
366148,12175.0,12175.0,12175.0,60,25.80,363.09,G,G1,1,RENT,...,0,8.0,1.0,0.0,0.0,0,May,2015,1997,18
22099,2000.0,2000.0,2000.0,36,18.54,72.85,E,E1,2,MORTGAGE,...,0,20.0,2.0,1.0,0.0,0,Dec,2014,1988,26
396340,5000.0,5000.0,5000.0,36,14.65,172.48,C,C5,6,OWN,...,0,3.0,0.0,0.0,0.0,0,Mar,2015,1981,34
32729,12400.0,12400.0,12400.0,60,8.67,255.43,B,B1,1,RENT,...,0,1.0,0.0,0.0,0.0,0,Nov,2014,1983,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157970,21075.0,21075.0,20975.0,60,19.47,552.17,D,D5,1,OWN,...,0,4.0,6.0,0.0,0.0,0,May,2014,1994,20
243800,10000.0,10000.0,10000.0,36,12.05,332.39,C,C1,1,OWN,...,0,3.0,0.0,0.0,0.0,0,Dec,2015,2005,10
21700,5825.0,5825.0,5825.0,36,18.54,212.17,E,E1,10,OWN,...,0,4.0,0.0,0.0,2.0,0,Dec,2014,2003,11
236605,35000.0,35000.0,34750.0,36,14.48,1204.40,C,C5,1,MORTGAGE,...,0,5.0,0.0,0.0,1.0,0,Dec,2015,2000,15


In [13]:
df_minor_upsmapled = pd.concat([df_minor_upsmapled, df_major])
df_minor_upsmapled

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,...,application_type,acc_open_past_24mths,mort_acc,pub_rec_bankruptcies,tax_liens,disbursement_method,issue_month,issue_year,earliest_cr_year,credit_history
400464,28250.0,28250.0,28250.0,36,15.61,987.76,D,D1,2,MORTGAGE,...,0,9.0,3.0,0.0,0.0,0,Apr,2015,1994,21
366148,12175.0,12175.0,12175.0,60,25.80,363.09,G,G1,1,RENT,...,0,8.0,1.0,0.0,0.0,0,May,2015,1997,18
22099,2000.0,2000.0,2000.0,36,18.54,72.85,E,E1,2,MORTGAGE,...,0,20.0,2.0,1.0,0.0,0,Dec,2014,1988,26
396340,5000.0,5000.0,5000.0,36,14.65,172.48,C,C5,6,OWN,...,0,3.0,0.0,0.0,0.0,0,Mar,2015,1981,34
32729,12400.0,12400.0,12400.0,60,8.67,255.43,B,B1,1,RENT,...,0,1.0,0.0,0.0,0.0,0,Nov,2014,1983,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442326,13150.0,13150.0,13150.0,60,14.99,312.77,C,C5,8,MORTGAGE,...,0,0.0,3.0,0.0,0.0,0,Jan,2015,1994,21
442327,12000.0,12000.0,12000.0,36,9.49,384.34,B,B2,4,RENT,...,0,3.0,0.0,0.0,0.0,0,Jan,2015,2006,9
442328,4000.0,4000.0,4000.0,36,8.67,126.59,B,B1,10,MORTGAGE,...,0,8.0,1.0,1.0,0.0,0,Jan,2015,2002,13
442330,11575.0,11575.0,11575.0,36,15.59,404.61,D,D1,10,RENT,...,0,1.0,0.0,0.0,0.0,0,Jan,2015,1999,16


In [14]:
df_minor_upsmapled.loan_status.value_counts()

1    358436
0    345335
Name: loan_status, dtype: int64

In [15]:
df_minor_upsmapled.dtypes

loan_amnt               float64
funded_amnt             float64
funded_amnt_inv         float64
term                      int64
int_rate                float64
installment             float64
grade                    object
sub_grade                object
emp_length                int64
home_ownership           object
annual_inc              float64
verification_status       int64
loan_status               int64
purpose                  object
addr_state               object
dti                     float64
delinq_2yrs             float64
fico_range_low          float64
fico_range_high         float64
open_acc                float64
pub_rec                 float64
revol_bal               float64
revol_util              float64
total_acc               float64
initial_list_status       int64
application_type          int64
acc_open_past_24mths    float64
mort_acc                float64
pub_rec_bankruptcies    float64
tax_liens               float64
disbursement_method       int64
issue_mo

In [16]:
X = df_minor_upsmapled.drop("loan_status", axis = 1)
Y = df_minor_upsmapled.loan_status

In [17]:
Y = LabelEncoder().fit_transform(Y)

## Comenzamos el estudio con semilla 0

#### Separamos los conjuntos X e Y en training y testing mediante la función train_test_split(). Para ello vamos a utilizar la semilla 0 y un tamaño de testing del 25%

In [18]:
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.25, random_state=0)

#### Veamos ahora qué tipo de variables tenemos en xtrain. Nos interesa tener las variables en formato numérico para poder hacer el posterior análisis mediante un árbol de decisión.

In [19]:
xtrain.dtypes

loan_amnt               float64
funded_amnt             float64
funded_amnt_inv         float64
term                      int64
int_rate                float64
installment             float64
grade                    object
sub_grade                object
emp_length                int64
home_ownership           object
annual_inc              float64
verification_status       int64
purpose                  object
addr_state               object
dti                     float64
delinq_2yrs             float64
fico_range_low          float64
fico_range_high         float64
open_acc                float64
pub_rec                 float64
revol_bal               float64
revol_util              float64
total_acc               float64
initial_list_status       int64
application_type          int64
acc_open_past_24mths    float64
mort_acc                float64
pub_rec_bankruptcies    float64
tax_liens               float64
disbursement_method       int64
issue_month              object
issue_ye

In [20]:
xtrain.describe()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,annual_inc,verification_status,dti,...,initial_list_status,application_type,acc_open_past_24mths,mort_acc,pub_rec_bankruptcies,tax_liens,disbursement_method,issue_year,earliest_cr_year,credit_history
count,527828.0,527828.0,527828.0,527828.0,527828.0,527828.0,527828.0,527828.0,527828.0,527828.0,...,527828.0,527828.0,527828.0,527828.0,527828.0,527828.0,527828.0,527828.0,527828.0,527828.0
mean,14908.606213,14908.606213,14902.696015,43.586517,14.072943,445.751804,5.697413,73242.79,0.267265,19.167385,...,0.560042,0.00187,5.018891,1.65488,0.151765,0.058009,2.5e-05,2014.608869,1998.380552,16.228317
std,8493.332604,8493.332604,8489.83709,11.158916,4.565485,249.899654,3.736866,69838.32,0.442532,8.771095,...,0.496382,0.043202,3.227543,2.024245,0.403533,0.403209,0.004963,0.626757,7.442686,7.41424
min,1000.0,1000.0,900.0,36.0,5.32,14.01,0.0,100.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014.0,1944.0,3.0
25%,8225.0,8225.0,8225.0,36.0,10.99,266.06,2.0,45000.0,0.0,12.76,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2014.0,1995.0,11.0
50%,13175.0,13175.0,13175.0,36.0,13.67,386.18,6.0,62000.0,0.0,18.74,...,1.0,0.0,5.0,1.0,0.0,0.0,0.0,2015.0,2000.0,15.0
75%,20000.0,20000.0,20000.0,60.0,16.99,582.08,10.0,88000.0,1.0,25.3,...,1.0,0.0,7.0,3.0,0.0,0.0,0.0,2015.0,2003.0,20.0
max,40000.0,40000.0,40000.0,60.0,30.99,1587.23,10.0,8900060.0,1.0,999.0,...,1.0,1.0,53.0,34.0,12.0,85.0,1.0,2017.0,2014.0,70.0


#### Veamos qué columnas son de tipo object y cuales de tipo entero y float

In [21]:
num_xtrain = xtrain.select_dtypes(include=['int64', 'float64']).columns
cat_xtrain = xtrain.select_dtypes(include=['object']).columns

#### Dado que tenemos que modificar las columnas de tipo object, haremos uso del one hot encoding para convertirlas a numéricas. Este cambio se hace añadiendo columnas nuevas al data frame. Estas columnas tomarán únicamente los valores 1 o 0 dependiendo de si el individuo tiene una categoria determinada o no.

In [22]:
ohe = ce.OneHotEncoder(cols=cat_xtrain)
model = ohe.fit(xtrain)

In [23]:
x_train_t = model.transform(xtrain)
x_train_t

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade_1,grade_2,grade_3,grade_4,...,issue_month_6,issue_month_7,issue_month_8,issue_month_9,issue_month_10,issue_month_11,issue_month_12,issue_year,earliest_cr_year,credit_history
405387,13000.0,13000.0,13000.0,60,7.89,262.91,1,0,0,0,...,0,0,0,0,0,0,0,2015,2002,13
234295,10000.0,10000.0,10000.0,36,11.99,332.10,0,1,0,0,...,0,0,0,0,0,0,0,2015,1997,18
312077,25000.0,25000.0,25000.0,60,9.17,521.03,0,0,1,0,...,0,0,0,0,0,0,0,2015,1992,23
292927,35000.0,35000.0,35000.0,60,16.99,869.66,0,0,0,1,...,0,0,0,0,0,0,0,2015,1990,25
158822,35000.0,35000.0,35000.0,36,20.49,1309.49,0,0,0,0,...,0,0,0,0,0,0,0,2014,1998,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1429,12000.0,12000.0,11750.0,36,10.91,392.36,0,0,1,0,...,0,0,0,0,1,0,0,2017,2003,14
373782,18000.0,18000.0,18000.0,60,9.99,382.36,0,0,1,0,...,0,0,0,0,0,1,0,2015,2001,14
412304,12000.0,12000.0,12000.0,60,16.55,295.34,0,0,0,1,...,0,0,0,0,0,0,0,2015,2002,13
92961,25000.0,25000.0,25000.0,60,14.99,594.62,0,1,0,0,...,0,0,0,0,0,0,1,2014,1985,29


## Aplicamos 

In [29]:
clf = tree.DecisionTreeClassifier()

In [42]:
sel_ridge = SelectFromModel(clf, threshold = 1e-4)
sel_ridge.fit(x_train_t, ytrain)

SelectFromModel(estimator=DecisionTreeClassifier(), threshold=0.0001)

In [44]:
df_coeficientes = pd.DataFrame(
                        {'predictor': x_train_t.columns,
                         'coef': sel_ridge.estimator_.feature_importance.flatten()}
                  )

fig, ax = plt.subplots(figsize=(16, 3.84))
ax.stem(df_coeficientes.predictor, df_coeficientes.coef, markerfmt=' ')
plt.xticks(rotation=90, ha='right', size=10)
ax.set_xlabel('variable')
ax.set_ylabel('coeficientes')
ax.set_title('Coeficientes del modelo ridge');

AttributeError: 'DecisionTreeClassifier' object has no attribute 'feature_importance'

In [35]:
sel_ridge.estimator_

DecisionTreeClassifier()

In [49]:
list_ = list()
for i in np.arange(0.01,0.1,0.01):
    sel_ridge = SelectFromModel(clf, threshold = i)
    sel_ridge.fit(x_train_t, ytrain)
    selected_feat = x_train_t.columns[sel_ridge.get_support()]
    list_.append(len(selected_feat))

list_

[21, 15, 9, 6, 5, 4, 2, 1, 1]

In [28]:
rfe = RFE(clf, n_features_to_select=30)
fit = rfe.fit(x_train_t, ytrain)
fit.support_

KeyboardInterrupt: 

#### Ya estamos en condiciones para introducir el modelo de árbol de decisión. Para ello necesitamos la función tree que hemos importado al principio de la práctica

In [None]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train_t, ytrain)
clf

#### Dibujamos el decision tree con 4 nodos de exploración máximos para que sea más fácil de visualizar

In [None]:
tree.plot_tree(clf, max_depth=4, filled=True, rounded=True, fontsize=5, label="none")
plt.show()

In [None]:
x_test_t = model.transform(xtest)
x_test_t

## Comenzamos con las predicciones

In [None]:
predictions = clf.predict(x_test_t)
predictions

In [None]:
predictions_proba = clf.predict_proba(x_test_t)
predictions_proba


Hacemos la predicción de la cuarta fila del dataset


In [None]:
predict_4 = clf.predict(x_test_t)[4]
predict_4 == ytest[4]

In [None]:
print("Clasification report")
classification_report(ytest, predictions)

In [None]:
#DATAFRAME_medidas_1 = medidas(x_train_t, ytrain, x_test_t, ytest, profundidad=51)

In [None]:
#round(DATAFRAME_medidas_1.pct_change(),3)

In [None]:
#sns.lineplot(data = DATAFRAME_medidas_1)

### Curva ROC

In [None]:
# keep probabilities for the positive outcome only
yhat = predictions_proba[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)
# plot the roc curve for the model
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(fpr, tpr, marker='.', label='Decision Tree')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
# show the plot
plt.show()

### Matriz de confusión

#### Mostremos además la matriz de confusión. 

##### Por un lado mostraremos esta matriz en valores absolutos y, después la mostraremos en valores relativos. De esta forma podremos comparar mejor los valores obtenidos

In [None]:
cm_gbt = confusion_matrix(ytest, predictions)
cm_gbt_norm = confusion_matrix(ytest, predictions, normalize="true")

plt.figure(figsize=(7,7))
sns.heatmap(cm_gbt, annot=True, fmt=".3f", linewidths=1, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title("Confusion matrix")

plt.figure(figsize=(7,7))
sns.heatmap(cm_gbt_norm, annot=True, fmt=".3f", linewidths=1, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title("Normalized confusion matrix")

## Cambiamos de semilla a 12345.

#### Cambiamos la semilla de 0 a 12345. Realizaremos los mismos cálculos y compararemos los resultados obtenidos con ambas semillas

In [None]:
xtrain_2, xtest_2, ytrain_2, ytest_2 = train_test_split(X, Y, test_size=0.25, random_state=12345)

In [None]:
xtrain_2.dtypes

In [None]:
xtrain_2.describe()

In [None]:
num_xtrain_2 = xtrain_2.select_dtypes(include=['int64', 'float64']).columns
cat_xtrain_2 = xtrain_2.select_dtypes(include=['object']).columns

##### Ahora pasamos las variables categóricas a one hot encoding

In [None]:
ohe_2 = ce.OneHotEncoder(cols=cat_xtrain_2)
model_2 = ohe_2.fit(xtrain_2)

In [None]:
x_train_t_2 = model_2.transform(xtrain_2)
x_train_t_2

In [None]:
x_test_t_2 = model_2.transform(xtest_2)
x_test_t_2

In [None]:
clf_2 = tree.DecisionTreeClassifier()
clf_2 = clf_2.fit(x_train_t_2, ytrain_2)
clf_2

#### Dibujamos el decision tree con 4 nodos de exploración máximos para que sea más fácil de visualizar

In [None]:
tree.plot_tree(clf, max_depth=4, filled=True, rounded=True, fontsize=5, label="none")
plt.show()

In [None]:
x_test_t_2 = model_2.transform(xtest_2)
x_test_t_2

## Comenzamos con las predicciones para nuestra nueva semilla

In [None]:
predictions_2 = clf_2.predict(x_test_t_2)
predictions_2

In [None]:
predictions_proba_2 = clf_2.predict_proba(x_test_t_2)
predictions_proba_2


#### Hacemos la predicción de la cuarta fila del dataset


In [None]:
predict_4_2 = clf_2.predict(x_test_t_2)[4]
predict_4_2 == ytest_2[4]

In [None]:
print("Clasification report")
classification_report(ytest_2, predictions_2)

### Calculamos el data frame de las medidas

In [None]:
#DATAFRAME_medidas_2 = medidas(x_train_t_2, x_test_t=x_test_t_2, ytrain=ytrain_2, ytest=ytest_2, profundidad=31)

In [None]:
#round(DATAFRAME_medidas_2.pct_change(),3)

In [None]:
#sns.lineplot(data = DATAFRAME_medidas_2)

### Curva ROC

In [None]:
# keep probabilities for the positive outcome only
yhat_2 = predictions_proba_2[:, 1]
# calculate roc curves
fpr_2, tpr_2, thresholds_2 = roc_curve(ytest_2, yhat_2)
# plot the roc curve for the model
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(fpr_2, tpr_2, marker='.', label='Decision Tree')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
# show the plot
plt.show()

### Matriz de confusión

#### Mostremos además la matriz de confusión. 

#### Por un lado mostraremos esta matriz en valores absolutos y, después la mostraremos en valores relativos. De esta forma podremos comparar mejor los valores obtenidos

In [None]:
cm_gbt_2 = confusion_matrix(ytest_2, predictions_2)
cm_gbt_norm_2 = confusion_matrix(ytest_2, predictions_2, normalize="true")

plt.figure(figsize=(7,7))
sns.heatmap(cm_gbt_2, annot=True, fmt=".3f", linewidths=1, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title("Confusion matrix")

plt.figure(figsize=(7,7))
sns.heatmap(cm_gbt_norm_2, annot=True, fmt=".3f", linewidths=1, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title("Normalized confusion matrix")

#### Creamos ahora un data frame en el que se muestra la medida F1 y los valores de accuracy y precision tanto para la semilla 0 como para la semilla 12345. También mostraremos las diferencias en porcentaje

In [None]:
p1 = precision_score(ytest, predictions)
p2 = precision_score(ytest_2, predictions_2)
f1 = f1_score(ytest, predictions)
f2 = f1_score(ytest_2, predictions_2)
ac1 = accuracy_score(ytest, predictions)
ac2 = accuracy_score(ytest_2, predictions_2)

dict_medidas = {'F1': [f1, f2, (abs(f1-f2)/max(f1,f2))*100], "accuracy" :[ac1, ac2, (abs(ac1-ac2)/max(ac1,ac2))*100], 
        'precision': [p1, p2, (abs(p1-p2)/max(p1,p2))*100]
       }
    
pd.DataFrame(dict_medidas, columns=["F1", "precision", 'accuracy'], index = ["semilla 0", "semilla 12345", 'diff %'])

#### Haciendo uso de la función evaluate_model creada al principio, podemos ver las diferentes medidas obtenidas para la semilla 0 y para la semilla 12345

In [None]:
evaluate_model(ytest, predictions, ypred_proba = predictions_proba)

In [None]:
evaluate_model(ytest_2, predictions_2, ypred_proba = predictions_proba_2)