In [None]:
import pandas as pd
import numpy as np
import joblib

In [None]:
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn diagnostics (e.g. convergence
# or chained-assignment warnings) emitted by later cells; consider a narrower filter.
warnings.filterwarnings("ignore")

In [None]:
train_path='datos/trainEAT.csv'
test_path='datos/testEAT.csv'

# Obtener dataset

In [None]:
df_gen = pd.read_csv('datos/TCA_general.csv')
df_T1 = pd.read_csv('datos/TCA_T1.csv')
df_T2 = pd.read_csv('datos/TCA_T2.csv')

In [None]:
# Build the binary target from the T2 columns whose name contains 'EAT':
# coerce every item to numeric (unparseable entries become NaN), add the items
# per row (NaN is skipped by sum) and flag rows whose total is above 20.
# NOTE(review): the comparison is strictly "> 20"; confirm this matches the
# questionnaire's official cut-off.
eat_items = df_T2.filter(like='EAT').apply(pd.to_numeric, errors='coerce')
eat_total = eat_items.sum(axis=1)
y = np.where(eat_total > 20, 1, 0)
y

In [None]:
df = df_T1

In [None]:
import re

# Identifier, date, diagnosis and medication columns that are not used as features.
# EDAD is dropped here as well; a later cell adds it back from df_T1.
unused_columns = ['ID', 'FECHA', 'EDAD', 'DIAGNOSTICO', 'OTROSDIAG', 'FECHAALTA', 'NINGUNOTRODIAG',
                  'ANSIEDAD', 'DEPRESIÓN', 'TPERSO', 'TBIPOLAR', 'TPSICOTICO',
                  'OtrosdiagnósticosademásdelTCA_B', 'MEDICACION1', 'MEDICACION2', 'MEDICACION3']
df = df.drop(columns=unused_columns)

# Remove every column whose name contains one of these fragments.
for pattern in (r'SEIQOLCUE', r'SEIRANK', r'RESI2coment'):
    regex = re.compile(pattern)
    df = df.drop(columns=[name for name in df.columns if regex.search(name)])

In [None]:
df['labels'] = y

In [None]:
for column in df.columns:
    df[column] = df[column].replace(' ', '', regex=True).replace('', np.nan).astype(float)

In [None]:
for column in df_gen.columns:
    df_gen[column] = df_gen[column].replace(' ', '', regex=True).replace('', np.nan)

In [None]:
df_gen[['AÑOSPADECIENDO','AÑOSTTO','EDADCOMIENZO']].info()

In [None]:
# Replace every remaining missing value, in every column, with the constant 2 (in place).
# NOTE(review): the rationale for the value 2 is not visible in this notebook — confirm
# it is a valid "neutral" answer for all questionnaires involved.
df.fillna(2, inplace=True)

In [None]:
df['EDAD'] = df_T1['EDAD'].replace(' ', '', regex=True).replace('', np.nan).astype(float)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.info()

In [None]:
conteo_clases = df['labels'].value_counts()

print(conteo_clases)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Separate the features from the target and remember the feature names *before*
# scaling: fit_transform returns a bare ndarray. The names must come from X, not
# from df.columns[:-1], because 'EDAD' was appended after 'labels'; slicing df's
# columns mislabelled the scaled EDAD column as 'labels', which was then
# overwritten, silently dropping EDAD from the dataset.
X = df.drop(['labels'], axis=1)
feature_names = X.columns

# Scale every feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set min/max values leak into the training features.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Rebuild a DataFrame with the correct feature names and df's index so the
# label assignment below aligns row by row.
df_scaled = pd.DataFrame(X, columns=feature_names, index=df.index)

# Append the target column to the scaled DataFrame.
df_scaled['labels'] = df['labels']

In [None]:
df_scaled.to_csv('datos/EAT.csv')

In [None]:
conteo_clases = df_scaled['labels'].value_counts()

print(conteo_clases)

In [None]:
from sklearn.model_selection import train_test_split

train_test_ratio = 0.8
df_train, df_test = train_test_split(df_scaled, train_size = train_test_ratio, stratify=df_scaled['labels'], random_state = 1)

In [None]:
df_train.info()

In [None]:
df_test.info()

In [None]:
# Persist the splits without the pandas row index. With the default index=True the
# index is written as an unnamed first column, which pd.read_csv(train_path) later
# loads as 'Unnamed: 0' and the full-dataset experiment then uses as a feature.
df_train.to_csv(train_path, index=False)
df_test.to_csv(test_path, index=False)

# Análisis del dataset

In [None]:
df.info()

In [None]:
df.dtypes

In [None]:
df.describe()

In [None]:
df.isna().sum()

In [None]:
df = df.dropna()

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.hist(layout=(120, 4), figsize=(10,80));

In [None]:
# Compute the pairwise correlation matrix of all columns in df
corr_matrix = df.corr()

# Collect every (row column, other column, coefficient) triple; the alias below
# is the unfiltered matrix — the threshold is applied only when printing
corr_coeff_1_or_minus_1 = corr_matrix
correlation_pairs = []

# Iterate over the rows of the correlation matrix, skipping NaN coefficients
for row in corr_coeff_1_or_minus_1.iterrows():
    col = row[0]
    corr_coeff = row[1].dropna()
    for index, value in corr_coeff.items():
        correlation_pairs.append((col, index, value))

# Print the pairs of distinct columns whose coefficient is above 0.9 or below -0.9.
# The matrix is symmetric, so each pair is reported twice (A-B and B-A).
for pair in correlation_pairs:
    col1, col2, corr_coeff = pair
    if (corr_coeff > 0.9 or corr_coeff < -0.9) and col1 != col2:
      print("Column '{}' has correlation coefficient {} with column '{}'".format(col1, corr_coeff, col2))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

correlation = df.corr()

# Create a heatmap with Seaborn
sns.heatmap(correlation, cmap="RdYlGn")

# Show the plot
plt.title("Correlation Matrix")
plt.show()

# Modelos

In [None]:
# Importar las bibliotecas necesarias
import pandas as pd
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

### Regresión logística

In [None]:
def logistic_regresion(X_train, y_train, X_test, y_test):
  """Tune a logistic regression with a 5-fold grid search and predict the test set.

  Args:
    X_train: training features.
    y_train: training labels.
    X_test: test features to predict.
    y_test: true test labels, used only to print the accuracy.

  Returns:
    The predicted labels for X_test.
  """
  # Candidate values for the inverse regularisation strength C.
  param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

  grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
  grid.fit(X_train, y_train)

  print("Mejores hiperparámetros: ", grid.best_params_)

  # GridSearchCV refits the best configuration on the whole training set by
  # default (refit=True), so reuse that model instead of fitting it a second time.
  lr_best = grid.best_estimator_
  lr_pred = lr_best.predict(X_test)
  print("Precisión de regresión logística con los mejores hiperparámetros:", accuracy_score(y_test, lr_pred))

  return lr_pred

### Árbol de decisión

In [None]:
def decision_tree(X_train, y_train, X_test, y_test):
  """Tune a decision tree with a 5-fold grid search and predict the test set.

  Args:
    X_train: training features.
    y_train: training labels.
    X_test: test features to predict.
    y_test: true test labels, used only to print the accuracy.

  Returns:
    The predicted labels for X_test.
  """
  # Hyperparameter values explored by the grid search.
  param_grid = {'max_depth': [2, 4, 6, 8, 10],
                'min_samples_split': [2, 4, 6, 8, 10]}

  grid = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
  grid.fit(X_train, y_train)

  print("Mejores hiperparámetros: ", grid.best_params_)

  # The grid search already refit the winning configuration on all training
  # data (refit=True default); reuse it rather than training another tree.
  dt_best = grid.best_estimator_
  dt_pred = dt_best.predict(X_test)
  print("Precisión del árbol de decisión con los mejores hiperparámetros:", accuracy_score(y_test, dt_pred))

  return dt_pred

### SVM

In [None]:
def svm(X_train, y_train, X_test, y_test):
  """Tune an SVC with a 5-fold grid search and predict the test set.

  Args:
    X_train: training features.
    y_train: training labels.
    X_test: test features to predict.
    y_test: true test labels, used only to print the accuracy.

  Returns:
    The predicted labels for X_test.
  """
  # Hyperparameter values explored by the grid search.
  param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.1, 1, 10, 100]}

  # The estimator is created inline so no local variable shadows this function's name.
  grid = GridSearchCV(SVC(), param_grid, cv=5)
  grid.fit(X_train, y_train)

  print("Mejores hiperparámetros: ", grid.best_params_)

  # Reuse the model GridSearchCV refit on the full training set (refit=True
  # default) instead of fitting an identical SVC again.
  svm_best = grid.best_estimator_
  svm_pred = svm_best.predict(X_test)
  print("Precisión de SVM con los mejores hiperparámetros:", accuracy_score(y_test, svm_pred))

  return svm_pred

### Redes neuronales

In [None]:
def neural_network(X_train, y_train, X_test, y_test):
  """Tune an MLP classifier with a 5-fold grid search and predict the test set.

  Args:
    X_train: training features.
    y_train: training labels.
    X_test: test features to predict.
    y_test: true test labels, used only to print the accuracy.

  Returns:
    The predicted labels for X_test.
  """
  # Hyperparameter values explored by the grid search.
  param_grid = {'hidden_layer_sizes': [(10,), (50,), (100,)],
                'activation': ['relu', 'logistic'],
                'learning_rate': ['constant', 'adaptive']}

  grid = GridSearchCV(MLPClassifier(), param_grid, cv=5)
  grid.fit(X_train, y_train)

  print("Mejores hiperparámetros: ", grid.best_params_)

  # GridSearchCV already trained the best configuration on the whole training
  # set (refit=True default); reuse it instead of training a second network.
  nn_best = grid.best_estimator_
  nn_pred = nn_best.predict(X_test)
  print("Precisión de redes neuronales con los mejores hiperparámetros:", accuracy_score(y_test, nn_pred))

  return nn_pred

### KNN

In [None]:
def k_neighbors(X_train, y_train, X_test, y_test):
  """Tune a k-nearest-neighbours classifier with a 5-fold grid search and predict the test set.

  Args:
    X_train: training features.
    y_train: training labels.
    X_test: test features to predict.
    y_test: true test labels, used only to print the accuracy.

  Returns:
    The predicted labels for X_test.
  """
  # Hyperparameter values explored by the grid search.
  param_grid = {'n_neighbors': [3, 5, 7],
                'weights': ['uniform', 'distance'],
                'p': [1, 2]}

  grid_search = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=5)
  grid_search.fit(X_train, y_train)

  print("Mejores hiperparámetros: ", grid_search.best_params_)

  # Reuse the estimator refit by GridSearchCV (refit=True default) instead of
  # rebuilding and fitting the same configuration by hand.
  knn_best = grid_search.best_estimator_
  knn_pred = knn_best.predict(X_test)
  print("Precisión de KNN:", accuracy_score(y_test, knn_pred))

  return knn_pred

### Random forest

In [None]:
def random_forest(X_train, y_train, X_test, y_test):
  """Tune a random forest with a 5-fold grid search and predict the test set.

  Args:
    X_train: training features.
    y_train: training labels.
    X_test: test features to predict.
    y_test: true test labels, used only to print the accuracy.

  Returns:
    The predicted labels for X_test.
  """
  # Hyperparameter values explored by the grid search.
  param_grid = {'n_estimators': [100, 200, 500],
                'max_depth': [5, 10, 20],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]}

  grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5)
  grid_search.fit(X_train, y_train)

  print("Mejores hiperparámetros: ", grid_search.best_params_)

  # GridSearchCV already refit the best forest on the full training set
  # (refit=True default); reuse it instead of growing another forest.
  rf_best = grid_search.best_estimator_
  rf_pred = rf_best.predict(X_test)
  print("Precisión de Random Forest:", accuracy_score(y_test, rf_pred))

  return rf_pred

### Gradient Boosting

In [None]:
def gradient_boosting(X_train, y_train, X_test, y_test):
  """Tune a gradient boosting classifier with a 5-fold grid search and predict the test set.

  Args:
    X_train: training features.
    y_train: training labels.
    X_test: test features to predict.
    y_test: true test labels, used only to print the accuracy.

  Returns:
    The predicted labels for X_test.
  """
  # Hyperparameter values explored by the grid search.
  param_grid = {'n_estimators': [100, 200, 500],
                'learning_rate': [0.01, 0.1, 1],
                'max_depth': [3, 5, 10],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]}

  grid_search = GridSearchCV(GradientBoostingClassifier(), param_grid=param_grid, cv=5)
  grid_search.fit(X_train, y_train)

  print("Mejores hiperparámetros: ", grid_search.best_params_)

  # Reuse the model GridSearchCV refit on the full training set (refit=True
  # default) instead of training the same ensemble a second time.
  gb_best = grid_search.best_estimator_
  gb_pred = gb_best.predict(X_test)
  # The message previously said "Random Forest" (copy-paste slip); it now names this model.
  print("Precisión de Gradient Boosting:", accuracy_score(y_test, gb_pred))
  return gb_pred

### Naive Bayes

In [None]:
def naive_bayes(X_train, y_train, X_test, y_test):
  """Fit a Gaussian Naive Bayes classifier and predict the test set.

  Args:
    X_train: training features.
    y_train: training labels.
    X_test: test features to predict.
    y_test: true test labels, used only to print the accuracy.

  Returns:
    A tuple (predicted labels for X_test, fitted GaussianNB model).
  """
  # fit() returns the estimator itself, so creation and training can be chained.
  nb = GaussianNB().fit(X_train, y_train)
  nb_pred = nb.predict(X_test)

  print("Precisión de Naive Bayes:", accuracy_score(y_test, nb_pred))

  return nb_pred, nb

# Resultados

In [None]:
# Results table layout: four consecutive columns per model. Only the first column
# of each group carries the model name; the 'Metrics' row labels the four metrics.
model_names = ['Regresión Logística', 'Árbol de decisión', 'SVM', 'Red neuronal',
               'KNN', 'Random Forest', 'Gradient Boosting', 'Naive Bayes']
metric_names = ['ROC AUC', 'Especificidad', 'Sensibilidad', 'F1-score']

columns = []
for model_name in model_names:
    columns.extend([model_name, '', '', ''])

results = pd.DataFrame(columns=columns)
results.loc['Metrics'] = metric_names * len(model_names)
results

In [None]:
def calculateSpecificity(y_true, y_pred):
  """Return the specificity (true-negative rate) for binary labels encoded as 0/1.

  Args:
    y_true: true labels.
    y_pred: predicted labels.

  Returns:
    tn / (tn + fp), or NaN when y_true contains no negative samples.
  """
  # Pin the label order so the matrix is always 2x2; without it, a split where
  # only one class appears yields a 1x1 matrix and the 4-way unpacking fails.
  tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

  negatives = tn + fp
  if negatives == 0:
    # No negative samples: specificity is undefined.
    return float('nan')
  return tn / negatives

In [None]:
import matplotlib.pyplot as plt

def calculateImportance(nb, X, y):
    """Plot the permutation importance (ROC AUC scoring) of each feature of X.

    Args:
        nb: fitted estimator to inspect.
        X: DataFrame of features; its column names label the bars.
        y: true labels corresponding to X.
    """
    perm = permutation_importance(nb, X, y, scoring='roc_auc')
    mean_importance = perm.importances_mean

    # Ascending order, so the bar drawn last (top of the chart) is the most important.
    order = mean_importance.argsort()
    positions = range(len(order))

    plt.figure(figsize=(8, 8))
    plt.barh(positions, mean_importance[order], color='b', align='center')
    plt.yticks(positions, X.columns[order])
    plt.xlabel('Importance')
    plt.ylabel('Features')
    plt.show()

## Clasificación con dataset completo

In [None]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

X_train = df_train.drop(['labels'], axis=1)
y_train = df_train['labels']
X_test = df_test.drop(['labels'], axis=1)
y_test = df_test['labels']

In [None]:
lr_pred = logistic_regresion(X_train, y_train, X_test, y_test)

In [None]:
dt_pred = decision_tree(X_train, y_train, X_test, y_test)

In [None]:
svm_pred = svm(X_train, y_train, X_test, y_test)

In [None]:
nn_pred = neural_network(X_train, y_train, X_test, y_test)

In [None]:
knn_pred = k_neighbors(X_train, y_train, X_test, y_test)

In [None]:
rf_pred = random_forest(X_train, y_train, X_test, y_test)

In [None]:
gb_pred = gradient_boosting(X_train, y_train, X_test, y_test)

In [None]:
nb_pred, nb_all = naive_bayes(X_train, y_train, X_test, y_test)

In [None]:
# One row per experiment: for each model (in table column order) compute
# ROC AUC, specificity, recall and F1 from its hard test-set predictions.
# NOTE(review): ROC AUC is computed from predicted labels, not from scores/probabilities.
model_preds = (lr_pred, dt_pred, svm_pred, nn_pred, knn_pred, rf_pred, gb_pred, nb_pred)
metric_fns = (roc_auc_score, calculateSpecificity, recall_score, f1_score)
results.loc['Completo'] = [fn(y_test, pred) for pred in model_preds for fn in metric_fns]

### Explainability

In [None]:
calculateImportance(nb_all, X_test, y_test)

## Clasificación con dataset completo (extracción de características)

In [None]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [None]:
# Each aggregated feature is the row-wise sum of the columns whose name matches
# the associated regex. The same recipe is applied to the train and test splits.
score_patterns = {
    'WHOQOL': '^WHOQOL',
    'HAD': '^HAD',
    'EAT': '^EAT',
    'RESI': '^RESI[^_]',
    'SEIGGOODDOING': '^SEIGOODDOING',
    'RESI_ULTIM': '^RESI_',
}

df_train_ec = pd.DataFrame()
df_test_ec = pd.DataFrame()

for source, target in ((df_train, df_train_ec), (df_test, df_test_ec)):
    for score_name, pattern in score_patterns.items():
        c = source.filter(regex=pattern).columns.tolist()
        target[score_name] = source[c].sum(axis=1)
    # Carry the target column over unchanged.
    target['labels'] = source['labels']

In [None]:
X_train = df_train_ec.drop(['labels'], axis=1)
y_train = df_train_ec['labels']
X_test = df_test_ec.drop(['labels'], axis=1)
y_test = df_test_ec['labels']

In [None]:
lr_pred = logistic_regresion(X_train, y_train, X_test, y_test)

In [None]:
dt_pred = decision_tree(X_train, y_train, X_test, y_test)

In [None]:
svm_pred = svm(X_train, y_train, X_test, y_test)

In [None]:
nn_pred = neural_network(X_train, y_train, X_test, y_test)

In [None]:
knn_pred = k_neighbors(X_train, y_train, X_test, y_test)

In [None]:
rf_pred = random_forest(X_train, y_train, X_test, y_test)

In [None]:
gb_pred = gradient_boosting(X_train, y_train, X_test, y_test)

In [None]:
nb_pred, nb_ec = naive_bayes(X_train, y_train, X_test, y_test)

In [None]:
# One row per experiment: for each model (in table column order) compute
# ROC AUC, specificity, recall and F1 from its hard test-set predictions.
# NOTE(review): ROC AUC is computed from predicted labels, not from scores/probabilities.
model_preds = (lr_pred, dt_pred, svm_pred, nn_pred, knn_pred, rf_pred, gb_pred, nb_pred)
metric_fns = (roc_auc_score, calculateSpecificity, recall_score, f1_score)
results.loc['Completo (EC)'] = [fn(y_test, pred) for pred in model_preds for fn in metric_fns]

### Explainability

In [None]:
calculateImportance(nb_ec, X_test, y_test)

In [None]:
#calculateShapley(nb_ec, X_test, 0)

## Clasificación con WHOQOL

In [None]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

X_train = df_train.drop(['labels'], axis=1).filter(regex='^WHOQOL')
y_train = df_train['labels']
X_test = df_test.drop(['labels'], axis=1).filter(regex='^WHOQOL')
y_test = df_test['labels']

In [None]:
lr_pred = logistic_regresion(X_train, y_train, X_test, y_test)

In [None]:
dt_pred = decision_tree(X_train, y_train, X_test, y_test)

In [None]:
svm_pred = svm(X_train, y_train, X_test, y_test)

In [None]:
nn_pred = neural_network(X_train, y_train, X_test, y_test)

In [None]:
knn_pred = k_neighbors(X_train, y_train, X_test, y_test)

In [None]:
rf_pred = random_forest(X_train, y_train, X_test, y_test)

In [None]:
gb_pred = gradient_boosting(X_train, y_train, X_test, y_test)

In [None]:
nb_pred, nb_who = naive_bayes(X_train, y_train, X_test, y_test)

In [None]:
# One row per experiment: for each model (in table column order) compute
# ROC AUC, specificity, recall and F1 from its hard test-set predictions.
# NOTE(review): ROC AUC is computed from predicted labels, not from scores/probabilities.
model_preds = (lr_pred, dt_pred, svm_pred, nn_pred, knn_pred, rf_pred, gb_pred, nb_pred)
metric_fns = (roc_auc_score, calculateSpecificity, recall_score, f1_score)
results.loc['WHOQOL'] = [fn(y_test, pred) for pred in model_preds for fn in metric_fns]

### Explainability

In [None]:
calculateImportance(nb_who, X_test, y_test)

In [None]:
#calculateShapley(nb_who, X_test, 0)

## Clasificación con HAD

In [None]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

X_train = df_train.drop(['labels'], axis=1).filter(regex='^HAD')
y_train = df_train['labels']
X_test = df_test.drop(['labels'], axis=1).filter(regex='^HAD')
y_test = df_test['labels']

In [None]:
lr_pred = logistic_regresion(X_train, y_train, X_test, y_test)

In [None]:
dt_pred = decision_tree(X_train, y_train, X_test, y_test)

In [None]:
svm_pred = svm(X_train, y_train, X_test, y_test)

In [None]:
nn_pred = neural_network(X_train, y_train, X_test, y_test)

In [None]:
knn_pred = k_neighbors(X_train, y_train, X_test, y_test)

In [None]:
rf_pred = random_forest(X_train, y_train, X_test, y_test)

In [None]:
gb_pred = gradient_boosting(X_train, y_train, X_test, y_test)

In [None]:
nb_pred, nb_had = naive_bayes(X_train, y_train, X_test, y_test)

In [None]:
# One row per experiment: for each model (in table column order) compute
# ROC AUC, specificity, recall and F1 from its hard test-set predictions.
# NOTE(review): ROC AUC is computed from predicted labels, not from scores/probabilities.
model_preds = (lr_pred, dt_pred, svm_pred, nn_pred, knn_pred, rf_pred, gb_pred, nb_pred)
metric_fns = (roc_auc_score, calculateSpecificity, recall_score, f1_score)
results.loc['HAD'] = [fn(y_test, pred) for pred in model_preds for fn in metric_fns]

### Explainability

In [None]:
calculateImportance(nb_had, X_test, y_test)

In [None]:
#calculateShapley(nb_had, X_test, 0)

## Clasificación con EAT

In [None]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

X_train = df_train.drop(['labels'], axis=1).filter(regex='^EAT')
y_train = df_train['labels']
X_test = df_test.drop(['labels'], axis=1).filter(regex='^EAT')
y_test = df_test['labels']

In [None]:
lr_pred = logistic_regresion(X_train, y_train, X_test, y_test)

In [None]:
dt_pred = decision_tree(X_train, y_train, X_test, y_test)

In [None]:
svm_pred = svm(X_train, y_train, X_test, y_test)

In [None]:
nn_pred = neural_network(X_train, y_train, X_test, y_test)

In [None]:
knn_pred = k_neighbors(X_train, y_train, X_test, y_test)

In [None]:
rf_pred = random_forest(X_train, y_train, X_test, y_test)

In [None]:
gb_pred = gradient_boosting(X_train, y_train, X_test, y_test)

In [None]:
nb_pred, nb_eat = naive_bayes(X_train, y_train, X_test, y_test)

In [None]:
# One row per experiment: for each model (in table column order) compute
# ROC AUC, specificity, recall and F1 from its hard test-set predictions.
# NOTE(review): ROC AUC is computed from predicted labels, not from scores/probabilities.
model_preds = (lr_pred, dt_pred, svm_pred, nn_pred, knn_pred, rf_pred, gb_pred, nb_pred)
metric_fns = (roc_auc_score, calculateSpecificity, recall_score, f1_score)
results.loc['EAT'] = [fn(y_test, pred) for pred in model_preds for fn in metric_fns]

### Explainability

In [None]:
calculateImportance(nb_eat, X_test, y_test)

In [None]:
#calculateShapley(nb_eat, X_test, 0)

## Clasificación con RESI

In [None]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

X_train = df_train.drop(['labels'], axis=1).filter(regex='^RESI[^_]')
y_train = df_train['labels']
X_test = df_test.drop(['labels'], axis=1).filter(regex='^RESI[^_]')
y_test = df_test['labels']

In [None]:
lr_pred = logistic_regresion(X_train, y_train, X_test, y_test)

In [None]:
# NOTE(review): exact duplicate of the previous cell — it repeats the same grid
# search and overwrites lr_pred with an equivalent result; one of the two can be removed.
lr_pred = logistic_regresion(X_train, y_train, X_test, y_test)

In [None]:
dt_pred = decision_tree(X_train, y_train, X_test, y_test)

In [None]:
svm_pred = svm(X_train, y_train, X_test, y_test)

In [None]:
nn_pred = neural_network(X_train, y_train, X_test, y_test)

In [None]:
knn_pred = k_neighbors(X_train, y_train, X_test, y_test)

In [None]:
rf_pred = random_forest(X_train, y_train, X_test, y_test)

In [None]:
gb_pred = gradient_boosting(X_train, y_train, X_test, y_test)

In [None]:
nb_pred, nb_resi = naive_bayes(X_train, y_train, X_test, y_test)
joblib.dump(nb_resi, "nb_resi_model.pkl")

In [None]:
# One row per experiment: for each model (in table column order) compute
# ROC AUC, specificity, recall and F1 from its hard test-set predictions.
# NOTE(review): ROC AUC is computed from predicted labels, not from scores/probabilities.
model_preds = (lr_pred, dt_pred, svm_pred, nn_pred, knn_pred, rf_pred, gb_pred, nb_pred)
metric_fns = (roc_auc_score, calculateSpecificity, recall_score, f1_score)
results.loc['RESI'] = [fn(y_test, pred) for pred in model_preds for fn in metric_fns]

### Explainability

In [None]:
calculateImportance(nb_resi, X_test, y_test)

In [None]:
#calculateShapley(nb_resi, X_test, 0)

## Clasificación con RED-5

In [None]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

X_train = df_train[['RESI16', 'RESI27', 'RESI28', 'RESI30', 'RESI31']]
y_train = df_train['labels']
X_test = df_test[['RESI16', 'RESI27', 'RESI28', 'RESI30', 'RESI31']]
y_test = df_test['labels']

In [None]:
lr_pred = logistic_regresion(X_train, y_train, X_test, y_test)

In [None]:
# NOTE(review): exact duplicate of the previous cell — it repeats the same grid
# search and overwrites lr_pred with an equivalent result; one of the two can be removed.
lr_pred = logistic_regresion(X_train, y_train, X_test, y_test)

In [None]:
dt_pred = decision_tree(X_train, y_train, X_test, y_test)

In [None]:
svm_pred = svm(X_train, y_train, X_test, y_test)

In [None]:
nn_pred = neural_network(X_train, y_train, X_test, y_test)

In [None]:
knn_pred = k_neighbors(X_train, y_train, X_test, y_test)

In [None]:
rf_pred = random_forest(X_train, y_train, X_test, y_test)

In [None]:
gb_pred = gradient_boosting(X_train, y_train, X_test, y_test)

In [None]:
# NOTE(review): this reuses the name nb_resi, replacing the model fitted on the full
# RESI item set in the "Clasificación con RESI" section with the 5-feature model.
# Re-running the earlier RESI explainability cell after this one would use the wrong model.
nb_pred, nb_resi = naive_bayes(X_train, y_train, X_test, y_test)

In [None]:
# One row per experiment: for each model (in table column order) compute
# ROC AUC, specificity, recall and F1 from its hard test-set predictions.
# NOTE(review): ROC AUC is computed from predicted labels, not from scores/probabilities.
model_preds = (lr_pred, dt_pred, svm_pred, nn_pred, knn_pred, rf_pred, gb_pred, nb_pred)
metric_fns = (roc_auc_score, calculateSpecificity, recall_score, f1_score)
results.loc['RED5'] = [fn(y_test, pred) for pred in model_preds for fn in metric_fns]

### Explainability

In [None]:
calculateImportance(nb_resi, X_test, y_test)

In [None]:
#calculateShapley(nb_resi, X_test, 0)

## Clasificación con SEIGOODDOING

In [None]:
# Re-read the train/test CSVs so this section starts from the complete frames.
df_train, df_test = pd.read_csv(train_path), pd.read_csv(test_path)

# Features: every column whose name starts with 'SEIGOODDOING'; target: the 'labels' column.
X_train = df_train.drop(columns=['labels']).filter(regex='^SEIGOODDOING')
X_test = df_test.drop(columns=['labels']).filter(regex='^SEIGOODDOING')
y_train, y_test = df_train['labels'], df_test['labels']

In [None]:
# Run the logistic_regresion helper (defined outside this chunk) on the SEIGOODDOING* features.
# lr_pred is scored against y_test in the 'SEIGOODDOING' results row.
lr_pred = logistic_regresion(X_train, y_train, X_test, y_test)

In [None]:
# Run the decision_tree helper (defined outside this chunk) on the SEIGOODDOING* features.
# dt_pred is scored against y_test in the 'SEIGOODDOING' results row.
dt_pred = decision_tree(X_train, y_train, X_test, y_test)

In [None]:
# Run the svm helper (defined outside this chunk) on the SEIGOODDOING* features.
# svm_pred is scored against y_test in the 'SEIGOODDOING' results row.
svm_pred = svm(X_train, y_train, X_test, y_test)

In [None]:
# Run the neural_network helper (defined outside this chunk) on the SEIGOODDOING* features.
# nn_pred is scored against y_test in the 'SEIGOODDOING' results row.
nn_pred = neural_network(X_train, y_train, X_test, y_test)

In [None]:
# Run the k_neighbors helper (defined outside this chunk) on the SEIGOODDOING* features.
# knn_pred is scored against y_test in the 'SEIGOODDOING' results row.
knn_pred = k_neighbors(X_train, y_train, X_test, y_test)

In [None]:
# Run the random_forest helper (defined outside this chunk) on the SEIGOODDOING* features.
# rf_pred is scored against y_test in the 'SEIGOODDOING' results row.
rf_pred = random_forest(X_train, y_train, X_test, y_test)

In [None]:
# Run the gradient_boosting helper (defined outside this chunk) on the SEIGOODDOING* features.
# gb_pred is scored against y_test in the 'SEIGOODDOING' results row.
gb_pred = gradient_boosting(X_train, y_train, X_test, y_test)

In [None]:
# naive_bayes (defined outside this chunk) returns two values: nb_pred, which is scored
# against y_test in the results row, and nb_sei, which is later handed to
# calculateImportance -- presumably the fitted model; confirm against the helper.
nb_pred, nb_sei = naive_bayes(X_train, y_train, X_test, y_test)

In [None]:
# Fill the 'SEIGOODDOING' row of the results table. Values are laid out model by model
# (lr, dt, svm, nn, knn, rf, gb, nb) and, within each model, in the order
# ROC-AUC, specificity, recall, F1 -- all computed against y_test.
results.loc['SEIGOODDOING'] = [
    metric(y_test, pred)
    for pred in (lr_pred, dt_pred, svm_pred, nn_pred, knn_pred, rf_pred, gb_pred, nb_pred)
    for metric in (roc_auc_score, calculateSpecificity, recall_score, f1_score)
]

### Explainability

In [None]:
# Pass the second value returned by naive_bayes (nb_sei) together with the SEIGOODDOING*
# test features and labels to calculateImportance. The helper is defined outside this
# chunk, so the exact importance method is not visible here.
calculateImportance(nb_sei, X_test, y_test)

In [None]:
#calculateShapley(nb_sei, X_test, 0)

## Clasificación con RESI_ULT

In [None]:
# Re-read the train/test CSVs so this section starts from the complete frames.
df_train, df_test = pd.read_csv(train_path), pd.read_csv(test_path)

# Features: every column whose name starts with 'RESI_'; target: the 'labels' column.
# NOTE(review): the section heading and results row say RESI_ULT, but this pattern keeps
# any column beginning with 'RESI_' -- confirm that is the intended subset.
X_train = df_train.drop(columns=['labels']).filter(regex='^RESI_')
X_test = df_test.drop(columns=['labels']).filter(regex='^RESI_')
y_train, y_test = df_train['labels'], df_test['labels']

In [None]:
# Run the logistic_regresion helper (defined outside this chunk) on the RESI_* features.
# lr_pred is scored against y_test in the 'RESI_ULT' results row.
lr_pred = logistic_regresion(X_train, y_train, X_test, y_test)

In [None]:
# Run the decision_tree helper (defined outside this chunk) on the RESI_* features.
# dt_pred is scored against y_test in the 'RESI_ULT' results row.
dt_pred = decision_tree(X_train, y_train, X_test, y_test)

In [None]:
# Run the svm helper (defined outside this chunk) on the RESI_* features.
# svm_pred is scored against y_test in the 'RESI_ULT' results row.
svm_pred = svm(X_train, y_train, X_test, y_test)

In [None]:
# Run the neural_network helper (defined outside this chunk) on the RESI_* features.
# nn_pred is scored against y_test in the 'RESI_ULT' results row.
nn_pred = neural_network(X_train, y_train, X_test, y_test)

In [None]:
# Run the k_neighbors helper (defined outside this chunk) on the RESI_* features.
# knn_pred is scored against y_test in the 'RESI_ULT' results row.
knn_pred = k_neighbors(X_train, y_train, X_test, y_test)

In [None]:
# Run the random_forest helper (defined outside this chunk) on the RESI_* features.
# rf_pred is scored against y_test in the 'RESI_ULT' results row.
rf_pred = random_forest(X_train, y_train, X_test, y_test)

In [None]:
# Run the gradient_boosting helper (defined outside this chunk) on the RESI_* features.
# gb_pred is scored against y_test in the 'RESI_ULT' results row.
gb_pred = gradient_boosting(X_train, y_train, X_test, y_test)

In [None]:
# naive_bayes (defined outside this chunk) returns two values: nb_pred, which is scored
# against y_test in the results row, and nb_resi_ult, which is later handed to
# calculateImportance -- presumably the fitted model; confirm against the helper.
nb_pred, nb_resi_ult = naive_bayes(X_train, y_train, X_test, y_test)

In [None]:
# Fill the 'RESI_ULT' row of the results table. Values are laid out model by model
# (lr, dt, svm, nn, knn, rf, gb, nb) and, within each model, in the order
# ROC-AUC, specificity, recall, F1 -- all computed against y_test.
results.loc['RESI_ULT'] = [
    metric(y_test, pred)
    for pred in (lr_pred, dt_pred, svm_pred, nn_pred, knn_pred, rf_pred, gb_pred, nb_pred)
    for metric in (roc_auc_score, calculateSpecificity, recall_score, f1_score)
]

### Explainability

In [None]:
# Pass the second value returned by naive_bayes (nb_resi_ult) together with the RESI_*
# test features and labels to calculateImportance. The helper is defined outside this
# chunk, so the exact importance method is not visible here.
calculateImportance(nb_resi_ult, X_test, y_test)

In [None]:
#calculateShapley(nb_resi_ult, X_test, 0)

## Visualización y guardado

In [None]:
# Display the accumulated metrics table; each section above assigns one row via results.loc[...].
results

In [None]:
import os

# DataFrame.to_excel does not create missing parent folders, so make sure the
# output directory exists first (e.g. on a fresh checkout without 'resultados/').
os.makedirs('resultados', exist_ok=True)
results.to_excel('resultados/resultadosEAT.xlsx', index=True)
# v4: without SMOTE
# v5: SMOTE applied to the whole dataset
# v6: SMOTE applied to the training set only
# v7: with RED5