In [None]:
import pandas as pd
import numpy as np
import joblib

In [None]:
# Output locations of the scaled train/test splits written later in the notebook.
train_path='data/trainNC.csv'
test_path='data/testNC.csv'

# Obtener dataset

In [None]:
# Load the three source tables.
df_gen = pd.read_csv('data/TCA_general.csv')
df_T1 = pd.read_csv('data/TCA_T1.csv')
df_T2 = pd.read_csv('data/TCA_T2.csv')

In [None]:
# Regression target: the NIVELRECUPERACION_T2 column of the T2 table.
y = df_T2['NIVELRECUPERACION_T2']
y

In [None]:
# Features start from the T1 table. This is an alias, not a copy; the drop()
# calls below return new frames, so df_T1 itself is not modified by them.
df = df_T1

In [None]:
# Remove the listed columns from the feature set
# ('EDAD' is re-attached from df_T1 in a later cell).
df = df.drop(['ID', 'FECHA', 'EDAD', 'DIAGNOSTICO', 'OTROSDIAG', 'FECHAALTA', 'NINGUNOTRODIAG', 'ANSIEDAD', 'DEPRESIÓN', 'TPERSO', 'TBIPOLAR', 'TPSICOTICO', 'OtrosdiagnósticosademásdelTCA_B', 'MEDICACION1', 'MEDICACION2', 'MEDICACION3'], axis=1)
import re

# Drop every column whose name contains 'SEIQOLCUE'.
regex = re.compile(r'SEIQOLCUE')
df = df.drop(list(filter(regex.search, df.columns)), axis=1)

# Drop every column whose name contains 'SEIRANK'.
regex = re.compile(r'SEIRANK')
df = df.drop(list(filter(regex.search, df.columns)), axis=1)

# Drop every column whose name contains 'RESI2coment'.
regex = re.compile(r'RESI2coment')
df = df.drop(list(filter(regex.search, df.columns)), axis=1)

In [None]:
# Attach the target as 'NR_T2'; pandas aligns `y` to `df` by index label.
df['NR_T2'] = y

In [None]:
# Remove spaces inside string cells, turn empty strings into NaN and cast
# every column (target included) to float.
for column in df.columns:
    df[column] = df[column].replace(' ', '', regex=True).replace('', np.nan).astype(float)

In [None]:
df.dropna(subset=['NR_T2'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
for column in df_gen.columns:
    df_gen[column] = df_gen[column].replace(' ', '', regex=True).replace('', np.nan)

In [None]:
df_gen[['AÑOSPADECIENDO','AÑOSTTO','EDADCOMIENZO']].info()

In [None]:
df.info()

In [None]:
df.fillna(2, inplace=True)

In [None]:
df['EDAD'] = df_T1['EDAD'].replace(' ', '', regex=True).replace('', np.nan).astype(float)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.info()

In [None]:
from sklearn.preprocessing import MinMaxScaler

X = df.drop(['NR_T2'], axis=1)

# Crea un objeto MinMaxScaler
scaler = MinMaxScaler()

# Escala los valores de las características en el rango [0, 1]
X = scaler.fit_transform(X)

# Convertir la matriz X a un DataFrame
df_scaled = pd.DataFrame(X, columns=df.columns[:-1])

# Añadir la columna 'NR_T2' al DataFrame escalado
df_scaled['NR_T2'] = df['NR_T2']

In [None]:
# Distinct target values present after cleaning.
df_scaled['NR_T2'].unique()

In [None]:
from sklearn.model_selection import train_test_split

# 80 % of the rows go to the training split; random_state=1 makes the shuffle repeatable.
train_test_ratio = 0.8
df_train, df_test = train_test_split(df_scaled, train_size = train_test_ratio, random_state = 1)

In [None]:
df_train.info()

In [None]:
df_test.info()

In [None]:
# index=False: the shuffled row labels left by train_test_split are not data.
# Written as an unnamed first column they come back from pd.read_csv as
# 'Unnamed: 0' and end up among the features wherever all columns except
# 'NR_T2' are used.
df_train.to_csv(train_path, index=False)
df_test.to_csv(test_path, index=False)

# Análisis del dataset

In [None]:
# Column names, non-null counts and dtypes of the unscaled frame.
df.info()

In [None]:
df.dtypes

In [None]:
# Summary statistics per column.
df.describe()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Drop any row that still contains a NaN (the preparation cells already call dropna()).
df = df.dropna()

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
# One histogram per column on a 120x4 subplot grid; the trailing ';' hides the returned array's repr.
df.hist(layout=(120, 4), figsize=(10,80));

In [None]:
# Pairwise correlation between all columns of df
corr_matrix = df.corr()

# Flatten the matrix into (row column, other column, coefficient) triples,
# leaving out coefficients that are NaN
correlation_pairs = [
    (col, other, coeff)
    for col, coeffs in corr_matrix.iterrows()
    for other, coeff in coeffs.dropna().items()
]

# Report every ordered pair of distinct columns whose coefficient is above 0.9
# or below -0.9 (each unordered pair therefore appears twice)
for col1, col2, corr_coeff in correlation_pairs:
    if col1 != col2 and abs(corr_coeff) > 0.9:
        print("Column '{}' has correlation coefficient {} with column '{}'".format(col1, corr_coeff, col2))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation matrix of all columns of df
correlation = df.corr()

# Draw the matrix as a heatmap using the red-yellow-green colormap
sns.heatmap(correlation, cmap="RdYlGn")

# Title the figure and render it
plt.title("Correlation Matrix")
plt.show()

# Modelos

In [None]:
# Importar las bibliotecas necesarias
import pandas as pd
from sklearn.linear_model import LinearRegression, RANSACRegressor, TheilSenRegressor, HuberRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, r2_score
from sklearn.svm import SVR
from sklearn.inspection import permutation_importance
from xgboost import XGBRegressor

### Regresión lineal

In [None]:
def linear_regresion(X_train, y_train, X_test, y_test):
  """Fit a linear regression, choosing ``fit_intercept`` by 5-fold grid search.

  The selected parameters are printed, a fresh ``LinearRegression`` is fitted
  with them on the whole training set and used to predict ``X_test``.

  Args:
    X_train: training features.
    y_train: training target.
    X_test: features to predict.
    y_test: accepted for a uniform signature; not used.

  Returns:
    tuple: ``(predictions for X_test, selected fit_intercept value)``.
  """
  # Create an instance of the LinearRegression model with default parameters
  model = LinearRegression()

  # Hyperparameters to tune and their candidate values
  param_grid = {
      'fit_intercept': [True, False]
  }

  # 5-fold cross-validated search over the grid
  grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
  grid_search.fit(X_train, y_train)

  # Print the best hyperparameter values
  best_params = grid_search.best_params_
  print("Mejores hiperparámetros: ", best_params)

  # Create a new model instance using the best parameters
  new_model = LinearRegression(**best_params)
  new_model.fit(X_train, y_train)
  lr_pred = new_model.predict(X_test)

  # Return the value the search selected, not the list of candidates
  # (param_grid['fit_intercept'] is always [True, False]).
  return lr_pred, best_params['fit_intercept']

## RANSAC

In [None]:
def ransac_regression(X_train, y_train, X_test, y_test, fit_intercept=True):
    """Fit a RANSAC regressor around a linear model and predict ``X_test``.

    The wrapped ``LinearRegression`` is built with ``fit_intercept=False`` and
    RANSAC draws 45 samples per trial (``min_samples=45``).

    NOTE(review): ``fit_intercept`` and ``y_test`` are accepted but never read;
    the base estimator always uses ``fit_intercept=False``.

    Returns:
        Predictions for ``X_test``.
    """
    base_model = LinearRegression(fit_intercept=False)
    ransac = RANSACRegressor(base_estimator=base_model, min_samples=45)
    ransac.fit(X_train, y_train)
    return ransac.predict(X_test)

## Theil Sen

In [None]:
def theil_sen_regression(X_train, y_train, X_test, y_test):
    """Fit a Theil-Sen regressor, choosing ``fit_intercept`` by 5-fold grid search.

    The selected parameters are printed and a fresh estimator is refitted with
    them on the whole training set. ``y_test`` is accepted but not used.

    Returns:
        Predictions for ``X_test``.
    """
    search = GridSearchCV(
        estimator=TheilSenRegressor(),
        param_grid={'fit_intercept': [True, False]},
        cv=5,
    )
    search.fit(X_train, y_train)

    chosen = search.best_params_
    print("Best hyperparameters: ", chosen)

    # Refit from scratch with the selected settings
    final_model = TheilSenRegressor(**chosen)
    final_model.fit(X_train, y_train)
    return final_model.predict(X_test)

## HuberRegressor

In [None]:
def huber_regression(X_train, y_train, X_test, y_test):
    """Fit a Huber regressor, choosing ``fit_intercept`` by 5-fold grid search.

    The selected parameters are printed and a fresh estimator is refitted with
    them on the whole training set. ``y_test`` is accepted but not used.

    Returns:
        Predictions for ``X_test``.
    """
    search = GridSearchCV(
        estimator=HuberRegressor(),
        param_grid={'fit_intercept': [True, False]},
        cv=5,
    )
    search.fit(X_train, y_train)

    chosen = search.best_params_
    print("Best hyperparameters: ", chosen)

    # Refit from scratch with the selected settings
    final_model = HuberRegressor(**chosen)
    final_model.fit(X_train, y_train)
    return final_model.predict(X_test)

## SVR

In [None]:
def svr_regression(X_train, y_train, X_test, y_test):
    """Select an SVR kernel by 5-fold grid search, refit and predict ``X_test``.

    Candidates are the 'linear', 'rbf' and 'poly' kernels; the selected
    parameters are printed. ``y_test`` is accepted but not used.

    Returns:
        tuple: ``(predictions for X_test, the refitted SVR instance)``.
    """
    search = GridSearchCV(
        estimator=SVR(),
        param_grid={'kernel': ['linear', 'rbf', 'poly']},
        cv=5,
    )
    search.fit(X_train, y_train)

    chosen = search.best_params_
    print("Best hyperparameters: ", chosen)

    # Refit from scratch with the selected kernel; the model itself is
    # returned so that callers can inspect it
    fitted_svr = SVR(**chosen)
    fitted_svr.fit(X_train, y_train)
    return fitted_svr.predict(X_test), fitted_svr

## XGBoost regression

In [None]:
def xgboost_regression(X_train, y_train, X_test, y_test):
    """Fit an XGBoost regressor, choosing ``learning_rate`` by 5-fold grid search.

    Candidates are 0.1, 0.01 and 0.001; the selected parameters are printed and
    a fresh estimator is refitted with them on the whole training set.
    ``y_test`` is accepted but not used.

    Returns:
        Predictions for ``X_test``.
    """
    search = GridSearchCV(
        estimator=XGBRegressor(),
        param_grid={'learning_rate': [0.1, 0.01, 0.001]},
        cv=5,
    )
    search.fit(X_train, y_train)

    chosen = search.best_params_
    print("Best hyperparameters: ", chosen)

    # Refit from scratch with the selected settings
    final_model = XGBRegressor(**chosen)
    final_model.fit(X_train, y_train)
    return final_model.predict(X_test)

# Resultados

In [None]:
# One column per model; each evaluated feature subset is added later as a row of scores
columns = [
    'Regresión Lineal',
    'RANSAC',
    'Theil Sen',
    'HuberRegressor',
    'SVR',
    'XGBoost regression',
]
results = pd.DataFrame(columns=columns)
results

In [None]:
import matplotlib.pyplot as plt

def calculateImportance(svr, X_train):
    """Plot the relative magnitude of a fitted model's coefficients per feature.

    The absolute values of ``svr.coef_`` are normalised to sum to 1 and drawn
    as a horizontal bar chart, sorted in ascending order.

    Args:
        svr: fitted estimator. It must expose ``coef_``; scikit-learn's SVR
            only does so for the linear kernel.
        X_train: DataFrame whose columns name the features, in the order the
            model was fitted on.

    Returns:
        None. When the model exposes no ``coef_`` a message is printed and
        nothing is plotted.
    """
    # The kernel is picked by grid search, so it may be 'rbf' or 'poly', for
    # which coef_ is not available; skip the plot instead of raising.
    if not hasattr(svr, 'coef_'):
        print("Feature importance skipped: the model exposes no coef_ "
              "(only available with a linear kernel).")
        return None

    # Step 2: Retrieve feature weights
    feature_weights = np.abs(svr.coef_.ravel())

    # Step 3: Normalize feature weights (all-zero weights stay zero rather
    # than dividing by zero)
    total_weight = np.sum(feature_weights)
    if total_weight > 0:
        normalized_weights = feature_weights / total_weight
    else:
        normalized_weights = feature_weights

    # Step 4: Sort ascending, so the largest weight is the last (top) bar
    sorted_indices = np.argsort(normalized_weights)
    sorted_weights = normalized_weights[sorted_indices]
    sorted_features = X_train.columns[sorted_indices]

    # Plot feature importance
    plt.figure(figsize=(8, 6))
    plt.barh(range(len(sorted_weights)), sorted_weights, align='center')
    plt.yticks(range(len(sorted_weights)), sorted_features)
    plt.xlabel('Feature Importance')
    plt.ylabel('Features')
    plt.show()
    return None

## Clasificación con dataset completo

In [None]:
# Reload the stored splits.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Every column except the target is used as a feature.
# NOTE(review): that includes an index column if the CSV was written with one.
X_train = df_train.drop(['NR_T2'], axis=1)
y_train = df_train['NR_T2']
X_test = df_test.drop(['NR_T2'], axis=1)
y_test = df_test['NR_T2']

In [None]:
# Fit the six regressors on the full feature set.
lr_pred, fit_intercept = linear_regresion(X_train, y_train, X_test, y_test)
rr_pred = ransac_regression(X_train, y_train, X_test, y_test, fit_intercept)
ts_red = theil_sen_regression(X_train, y_train, X_test, y_test)
hr_pred = huber_regression(X_train, y_train, X_test, y_test)
svr_pred, svr = svr_regression(X_train, y_train, X_test, y_test)
xgb_pred = xgboost_regression(X_train, y_train, X_test, y_test)

In [None]:
# Store the test-set R2 of each model as the 'Completo' row.
results.loc['Completo'] = [r2_score(y_test, lr_pred), r2_score(y_test, rr_pred), r2_score(y_test, ts_red),
                           r2_score(y_test, hr_pred), r2_score(y_test, svr_pred), r2_score(y_test, xgb_pred)]

### Explainability

In [None]:
# Bar chart of the fitted SVR's normalised absolute coefficients.
calculateImportance(svr, X_train)

## Clasificación con dataset completo (extracción de características)

In [None]:
# Reload the stored splits before building the aggregated features.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [None]:
# Aggregated column name -> regex selecting the columns that are summed into it
_EC_GROUPS = {
    'WHOQOL': '^WHOQOL',
    'HAD': '^HAD',
    'EAT': '^EAT',
    'RESI': '^RESI[^_]',
    'SEIGGOODDOING': '^SEIGOODDOING',
    'RESI_ULTIM': '^RESI_',
}


def _sum_by_group(frame):
    """Return one row-wise sum per entry of _EC_GROUPS plus the 'NR_T2' target."""
    summed = pd.DataFrame()
    for name, pattern in _EC_GROUPS.items():
        group_columns = frame.filter(regex=pattern).columns.tolist()
        summed[name] = frame[group_columns].sum(axis=1)
    summed['NR_T2'] = frame['NR_T2']
    return summed


# Same aggregation for both splits
df_train_ec = _sum_by_group(df_train)
df_test_ec = _sum_by_group(df_test)

In [None]:
# Features are the six aggregated columns; the target stays 'NR_T2'.
X_train = df_train_ec.drop(['NR_T2'], axis=1)
y_train = df_train_ec['NR_T2']
X_test = df_test_ec.drop(['NR_T2'], axis=1)
y_test = df_test_ec['NR_T2']

In [None]:
# Train all six models on the aggregated features. The first four must run
# here as well: the 'Completo (EC)' results row scores lr_pred, rr_pred, ts_red
# and hr_pred, and without these calls those names still hold the predictions
# of the previous section's models.
lr_pred, fit_intercept = linear_regresion(X_train, y_train, X_test, y_test)
rr_pred = ransac_regression(X_train, y_train, X_test, y_test, fit_intercept)
ts_red = theil_sen_regression(X_train, y_train, X_test, y_test)
hr_pred = huber_regression(X_train, y_train, X_test, y_test)
svr_pred, svr = svr_regression(X_train, y_train, X_test, y_test)
# Persist the SVR fitted on the aggregated features.
joblib.dump(svr, "svr_model.pkl")
xgb_pred = xgboost_regression(X_train, y_train, X_test, y_test)

In [None]:
# Store the test-set R2 of each model as the 'Completo (EC)' row.
# NOTE(review): lr_pred, rr_pred, ts_red and hr_pred are only fresh if the
# training cell of this section re-ran those four models; otherwise they
# still hold the previous section's predictions.
results.loc['Completo (EC)'] = [r2_score(y_test, lr_pred), r2_score(y_test, rr_pred), r2_score(y_test, ts_red),
                           r2_score(y_test, hr_pred), r2_score(y_test, svr_pred), r2_score(y_test, xgb_pred)]

### Explainability

In [None]:
# Bar chart of the fitted SVR's normalised absolute coefficients.
calculateImportance(svr, X_train)

## Clasificación con WHOQOL

In [None]:
# Reload the stored splits and keep only the columns whose name starts with 'WHOQOL'.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

X_train = df_train.drop(['NR_T2'], axis=1).filter(regex='^WHOQOL')
y_train = df_train['NR_T2']
X_test = df_test.drop(['NR_T2'], axis=1).filter(regex='^WHOQOL')
y_test = df_test['NR_T2']

In [None]:
# Fit the six regressors on this feature subset.
lr_pred, fit_intercept = linear_regresion(X_train, y_train, X_test, y_test)
rr_pred = ransac_regression(X_train, y_train, X_test, y_test, fit_intercept)
ts_red = theil_sen_regression(X_train, y_train, X_test, y_test)
hr_pred = huber_regression(X_train, y_train, X_test, y_test)
svr_pred, svr = svr_regression(X_train, y_train, X_test, y_test)
xgb_pred = xgboost_regression(X_train, y_train, X_test, y_test)

In [None]:
# Store the test-set R2 of each model as the 'WHOQOL' row.
results.loc['WHOQOL'] = [r2_score(y_test, lr_pred), r2_score(y_test, rr_pred), r2_score(y_test, ts_red),
                           r2_score(y_test, hr_pred), r2_score(y_test, svr_pred), r2_score(y_test, xgb_pred)]

### Explainability

In [None]:
# Bar chart of the fitted SVR's normalised absolute coefficients.
calculateImportance(svr, X_train)

## Clasificación con HAD

In [None]:
# Reload the stored splits and keep only the columns whose name starts with 'HAD'.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

X_train = df_train.drop(['NR_T2'], axis=1).filter(regex='^HAD')
y_train = df_train['NR_T2']
X_test = df_test.drop(['NR_T2'], axis=1).filter(regex='^HAD')
y_test = df_test['NR_T2']

In [None]:
# Fit the six regressors on this feature subset.
lr_pred, fit_intercept = linear_regresion(X_train, y_train, X_test, y_test)
rr_pred = ransac_regression(X_train, y_train, X_test, y_test, fit_intercept)
ts_red = theil_sen_regression(X_train, y_train, X_test, y_test)
hr_pred = huber_regression(X_train, y_train, X_test, y_test)
svr_pred, svr = svr_regression(X_train, y_train, X_test, y_test)
xgb_pred = xgboost_regression(X_train, y_train, X_test, y_test)

In [None]:
# Store the test-set R2 of each model as the 'HAD' row.
results.loc['HAD'] = [r2_score(y_test, lr_pred), r2_score(y_test, rr_pred), r2_score(y_test, ts_red),
                           r2_score(y_test, hr_pred), r2_score(y_test, svr_pred), r2_score(y_test, xgb_pred)]

### Explainability

In [None]:
# Bar chart of the fitted SVR's normalised absolute coefficients.
calculateImportance(svr, X_train)

## Clasificación con EAT

In [None]:
# Reload the stored splits and keep only the columns whose name starts with 'EAT'.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

X_train = df_train.drop(['NR_T2'], axis=1).filter(regex='^EAT')
y_train = df_train['NR_T2']
X_test = df_test.drop(['NR_T2'], axis=1).filter(regex='^EAT')
y_test = df_test['NR_T2']

In [None]:
# Fit the six regressors on this feature subset.
lr_pred, fit_intercept = linear_regresion(X_train, y_train, X_test, y_test)
rr_pred = ransac_regression(X_train, y_train, X_test, y_test, fit_intercept)
ts_red = theil_sen_regression(X_train, y_train, X_test, y_test)
hr_pred = huber_regression(X_train, y_train, X_test, y_test)
svr_pred, svr = svr_regression(X_train, y_train, X_test, y_test)
xgb_pred = xgboost_regression(X_train, y_train, X_test, y_test)

In [None]:
# Store the test-set R2 of each model as the 'EAT' row.
results.loc['EAT'] = [r2_score(y_test, lr_pred), r2_score(y_test, rr_pred), r2_score(y_test, ts_red),
                           r2_score(y_test, hr_pred), r2_score(y_test, svr_pred), r2_score(y_test, xgb_pred)]

### Explainability

In [None]:
# Bar chart of the fitted SVR's normalised absolute coefficients.
calculateImportance(svr, X_train)

## Clasificación con RESI

In [None]:
# Reload the stored splits and keep only the columns whose name starts with
# 'RESI' followed by any character other than '_'.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

X_train = df_train.drop(['NR_T2'], axis=1).filter(regex='^RESI[^_]')
y_train = df_train['NR_T2']
X_test = df_test.drop(['NR_T2'], axis=1).filter(regex='^RESI[^_]')
y_test = df_test['NR_T2']

In [None]:
# Fit the six regressors on this feature subset.
lr_pred, fit_intercept = linear_regresion(X_train, y_train, X_test, y_test)
rr_pred = ransac_regression(X_train, y_train, X_test, y_test, fit_intercept)
ts_red = theil_sen_regression(X_train, y_train, X_test, y_test)
hr_pred = huber_regression(X_train, y_train, X_test, y_test)
svr_pred, svr = svr_regression(X_train, y_train, X_test, y_test)
xgb_pred = xgboost_regression(X_train, y_train, X_test, y_test)

In [None]:
# Store the test-set R2 of each model as the 'RESI' row.
results.loc['RESI'] = [r2_score(y_test, lr_pred), r2_score(y_test, rr_pred), r2_score(y_test, ts_red),
                        r2_score(y_test, hr_pred), r2_score(y_test, svr_pred), r2_score(y_test, xgb_pred)]

### Explainability

In [None]:
# Bar chart of the fitted SVR's normalised absolute coefficients.
calculateImportance(svr, X_train)

## Clasificación con RED-5

In [None]:
# Reload the stored splits and keep only the five listed RESI columns.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

X_train = df_train[['RESI16', 'RESI27', 'RESI28', 'RESI30', 'RESI31']]
y_train = df_train['NR_T2']
X_test = df_test[['RESI16', 'RESI27', 'RESI28', 'RESI30', 'RESI31']]
y_test = df_test['NR_T2']

In [None]:
# Fit the six regressors on this feature subset.
lr_pred, fit_intercept = linear_regresion(X_train, y_train, X_test, y_test)
rr_pred = ransac_regression(X_train, y_train, X_test, y_test, fit_intercept)
ts_red = theil_sen_regression(X_train, y_train, X_test, y_test)
hr_pred = huber_regression(X_train, y_train, X_test, y_test)
svr_pred, svr = svr_regression(X_train, y_train, X_test, y_test)
xgb_pred = xgboost_regression(X_train, y_train, X_test, y_test)

In [None]:
# Store the test-set R2 of each model as the 'RED5' row.
results.loc['RED5'] = [r2_score(y_test, lr_pred), r2_score(y_test, rr_pred), r2_score(y_test, ts_red),
                        r2_score(y_test, hr_pred), r2_score(y_test, svr_pred), r2_score(y_test, xgb_pred)]

### Explainability

In [None]:
# Bar chart of the fitted SVR's normalised absolute coefficients.
calculateImportance(svr, X_train)

## Clasificación con SEIGOODDOING

In [None]:
# Reload the stored splits and keep only the columns whose name starts with 'SEIGOODDOING'.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

X_train = df_train.drop(['NR_T2'], axis=1).filter(regex='^SEIGOODDOING')
y_train = df_train['NR_T2']
X_test = df_test.drop(['NR_T2'], axis=1).filter(regex='^SEIGOODDOING')
y_test = df_test['NR_T2']

In [None]:
# Fit the six regressors on this feature subset.
lr_pred, fit_intercept = linear_regresion(X_train, y_train, X_test, y_test)
rr_pred = ransac_regression(X_train, y_train, X_test, y_test, fit_intercept)
ts_red = theil_sen_regression(X_train, y_train, X_test, y_test)
hr_pred = huber_regression(X_train, y_train, X_test, y_test)
svr_pred, svr = svr_regression(X_train, y_train, X_test, y_test)
xgb_pred = xgboost_regression(X_train, y_train, X_test, y_test)

In [None]:
# Store the test-set R2 of each model as the 'SEIGOODDOING' row.
results.loc['SEIGOODDOING'] = [r2_score(y_test, lr_pred), r2_score(y_test, rr_pred), r2_score(y_test, ts_red),
                           r2_score(y_test, hr_pred), r2_score(y_test, svr_pred), r2_score(y_test, xgb_pred)]

### Explainability

In [None]:
# Bar chart of the fitted SVR's normalised absolute coefficients.
calculateImportance(svr, X_train)

## Clasificación con RESI_ULT

In [None]:
# Reload the stored splits and keep only the columns whose name starts with 'RESI_'.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

X_train = df_train.drop(['NR_T2'], axis=1).filter(regex='^RESI_')
y_train = df_train['NR_T2']
X_test = df_test.drop(['NR_T2'], axis=1).filter(regex='^RESI_')
y_test = df_test['NR_T2']

In [None]:
# Fit the six regressors on this feature subset.
lr_pred, fit_intercept = linear_regresion(X_train, y_train, X_test, y_test)
rr_pred = ransac_regression(X_train, y_train, X_test, y_test, fit_intercept)
ts_red = theil_sen_regression(X_train, y_train, X_test, y_test)
hr_pred = huber_regression(X_train, y_train, X_test, y_test)
svr_pred, svr = svr_regression(X_train, y_train, X_test, y_test)
xgb_pred = xgboost_regression(X_train, y_train, X_test, y_test)

In [None]:
# Store the test-set R2 of each model as the 'RESI_ULT' row.
results.loc['RESI_ULT'] = [r2_score(y_test, lr_pred), r2_score(y_test, rr_pred), r2_score(y_test, ts_red),
                           r2_score(y_test, hr_pred), r2_score(y_test, svr_pred), r2_score(y_test, xgb_pred)]

### Explainability

In [None]:
# Bar chart of the fitted SVR's normalised absolute coefficients.
calculateImportance(svr, X_train)

## Visualización y guardado

In [None]:
# Final table: one row per feature subset, one column per model.
results

In [None]:
# Export the table, row labels included, to an Excel workbook.
results.to_excel('resultados/resultadosNR.xlsx', index=True)
# v1: linear regression only, r_score
# v2: the remaining models Almeida suggested, also with r_score
# v3: with fit_intercept, also with r_score
# v4: with RED5

explainability variable individual
- probar con los datos generales