In [1]:
from google.colab import drive
import pandas as pd
import shutil
import os
import re
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, chi2
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [3]:
folder_path = '/content/drive/MyDrive/Codigo Tesis Helicobacter Editado/Resultados'
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

# Lista para almacenar los primeros 10 valores de la columna 0 de cada DataFrame
primeros = []

# Extraemos los primeros 10 valores de la columna 0 para cada DataFrame
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path,sep=";")
    pattern = re.compile(r"('reduce_dim[^}]*(?:, )?)")

    cleaned_data = []
    removed_data = []

    for item in df['1'].head(1):
        reduced_part = re.findall(pattern, item)
        cleaned_part = re.sub(pattern, '', item)

        cleaned_data.append(cleaned_part)
        removed_data.append(reduced_part[0] if reduced_part else '')

    primeros.append({
        'archivo': df['0'].head(1).tolist(),
        'model': file,
        'reduce_dim': reduced_part,
        'params':cleaned_part,
        'best_score': df['2'].head(1).tolist()
    })

In [4]:
primeros=pd.DataFrame(primeros)
results=primeros.sort_values(by='best_score',ascending=False)
results.reset_index(drop=True)

Unnamed: 0,archivo,model,reduce_dim,params,best_score
0,[Sel4Pre3.csv],SVC.csv,"['reduce_dim': PCA(n_components=13, random_sta...","{'classification__C': 150, 'classification__ga...",[0.8955823293172691]
1,[Sel6Pre5.csv],GradientBoostingClassifier.csv,"['reduce_dim': PCA(n_components=23, random_sta...","{'classification__learning_rate': 0.2, 'classi...",[0.8875968992248062]
2,[Sel0Pre5.csv],KNeighborsClassifier.csv,"['reduce_dim': SelectKBest(k=8,\n s...","{'classification__n_neighbors': 2, 'classifica...",[0.8875968992248062]
3,[Sel7Pre10.csv],RandomForestClassifier.csv,"['reduce_dim': SelectKBest(k=24,\n ...","{'classification__max_depth': None, 'classific...",[0.8798586572438163]
4,[Sel1Pre5.csv],MLPClassifier.csv,"['reduce_dim': SelectKBest(k=9,\n s...","{'classification__hidden_layer_sizes': (72, 36...",[0.8798449612403101]
5,[Sel7Pre7.csv],XGBClassifier.csv,"['reduce_dim': PCA(n_components=24, random_sta...","{'classification__learning_rate': 0.1, 'classi...",[0.8582375478927203]


In [5]:
# Crear una lista vacía para almacenar los DataFrames individuales
dataframes = []

# Leer cada archivo CSV y agregar su contenido a la lista de DataFrames
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path,sep=";")
    dataframes.append(df)

# Concatenar todos los DataFrames en uno solo
df_final = pd.concat(dataframes, ignore_index=True)


In [6]:
df_final=df_final.drop(axis=1,columns="1")

In [7]:
df_agrupado = df_final.groupby('0')['2'].sum() / df_final.groupby('0')['2'].count()

# Convertir la serie resultante a un DataFrame
df_resultante = df_agrupado.reset_index()

# Renombrar las columnas
df_resultante.columns = ['Tipo de Seleccion', 'Promedio de resultados']
df_resultante=df_resultante.sort_values(by="Promedio de resultados",ascending=False)
df_resultante.reset_index(drop=True)

df_resultante.head(10)

Unnamed: 0,Tipo de Seleccion,Promedio de resultados
28,Sel7Pre10.csv,0.866313
26,Sel6Pre5.csv,0.862403
6,Sel1Pre5.csv,0.862403
2,Sel0Pre5.csv,0.862403
17,Sel4Pre3.csv,0.861446
20,Sel5Pre10.csv,0.858068
32,Sel8Pre10.csv,0.85689
12,Sel3Pre10.csv,0.855713
10,Sel2Pre5.csv,0.855297
16,Sel4Pre10.csv,0.852179


In [8]:
folder_path = '/content/drive/MyDrive/Codigo Tesis Helicobacter Editado/Var_Prev'

In [9]:
#Lista para almacenar dataframes
results1 = []
sm = SMOTE(random_state=42)

In [10]:
accuracys=0.0
file_path = os.path.join(folder_path, "Sel7Pre10.csv")
df = pd.read_csv(file_path, sep=",", header=0, index_col=0)
df = df.dropna()
df = df.sample(n=500, replace=True,random_state=42)

X = df.drop("Prevalencia", axis=1)
y = df["Prevalencia"]

# Determinar 'k', el número de variables
k = X.shape[1]
X_res, y_res = sm.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.332, stratify=y_res)
for _ in range(100):

  # Definir el pipeline con pasos vacíos
  pipeline = Pipeline([
          ('scaler', StandardScaler()),    # Escalado de características
          ('reduce_dim', 'passthrough'),   # Placeholder para la reducción de dimensionalidad
          ('classification', XGBClassifier ())        # Clasificador SVC
      ])

      # Definir los parámetros para GridSearch
  param_grid = [
          {
              'reduce_dim': [SelectKBest(score_func=mutual_info_classif)],
              'reduce_dim__k': [k//2],
              'classification__max_depth': [3],
              'classification__learning_rate': [0.15],
              'classification__n_estimators': [500]
          }
      ]

  # Configurar el clasificador con diferentes tamaños de capa oculta
  xgb_clf = GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring='accuracy')

  # Configurar el clasificador con diferentes tamaños de capa oculta
  xgb_clf.fit(X_train, y_train)

  test_pred = xgb_clf.predict(X_test)
  accuracy = accuracy_score(y_test, test_pred)
  accuracys=accuracys+accuracy

results1.append(("XGB", accuracys/100))

In [11]:
accuracys=0.0
for _ in range(100):
  file_path = os.path.join(folder_path, "Sel7Pre10.csv")
  df = pd.read_csv(file_path, sep=",", header=0, index_col=0)
  df = df.dropna()
  df = df.sample(n=500, replace=True,random_state=42)

  X = df.drop("Prevalencia", axis=1)
  y = df["Prevalencia"]

  # Determinar 'k', el número de variables
  k = X.shape[1]


  pipeline = Pipeline([
          ('scaler', StandardScaler()),    # Escalado de características
          ('reduce_dim', 'passthrough'),   # Placeholder para la reducción de dimensionalidad
          ('classification',GradientBoostingClassifier ())        # Clasificador SVC
      ])

      # Definir los parámetros para GridSearch
  param_grid = [
          {
              'reduce_dim': [SelectKBest(score_func=mutual_info_classif)],
              'reduce_dim__k': [k//2],
              'classification__n_estimators': [50],
              'classification__learning_rate': [0.2],
              'classification__max_depth': [6],
              'classification__min_samples_split': [15],
              'classification__min_samples_leaf': [4]
          }
      ]

  X_res, y_res = sm.fit_resample(X, y)
  X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.332, stratify=y_res)

  # Configurar el clasificador con diferentes tamaños de capa oculta
  GB_clf = GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring='accuracy')

  GB_clf.fit(X_train, y_train)

  test_pred = GB_clf.predict(X_test)
  accuracy = accuracy_score(y_test, test_pred)
  accuracys=accuracys+accuracy

results1.append(("GradientBoosting", accuracys/100))

In [12]:
accuracys=0.0
for _ in range(100):
  file_path = os.path.join(folder_path, "Sel7Pre10.csv")
  df = pd.read_csv(file_path, sep=",", header=0, index_col=0)
  df = df.dropna()
  df = df.sample(n=500, replace=True,random_state=42)

  X = df.drop("Prevalencia", axis=1)
  y = df["Prevalencia"]

  # Determinar 'k', el número de variables
  k = X.shape[1]
  # Definir el pipeline con pasos vacíos
  pipeline = Pipeline([
          ('scaler', StandardScaler()),    # Escalado de características
          ('reduce_dim', 'passthrough'),   # Placeholder para la reducción de dimensionalidad
          ('classification', SVC())        # Clasificador SVC
  ])

  # Definir los parámetros para GridSearch
  param_grid = [
          {
              'reduce_dim': [PCA()],
              'reduce_dim__n_components': [k//2],
              'classification__kernel': ['rbf'],
              'classification__C': [150],
              'classification__gamma':[0.1]
          }
  ]


  X_res, y_res = sm.fit_resample(X, y)
  X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.332, stratify=y_res)

  # Configurar el clasificador con diferentes tamaños de capa oculta
  svc_clf = GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring='accuracy')
  svc_clf.fit(X_train, y_train)

  test_pred = svc_clf.predict(X_test)
  accuracy = accuracy_score(y_test, test_pred)
  accuracys=accuracys+accuracy

results1.append(("SVC", accuracys/100))

In [13]:
accuracys=0.0
for _ in range(100):
  file_path = os.path.join(folder_path, "Sel7Pre10.csv")
  df = pd.read_csv(file_path, sep=",", header=0, index_col=0)
  df = df.dropna()
  df = df.sample(n=500, replace=True,random_state=42)

  X = df.drop("Prevalencia", axis=1)
  y = df["Prevalencia"]

  # Determinar 'k', el número de variables
  k = X.shape[1]

  # Definir el pipeline con pasos vacíos
  pipeline = Pipeline([
          ('scaler', StandardScaler()),    # Escalado de características
          ('reduce_dim', 'passthrough'),   # Placeholder para la reducción de dimensionalidad
          ('classification', RandomForestClassifier())        # Clasificador SVC
  ])

      # Definir los parámetros para GridSearch
  param_grid = [
          {
              'reduce_dim': [SelectKBest(score_func=mutual_info_classif)],
              'reduce_dim__k': [k//2],
              'classification__max_depth': [None],
              'classification__min_samples_split': [2],
              'classification__min_samples_leaf': [1]
          }
  ]

  X_res, y_res = sm.fit_resample(X, y)
  X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.332, stratify=y_res)

  # Configurar el clasificador con diferentes tamaños de capa oculta
  rf_clf = GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring='accuracy')

  rf_clf.fit(X_train, y_train)

  test_pred = rf_clf.predict(X_test)
  accuracy = accuracy_score(y_test, test_pred)
  accuracys=accuracys+accuracy

results1.append(("RandomForest", accuracys/100))

In [14]:
accuracys=0.0
file_path = os.path.join(folder_path, "Sel7Pre10.csv"	)
df = pd.read_csv(file_path, sep=",", header=0, index_col=0)
df = df.dropna()
df = df.sample(n=500, replace=True,random_state=42)
X = df.drop("Prevalencia", axis=1)
y = df["Prevalencia"]
# Determinar 'k', el número de variables
k = X.shape[1]
X_res, y_res = sm.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.332, stratify=y_res)

for _ in range(100):
    pipeline = Pipeline([
          ('scaler', StandardScaler()),    # Escalado de características
          ('reduce_dim', 'passthrough'),   # Placeholder para la reducción de dimensionalidad
          ('classification',MLPClassifier(verbose=False))
  ])

  # Definir los parámetros para GridSearch
    param_grid = [
          {
              'reduce_dim': [SelectKBest(score_func=mutual_info_classif)],
              'reduce_dim__k': [k//2],
              'classification__learning_rate_init': [0.01],
              'classification__max_iter': [2000],
              'classification__momentum': [0.64],
              'classification__solver': ['adam'],
              'classification__hidden_layer_sizes':[ (4*k, 2*k)]
          }
  ]

  # Configurar el clasificador con diferentes tamaños de capa oculta
    mlp_clf = GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring='accuracy')

    mlp_clf.fit(X_train, y_train)

    test_pred = mlp_clf.predict(X_test)
    accuracy = accuracy_score(y_test, test_pred)
    accuracys=accuracys+accuracy

results1.append(("MLPClassifier", accuracys/100))

In [15]:
accuracys=0.0
file_path = os.path.join(folder_path, "Sel7Pre10.csv")
df = pd.read_csv(file_path, sep=",", header=0, index_col=0)
df = df.dropna()
df = df.sample(n=500, replace=True,random_state=42)

X = df.drop("Prevalencia", axis=1)
y = df["Prevalencia"]

# Determinar 'k', el número de variables
k = X.shape[1]

X_res, y_res = sm.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.332, stratify=y_res)

for _ in range(100):
  # Definir el pipeline con pasos vacíos
  pipeline = Pipeline([
          ('scaler', StandardScaler()),    # Escalado de características
          ('reduce_dim', 'passthrough'),   # Placeholder para la reducción de dimensionalidad
          ('classification', KNeighborsClassifier())        # Clasificador KNN
      ])

  # Definir los parámetros para GridSearch
  param_grid = [
          {
              'reduce_dim': [SelectKBest(score_func=f_classif)],
              'reduce_dim__k': [k//2] ,
              'classification__weights': ['distance'],
              'classification__n_neighbors': [k//8],
              'classification__p': [1]
          }
  ]
  # Configurar el clasificador con diferentes tamaños de capa oculta
  knn_clf = GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring='accuracy')

  knn_clf.fit(X_train, y_train)

  test_pred = knn_clf.predict(X_test)
  accuracy = accuracy_score(y_test, test_pred)
  accuracys=accuracys+accuracy
results1.append(("KNeighborsClassifier", accuracys/100))

In [16]:
results1=pd.DataFrame(results1)
results1=results1.sort_values(by=1,ascending=False)
results1.reset_index(drop=True)

Unnamed: 0,0,1
0,XGB,0.883357
1,SVC,0.879859
2,RandomForest,0.867668
3,GradientBoosting,0.850954
4,KNeighborsClassifier,0.844523
5,MLPClassifier,0.843004
