## Importing necessary libraries

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Data loading

In [13]:
# Cargar datos del archivo csv
df = pd.read_csv('Smoker_Epigenetic.csv')

# Mostrar los primeros 5 registros
df.head()

Unnamed: 0,GSM,Smoking Status,Gender,Age,cg00050873,cg00212031,cg00213748,cg00214611,cg00455876,cg01707559,...,cg02494853,cg02839557,cg02842889,cg03052502,cg03155755,cg03244189,cg03443143,cg03683899,cg03695421,cg03706273
0,GSM1051525,current,f,67,0.607563,0.422843,0.372455,0.621562,0.290777,0.267143,...,0.06707,0.246993,0.46924,0.400247,0.415031,0.221433,0.475826,0.207724,0.209197,0.129983
1,GSM1051526,current,f,49,0.345054,0.568662,0.5006,0.498607,0.374591,0.190274,...,0.046939,0.236742,0.307467,0.377031,0.397371,0.217122,0.544469,0.184446,0.193773,0.098533
2,GSM1051527,current,f,53,0.32135,0.360909,0.352732,0.373824,0.230674,0.314705,...,0.038237,0.244612,0.357753,0.305044,0.521278,0.185049,0.53706,0.393123,0.268003,0.040248
3,GSM1051528,current,f,62,0.277268,0.304437,0.475235,0.486258,0.295181,0.295793,...,0.026716,0.001641,0.445739,0.271475,0.434492,0.165419,0.507917,0.281209,0.217857,0.101516
4,GSM1051529,never,f,33,0.413599,0.131251,0.367545,0.761167,0.23577,0.250526,...,0.037016,0.33432,0.39504,0.326553,0.430097,0.181135,0.405479,0.310794,0.280071,0.077857


## Data preparation

In [14]:
# Eliminar la columna 'GSM' ya que no es relevante para el análisis
df = df.drop(['GSM'], axis=1)

# Normalizar los valores de la columna 'Género' para que sean uniformes
df['Gender'] = df['Gender'].str.lower()

# Imputación de valores faltantes
imp = SimpleImputer(strategy='mean')
df.iloc[:, 3:] = imp.fit_transform(df.iloc[:, 3:])

# Codificación de variables categóricas
label_encoder = LabelEncoder()
df['Smoking Status'] = label_encoder.fit_transform(df['Smoking Status'])
df['Gender'] = label_encoder.fit_transform(df['Gender'])

In [15]:
#Verificar el balance de clases
print(df['Smoking Status'].value_counts())

0    490
1    193
Name: Smoking Status, dtype: int64


In [16]:
# Verificar los valores codificados
print(df['Gender'].unique())  # Debería mostrar 0 y 1
print(df['Smoking Status'].unique())

df.head()

[0 1]
[0 1]


Unnamed: 0,Smoking Status,Gender,Age,cg00050873,cg00212031,cg00213748,cg00214611,cg00455876,cg01707559,cg02004872,...,cg02494853,cg02839557,cg02842889,cg03052502,cg03155755,cg03244189,cg03443143,cg03683899,cg03695421,cg03706273
0,0,0,67,0.607563,0.422843,0.372455,0.621562,0.290777,0.267143,0.179144,...,0.06707,0.246993,0.46924,0.400247,0.415031,0.221433,0.475826,0.207724,0.209197,0.129983
1,0,0,49,0.345054,0.568662,0.5006,0.498607,0.374591,0.190274,0.155977,...,0.046939,0.236742,0.307467,0.377031,0.397371,0.217122,0.544469,0.184446,0.193773,0.098533
2,0,0,53,0.32135,0.360909,0.352732,0.373824,0.230674,0.314705,0.105745,...,0.038237,0.244612,0.357753,0.305044,0.521278,0.185049,0.53706,0.393123,0.268003,0.040248
3,0,0,62,0.277268,0.304437,0.475235,0.486258,0.295181,0.295793,0.111286,...,0.026716,0.001641,0.445739,0.271475,0.434492,0.165419,0.507917,0.281209,0.217857,0.101516
4,1,0,33,0.413599,0.131251,0.367545,0.761167,0.23577,0.250526,0.169108,...,0.037016,0.33432,0.39504,0.326553,0.430097,0.181135,0.405479,0.310794,0.280071,0.077857


## Splitting the data into training and test sets

In [17]:
# Definir variables predictoras y objetivo
X = df.drop(['Smoking Status'], axis=1)
y = df['Smoking Status']

# División de los datos
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Feature selection and reduction of attributes

In [18]:
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif

# Variance Threshold
selector = VarianceThreshold(threshold=0.0)
X_train = selector.fit_transform(X_train)
X_test = selector.transform(X_test)

# SelectKBest
k_best = SelectKBest(f_classif, k=10)
X_train = k_best.fit_transform(X_train, y_train)
X_test = k_best.transform(X_test)

# PCA
pca = PCA(n_components=10)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)


## Training and Evaluation of the Base Model

In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Modelo de Regresión Logística para Clasificación
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluación del modelo base
metrics_log_reg = {
    'Exactitud': accuracy_score(y_test, y_pred),
    'Precisión': precision_score(y_test, y_pred, average='weighted'),
    'Recall': recall_score(y_test, y_pred, average='weighted'),
    'F1-Score': f1_score(y_test, y_pred, average='weighted')
}

## Optimization and Evaluation of the Model

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Modelos Avanzados
models = {
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()
}

# Búsqueda de hiperparámetros
param_grid_rf = {
    'n_estimators': [100, 200, 300], 
    'max_depth': [None, 10, 20, 30]
}
param_grid_svm = {'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01]}

grid_rf = GridSearchCV(models['Random Forest'], param_grid_rf, cv=5)
grid_svm = GridSearchCV(models['SVM'], param_grid_svm, cv=5)

grid_rf.fit(X_train, y_train)
grid_svm.fit(X_train, y_train)

# Evaluación de los mejores modelos
best_rf = grid_rf.best_estimator_
best_svm = grid_svm.best_estimator_

y_pred_rf = best_rf.predict(X_test)
y_pred_svm = best_svm.predict(X_test)

metrics_rf = {
    'Exactitud': accuracy_score(y_test, y_pred_rf),
    'Precisión': precision_score(y_test, y_pred_rf, average='weighted'),
    'Recall': recall_score(y_test, y_pred_rf, average='weighted'),
    'F1-Score': f1_score(y_test, y_pred_rf, average='weighted')
}

metrics_svm = {
    'Exactitud': accuracy_score(y_test, y_pred_svm),
    'Precisión': precision_score(y_test, y_pred_svm, average='weighted'),
    'Recall': recall_score(y_test, y_pred_svm, average='weighted'),
    'F1-Score': f1_score(y_test, y_pred_svm, average='weighted')
}

In [21]:
# Crear un DataFrame para comparar las métricas
df_metrics = pd.DataFrame([metrics_log_reg, metrics_rf, metrics_svm], 
                          index=['Regresión Logística', 'Random Forest (Optimizado)', 'SVM (Optimizado)'])

# Mostrar la tabla comparativa
print(df_metrics)

                            Exactitud  Precisión    Recall  F1-Score
Regresión Logística          0.737226   0.753395  0.737226  0.655122
Random Forest (Optimizado)   0.737226   0.710399  0.737226  0.704086
SVM (Optimizado)             0.729927   0.704712  0.729927  0.659301
