<a href="https://colab.research.google.com/github/Yur58/Fundamentos-de-Inteligencia-artificial/blob/main/ACTIVIDAD%203/Train_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [4]:
# Importar librerías
from ucimlrepo import fetch_ucirepo #dataset del repositorio UCI
from sklearn.model_selection import train_test_split #dividir datos en entrenamiento y pruea
from sklearn.linear_model import LogisticRegression #modelo regresión logis
from sklearn.ensemble import RandomForestClassifier #modelo Random Forest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix #metricas
import pandas as pd

# Seed reused by the train/test splits and the Random Forest so results are reproducible.
_random_state = 40

# Download the Adult dataset (id=2 in the UCI repository).
adult = fetch_ucirepo(id=2)

# Predictor variables, plus a one-hot encoded copy in which every categorical
# column has been expanded into numeric indicator columns.
X = adult.data.features
X_encoded = pd.get_dummies(X)

# Target: first column of the targets frame, with surrounding whitespace
# stripped and every literal "." removed from the labels.
y = (
    adult.data.targets.iloc[:, 0]
    .str.strip()
    .str.replace(".", "", regex=False)
)

# Report the feature dimensions before/after encoding and the cleaned class labels.
print("Shape original:", X.shape)
print("Shape codificado:", X_encoded.shape)
print("Clases en y:", y.unique())

#################################
# Case A: Logistic Regression

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in the train and test partitions.
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=_random_state,
    stratify=y,
)

# NOTE(review): the encoded features are passed in unscaled; regularized
# logistic regression is usually sensitive to feature scale, so standardizing
# the numeric columns may be worth evaluating — TODO confirm.
model_A = LogisticRegression(max_iter=200, solver="liblinear")
model_A.fit(X_train_A, y_train_A)

y_pred_A = model_A.predict(X_test_A)

# Metrics: precision, recall and F1 use the 'weighted' average across classes.
acc_A = accuracy_score(y_test_A, y_pred_A)
prec_A = precision_score(y_test_A, y_pred_A, average='weighted')
rec_A = recall_score(y_test_A, y_pred_A, average='weighted')
f1_A = f1_score(y_test_A, y_pred_A, average='weighted')

# Confusion matrix. The same label list is passed to confusion_matrix and used
# for the DataFrame axes, so row/column names are guaranteed to match the
# matrix order and shape instead of relying on the function's inferred labels.
labels_A = sorted(y.unique())
cm_A = confusion_matrix(y_test_A, y_pred_A, labels=labels_A)
cm_df_A = pd.DataFrame(cm_A, index=labels_A, columns=labels_A)

print("-"*30)
print("CASO A: Logistic Regression")
print("Accuracy:", acc_A)
print("Precision (ponderada):", prec_A)
print("Recall (ponderado):", rec_A)
print("F1-score (ponderado):", f1_A)
print("\nMatriz de confusión:")
print(cm_df_A)
print("")

#################################
# Case B: Random Forest

# Hold out 20% of the rows for testing, stratified on y.
# NOTE(review): this call uses the same arguments and seed as the Case A split.
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=_random_state,
    stratify=y,
)

# Forest of 200 trees, seeded so repeated runs build the same model.
model_B = RandomForestClassifier(n_estimators=200, random_state=_random_state)
model_B.fit(X_train_B, y_train_B)

y_pred_B = model_B.predict(X_test_B)

# Metrics: precision, recall and F1 use the 'weighted' average across classes.
acc_B = accuracy_score(y_test_B, y_pred_B)
prec_B = precision_score(y_test_B, y_pred_B, average='weighted')
rec_B = recall_score(y_test_B, y_pred_B, average='weighted')
f1_B = f1_score(y_test_B, y_pred_B, average='weighted')

# Confusion matrix. The same label list is passed to confusion_matrix and used
# for the DataFrame axes, so row/column names are guaranteed to match the
# matrix order and shape instead of relying on the function's inferred labels.
labels_B = sorted(y.unique())
cm_B = confusion_matrix(y_test_B, y_pred_B, labels=labels_B)
cm_df_B = pd.DataFrame(cm_B, index=labels_B, columns=labels_B)

print("-"*30)
print("CASO B: Random Forest Classifier")
print("Accuracy:", acc_B)
print("Precision (ponderada):", prec_B)
print("Recall (ponderado):", rec_B)
print("F1-score (ponderado):", f1_B)
print("\nMatriz de confusión:")
print(cm_df_B)
print("")

Shape original: (48842, 14)
Shape codificado: (48842, 108)
Clases en y: ['<=50K' '>50K']
------------------------------
CASO A: Logistic Regression
Accuracy: 0.8022315487767427
Precision (ponderada): 0.7919015687976714
Recall (ponderado): 0.8022315487767427
F1-score (ponderado): 0.7648361648623994

Matriz de confusión:
       <=50K  >50K
<=50K   7212   219
>50K    1713   625

------------------------------
CASO B: Random Forest Classifier
Accuracy: 0.8572013512130208
Precision (ponderada): 0.8519273026314327
Recall (ponderado): 0.8572013512130208
F1-score (ponderado): 0.8532390347095048

Matriz de confusión:
       <=50K  >50K
<=50K   6902   529
>50K     866  1472

