<a href="https://colab.research.google.com/github/YoaoRF/examen3/blob/main/Examen3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

!pip -q install ucimlrepo
from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# ---- Wine (UCI id=109): Random Forest on three selected features ----

# Fetch the dataset and keep private copies of the feature and target frames.
wine = fetch_ucirepo(id=109)
X, y = wine.data.features.copy(), wine.data.targets.copy()

# Normalise headers: trimmed, lower-case, spaces replaced by "_"
# (feature headers additionally get "-" replaced by "_").
X.columns = [name.strip().lower().replace(' ', '_').replace('-', '_') for name in X.columns]
y.columns = [name.strip().lower().replace(' ', '_') for name in y.columns]

# Restrict to the three predictors used here and take the label column as a Series.
X = X.loc[:, ['alcohol', 'alcalinity_of_ash', 'nonflavanoid_phenols']]
y = y['class']

# 70/30 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

# Seeded random forest with otherwise default parameters; fit() returns the estimator itself.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="macro")

print("Accuracy:", round(accuracy, 3))
print("F1-macro:", round(f1, 3))

# Short reading guide for the two metrics (user-facing text stays in Spanish).
print("\nInterpretación:")
print(f"- Accuracy {accuracy:.3f}: porcentaje total de aciertos en clasificación.")
print(
    f"- F1-macro {f1:.3f}: equilibrio entre precisión y recall en cada clase; "
    "un valor alto indica buen desempeño general sin ignorar clases."
)


Accuracy: 0.722
F1-macro: 0.714

Interpretación:
- Accuracy 0.722: porcentaje total de aciertos en clasificación.
- F1-macro 0.714: equilibrio entre precisión y recall en cada clase; un valor alto indica buen desempeño general sin ignorar clases.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

def pregunta2():
    """Fit a random-forest regressor for ``fractal_dimension3`` and print MAE and R2.

    Reads ``./breast_wisconsin-5.csv`` (semicolon-separated), removes the
    ``COD_identificacion_dni`` column when it is present, uses every remaining
    column as a predictor, holds out 30% of the rows (``random_state=42``) and
    reports both metrics on that hold-out set.

    Returns:
        None. Results are printed. When the target column is missing, a
        message listing the available columns is printed and the function
        returns before any model is trained.
    """
    print("\n=== PREGUNTA 2: Breast Wisconsin / Regressión ===")

    df = pd.read_csv("./breast_wisconsin-5.csv", sep=";")

    target = "fractal_dimension3"
    if target not in df.columns:
        # Report the problem instead of failing with a bare KeyError.
        print(f"No existe la columna '{target}' en el dataset.")
        print("Columnas disponibles:", df.columns.tolist())
        return

    # errors="ignore" keeps the function usable when the file has no identifier
    # column; previously a missing column raised KeyError.
    df = df.drop(columns=["COD_identificacion_dni"], errors="ignore")

    y = df[target]
    X = df.drop(columns=[target])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae:.4f} -> mide el error promedio absoluto")
    print(f"R2: {r2:.4f} -> mide qué tan bien explica el modelo la varianza de los datos")
    print("Interpretación: mientras menor sea MAE y mayor sea R2 (cercano a 1), mejor el modelo.")

# Run the exercise when this cell is executed (in a notebook kernel __name__ is "__main__").
if __name__ == "__main__":
    pregunta2()




=== PREGUNTA 2: Breast Wisconsin / Regressión ===
MAE: 0.0050 -> mide el error promedio absoluto
R2: 0.8276 -> mide qué tan bien explica el modelo la varianza de los datos
Interpretación: mientras menor sea MAE y mayor sea R2 (cercano a 1), mejor el modelo.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

def pregunta3():
    """Train a random-forest classifier for ``str2`` and print accuracy and weighted F1.

    Reads ``./aids_clinical-5.csv`` (semicolon-separated). Every column other
    than the target and the row-identifier columns is used as a predictor,
    after one-hot encoding with ``pd.get_dummies(..., drop_first=True)``.
    20% of the rows are held out (``random_state=42``) for scoring.

    Returns:
        None. Results are printed. When ``str2`` is missing, the available
        columns are printed and the function returns early.
    """
    print("\n=== PREGUNTA 3: AIDS Clinical / Clasificación ===")

    df = pd.read_csv("./aids_clinical-5.csv", sep=";")

    if "str2" not in df.columns:
        print("No existe la columna 'str2' en el dataset.")
        print("Columnas disponibles:", df.columns.tolist())
        return

    y = df["str2"]

    # Row identifiers are not predictors; leaving them in lets the forest key on
    # arbitrary IDs. Dropped only when present, so files without them still work.
    id_columns = ["COD_identificacion_dni", "id", "patient_id"]
    X = df.drop(columns=["str2"]).drop(columns=id_columns, errors="ignore")
    X = pd.get_dummies(X, drop_first=True)

    # NOTE(review): the recorded run of this cell reports Accuracy/F1 = 1.0000,
    # which usually means a remaining predictor encodes the target directly
    # (possible candidates: `strat`, `preanti` -- confirm against the data
    # dictionary) rather than that the model is perfect.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"\nAccuracy: {acc:.4f}")
    print(f"F1-score: {f1:.4f}")
    print("Interpretación:")
    print("Accuracy alto indica buen desempeño general del modelo.")
    print("F1-score alto indica buen equilibrio entre precisión y recall.")

# Run the exercise when this cell is executed (in a notebook kernel __name__ is "__main__").
if __name__ == "__main__":
    pregunta3()


=== PREGUNTA 3: AIDS Clinical / Clasificación ===

Accuracy: 1.0000
F1-score: 1.0000
Interpretación:
Accuracy alto indica buen desempeño general del modelo.
F1-score alto indica buen equilibrio entre precisión y recall.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

def pregunta4():
    """Compare a random forest and an RBF-kernel SVM on the glioma grading data.

    Reads ``./glioma_grading-4.csv`` (semicolon-separated), predicts ``Grade``
    from the remaining columns (identifier columns removed when present,
    predictors one-hot encoded with ``pd.get_dummies(..., drop_first=True)``),
    and scores both models on a stratified 20% hold-out (``random_state=42``).
    The SVM is trained on standardised features; the scaler is fitted on the
    training rows only.

    Returns:
        None. Accuracy and macro-F1 of both models are printed, followed by
        the model with the higher macro-F1, or a tie notice when both are equal.
    """
    print("\n=== PREGUNTA 4: Glioma Grading / Comparación ===")
    df = pd.read_csv("./glioma_grading-4.csv", sep=";")

    y = df['Grade']
    # Identifier columns are dropped only if they exist in this file.
    id_columns = ['COD_identificacion_dni', 'id', 'patient_id']
    X = df.drop(columns=['Grade']).drop(columns=id_columns, errors='ignore')
    X = pd.get_dummies(X, drop_first=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42, stratify=y
    )

    # Random forest on the raw feature values.
    rf = RandomForestClassifier(n_estimators=200, random_state=42)
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)
    acc_rf = accuracy_score(y_test, y_pred_rf)
    f1_rf = f1_score(y_test, y_pred_rf, average='macro')

    # SVM on standardised features; the test rows reuse the training statistics.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)
    svc = SVC(kernel='rbf', random_state=42)
    svc.fit(X_train_s, y_train)
    y_pred_svc = svc.predict(X_test_s)
    acc_svc = accuracy_score(y_test, y_pred_svc)
    f1_svc = f1_score(y_test, y_pred_svc, average='macro')

    print("Random Forest -> Accuracy:", round(acc_rf, 3), "F1:", round(f1_rf, 3))
    print("SVM -> Accuracy:", round(acc_svc, 3), "F1:", round(f1_svc, 3))

    # An exact tie used to be reported as an SVM win; name it explicitly instead.
    if f1_rf > f1_svc:
        better = "Random Forest"
    elif f1_svc > f1_rf:
        better = "SVM"
    else:
        better = "Empate (ambos modelos obtienen el mismo F1)"
    print("Mejor algoritmo según F1:", better)

# Run the exercise when this cell is executed (in a notebook kernel __name__ is "__main__").
if __name__ == "__main__":
    pregunta4()


=== PREGUNTA 4: Glioma Grading / Comparación ===
Random Forest -> Accuracy: 0.72 F1: 0.681
SVM -> Accuracy: 0.732 F1: 0.703
Mejor algoritmo según F1: SVM


In [None]:
import pandas as pd
from scipy.stats import pearsonr

def pregunta5():
    """Print the Pearson correlation between ``preanti`` and ``wtkg``.

    Reads ``./aids_clinical-5.csv`` (semicolon-separated), coerces both columns
    to numeric, keeps only the rows where both values are present, and prints
    the correlation coefficient, its p-value and the sign of the relationship.

    Returns:
        None. Results are printed. The function returns early with a message
        when either column is missing, or when the correlation is undefined
        (fewer than two valid pairs, or one of the variables is constant).
    """
    print("\n=== PREGUNTA 5: Correlación AIDS ===")
    df = pd.read_csv("./aids_clinical-5.csv", sep=";")

    if not {'preanti', 'wtkg'}.issubset(df.columns):
        print("No se encuentran columnas 'preanti' o 'wtkg'")
        return

    # Non-numeric entries become NaN and are discarded pairwise below.
    x = pd.to_numeric(df['preanti'], errors='coerce')
    y = pd.to_numeric(df['wtkg'], errors='coerce')
    valid = x.notna() & y.notna()
    x, y = x[valid], y[valid]

    # pearsonr raises on fewer than two observations and yields NaN for a
    # constant input; NaN previously fell through to the "nula" branch.
    if len(x) < 2 or x.nunique() < 2 or y.nunique() < 2:
        print("No se puede calcular la correlación: se requieren al menos dos "
              "observaciones válidas y que ninguna variable sea constante.")
        return

    corr, pval = pearsonr(x, y)
    print(f"Correlación de Pearson: {corr:.3f}")
    print(f"p-value: {pval:.4f}")

    if corr > 0:
        sentido = "positiva"
    elif corr < 0:
        sentido = "negativa"
    else:
        sentido = "nula"

    print(f"La correlación es {sentido}.")
    print("Positiva: ambas variables aumentan juntas.")
    print("Negativa: cuando una sube, la otra tiende a bajar.")

# Run the exercise when this cell is executed (in a notebook kernel __name__ is "__main__").
if __name__ == "__main__":
    pregunta5()


=== PREGUNTA 5: Correlación AIDS ===
Correlación de Pearson: -0.079
p-value: 0.0002
La correlación es negativa.
Positiva: ambas variables aumentan juntas.
Negativa: cuando una sube, la otra tiende a bajar.
