In [None]:
%pip install pandas numpy matplotlib scikit-learn

## **Modelo de Regresión Logística**
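Este notebook entrena un modelo de regresión logística de dos formas —una implementación manual con descenso de gradiente y la clase **LogisticRegression** de Scikit-Learn— usando el dataset de cáncer de mama "Breast Cancer Wisconsin (Diagnostic) Data Set".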


***Modelo Implementado manualmente***

In [8]:
import pandas as pd
import numpy as np
import time as t
import random
from printer import printCasos

# Load the dataset. The label is the "diagnosis" column; the features are
# every column from position 2 onward.
# NOTE(review): absolute, user-specific path — the notebook only runs where
# this file exists.
df = pd.read_csv("/home/angelagar/MCDO/breast-cancer.csv")
y = df["diagnosis"]
x = df.iloc[:, 2:]

# Encode the label as 1 for 'M' and 0 for 'B', then standardize each feature
# column (subtract its mean, divide by its standard deviation).
# NOTE(review): the mean/std are computed on the full dataset, i.e. before
# the train/test split below — confirm this is intended.
y = y.map({'M': 1, 'B': 0})
x = (x - x.mean()) / x.std()
# Number of labels that are not 1. In the code shown here it is only
# referenced by the commented-out print on the next line.
yneg = sum(1 for val in y if val != 1)
#print(yneg)

# Initial intercept and training hyperparameters handed to entrenar():
#   vars_x - number of feature columns
#   loops  - iteration budget
#   a      - step size applied to the gradients
#   l_reg  - coefficient of the weight-penalty term in the gradient
#   lossi  - loss-change threshold used as the stopping criterion
inter = 0.0
vars_x = x.shape[1]
loops = 1000
a = 0.01
l_reg = 0.01
lossi= 1e-4

# Shuffle the row positions with a fixed seed so the split is reproducible.
m = len(y)
idx = list(range(m))
random.seed(56)
random.shuffle(idx)

# First 70% of the shuffled positions go to train, the rest to test.
split = int(0.7 * m)
x_train, y_train = x.iloc[idx[:split]], y.iloc[idx[:split]]
x_test,  y_test  = x.iloc[idx[split:]], y.iloc[idx[split:]]
# Initial weights: one uniform draw in [-1, 1] per feature. These draws come
# from the same seeded generator, after the shuffle, so the order of the
# statements above determines the starting weights.
wi = [random.uniform(-1, 1) for _ in range(vars_x)]

def sigmoide(z):
    """Logistic function: map z to 1 / (1 + e^(-z)).

    Works on anything np.exp accepts (a scalar or a NumPy array) and returns
    a value of the matching shape.
    """
    denominador = 1 + np.exp(-z)
    return 1 / denominador

# Per-class sample counts in the training labels (1 first, then 0) and the
# matching error weights: len(y_train) / (2 * class count). entrenar() reads
# wm and wb from module scope.
numM, numB = (y_train.eq(etiqueta).sum() for etiqueta in (1, 0))
wm, wb = (len(y_train) / (2 * conteo) for conteo in (numM, numB))

def entrenar(x, y, wi, inter, loops, a, m, l_reg, lossi):
    """Fit logistic-regression weights with class-weighted batch gradient descent.

    Each iteration computes the prediction error for every row, scales it by
    the module-level class weight (`wm` where the label is 1, `wb` otherwise),
    and takes one gradient step on the intercept and on the weights. Every 50
    iterations (and on the last one) the unweighted log-loss and the accuracy
    on `x`/`y` are printed; training stops early when the loss changed by less
    than `lossi` since the previous report.

    The whole epoch is computed with NumPy array operations instead of
    row-by-row Python loops, and the number of features is taken from `wi`
    rather than from a notebook global. The update rule is unchanged, so the
    results can differ from a row-wise evaluation only by floating-point
    rounding.

    Parameters:
        x: pandas DataFrame of features, one row per sample.
        y: labels (1 / 0). A pandas Series is aligned to the rows of `x` by
           index label; any other sequence is taken positionally.
        wi: initial weights, one per column of `x`. Not modified.
        inter: initial intercept.
        loops: iteration budget; `loops + 1` updates are attempted.
        a: step size applied to the gradients.
        m: divisor applied to the accumulated gradients (the caller passes
           the number of training rows).
        l_reg: coefficient of the weight-penalty term. The term is added once
           per row, so before the division by `m` it contributes
           `rows * l_reg * w`.
        lossi: stop when |previous reported loss - loss| < lossi; `None`
           disables early stopping.

    Returns:
        (wi, inter): the fitted weights as a list and the fitted intercept.

    Relies on `sigmoide`, `wm` and `wb` being defined at module level.
    """
    # Pull the data out of pandas once so each epoch is plain array math.
    X = x.to_numpy(dtype=float)
    y_alineado = y.loc[x.index] if hasattr(y, "loc") else y
    y_arr = np.asarray(y_alineado, dtype=float)
    w = np.asarray(wi, dtype=float)
    n_filas = X.shape[0]
    # Per-row error multiplier; it is loop-invariant, so build it once.
    pesos_clase = np.where(y_arr == 1, wm, wb)

    prevloss = float('inf')
    for loop in range(loops + 1):
        err = (sigmoide(inter + X @ w) - y_arr) * pesos_clase
        ginter = err.sum() / m
        # The penalty term is accumulated once per row, hence the n_filas factor.
        gpeso = (X.T @ err + n_filas * l_reg * w) / m
        inter -= a * ginter
        # Rebind instead of updating in place so the caller's `wi` is never mutated.
        w = w - a * gpeso

        if loop % 50 == 0 or loop == loops:
            prob = sigmoide(inter + X @ w)
            # 1e-8 keeps log() finite when a probability saturates at 0 or 1.
            loss = -np.mean(y_arr * np.log(prob + 1e-8) + (1 - y_arr) * np.log(1 - prob + 1e-8))
            accuracy = ((prob >= 0.5).astype(int) == y_arr).sum() / len(y_arr) * 100
            print(f"Iteración {loop}: pérdida = {loss:.5f}, exactitud = {accuracy:.4f}%")

            if lossi is not None and abs(prevloss - loss) < lossi:
                print(f"Convergencia alcanzada en iteración {loop}, con pérdida de {loss:.5f}")
                break
            prevloss = loss

    # Hand the weights back as a list, matching what callers received before.
    return list(w), inter

def predecir(x, wi, inter):
    """Score every row of a feature DataFrame with the fitted model.

    Parameters:
        x: pandas DataFrame; its columns are paired with `wi` in order.
        wi: model weights, one per column.
        inter: model intercept.

    Returns:
        (probabilidades, predicciones): two lists in row order — the sigmoid
        output for each row, and 1 where that output is >= 0.5, else 0.
    """
    probabilidades = []
    for _, fila in x.iterrows():
        terminos = (fila[col] * peso for peso, col in zip(wi, x.columns))
        probabilidades.append(sigmoide(inter + sum(terminos)))
    predicciones = [int(p >= 0.5) for p in probabilidades]
    return probabilidades, predicciones

def predecir_caso(fila, wi, inter):
    """Score a single row and label it "Maligno" or "Benigno".

    The feature names are taken from the module-level frame `x` (its columns,
    paired in order with `wi`), so `fila` must be indexable by those names.

    Returns:
        (prob, pred): the sigmoid output and "Maligno" when it is >= 0.5,
        otherwise "Benigno".
    """
    terminos = (fila[col] * peso for peso, col in zip(wi, x.columns))
    prob = sigmoide(inter + sum(terminos))
    if prob >= 0.5:
        return prob, "Maligno"
    return prob, "Benigno"

# Train the hand-written model and report the wall-clock training time.
timer1 = t.time()
wi, inter = entrenar(
    x_train, y_train, wi, inter, loops, a, len(y_train), l_reg, lossi
)
timer2 = t.time()
timertot = timer2 - timer1
print(f"Tardo en entrenar {timertot:.5f} segundos")

# Score both partitions (train first, then test) with the fitted parameters.
(prob_train, pred_train), (prob_test, pred_test) = (
    predecir(particion, wi, inter) for particion in (x_train, x_test)
)

# Fraction of predictions that equal the true label in each partition.
exact_train, exact_test = (
    sum(pred == true for pred, true in zip(preds, reales)) / len(reales)
    for preds, reales in ((pred_train, y_train), (pred_test, y_test))
)

print(f"Exactitud en train: {exact_train*100:.2f}%")
print(f"Exactitud en test: {exact_test*100:.2f}%")


# Hand the fitted model, the full data, a few row positions and an output
# path to the project helper printCasos.
# NOTE(review): printCasos lives in the local `printer` module; presumably it
# reports these cases and writes the figure at `ruta` — confirm.
indices = [41, 106, 484, 538]
ruta = f"/home/angelagar/MCDO/curva{loops}loops.png"
printCasos(wi, inter, x, y, indices, ruta, loops)


Iteración 0: pérdida = 1.42051, exactitud = 28.3208%
Iteración 50: pérdida = 0.55715, exactitud = 73.9348%
Iteración 100: pérdida = 0.37538, exactitud = 85.7143%
Iteración 150: pérdida = 0.29738, exactitud = 88.7218%
Iteración 200: pérdida = 0.25253, exactitud = 90.2256%
Iteración 250: pérdida = 0.22269, exactitud = 91.4787%
Iteración 300: pérdida = 0.20113, exactitud = 91.7293%
Iteración 350: pérdida = 0.18472, exactitud = 91.9799%
Iteración 400: pérdida = 0.17182, exactitud = 92.7318%
Iteración 450: pérdida = 0.16140, exactitud = 93.2331%
Iteración 500: pérdida = 0.15281, exactitud = 93.9850%
Iteración 550: pérdida = 0.14559, exactitud = 94.7368%
Iteración 600: pérdida = 0.13943, exactitud = 94.9875%
Iteración 650: pérdida = 0.13410, exactitud = 95.2381%
Iteración 700: pérdida = 0.12943, exactitud = 95.2381%
Iteración 750: pérdida = 0.12531, exactitud = 95.4887%
Iteración 800: pérdida = 0.12164, exactitud = 95.7393%
Iteración 850: pérdida = 0.11835, exactitud = 95.7393%
Iteración 900

***Modelo de Regresión Logística con Scikit-Learn***

In [9]:
import pandas as pd
import numpy as np
import random
import time as t
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, log_loss

# Load the dataset: features are every column from position 2 onward and the
# label maps 'M' -> 1, 'B' -> 0.
df = pd.read_csv("/home/angelagar/MCDO/breast-cancer.csv")
x = df.iloc[:, 2:]
y = df["diagnosis"].map({'M': 1, 'B': 0})

# Standardize the features, keeping the original column names.
scaler = StandardScaler()
x_scaled = pd.DataFrame(scaler.fit_transform(x), columns=x.columns)

# Seeded shuffle of the row positions, then a 70/30 train/test split.
m = len(y)
random.seed(56)
idx = list(range(m))
random.shuffle(idx)
split = int(0.7 * m)

x_train, x_test = x_scaled.iloc[idx[:split]], x_scaled.iloc[idx[split:]]
y_train, y_test = y.iloc[idx[:split]], y.iloc[idx[split:]]

# Build and fit the scikit-learn model; the timer brackets construction + fit.
loops = 10000
start = t.time()

model = LogisticRegression(
    solver='saga', penalty='l2', C=1/0.01,
    max_iter=loops, tol=1e-4, random_state=56,
)

model.fit(x_train, y_train)
end = t.time()

# Probability of class 1 per row, thresholded at 0.5 for the hard prediction.
prob_train, prob_test = (
    model.predict_proba(particion)[:, 1] for particion in (x_train, x_test)
)
pred_train, pred_test = (
    (probs >= 0.5).astype(int) for probs in (prob_train, prob_test)
)

exact_train = accuracy_score(y_train, pred_train)
exact_test = accuracy_score(y_test, pred_test)
loss_train = log_loss(y_train, prob_train)
loss_test = log_loss(y_test, prob_test)

print(f"Tardó en entrenar {end-start:.5f} segundos")
print(f"Pérdida train: {loss_train:.5f}, Pérdida test: {loss_test:.5f}")
print(f"Exactitud train: {exact_train*100:.2f}%")
print(f"Exactitud test: {exact_test*100:.2f}%")

# Report a few individual rows, recomputing the probability by hand from the
# fitted intercept and coefficients.
indices = [41, 106, 484, 538]
for i in indices:
    fila = x_scaled.iloc[i]
    z = model.intercept_[0] + np.dot(model.coef_[0], fila)
    prob = 1 / (1 + np.exp(-z))
    if prob >= 0.5:
        pred = "Maligno"
    else:
        pred = "Benigno"
    if y[i] == 1:
        real = "Maligno"
    else:
        real = "Benigno"
    print(f"Fila {i+1}: Probabilidad={prob*100:.4f}%, Predicción={pred}, Real={real}")



Tardó en entrenar 0.72449 segundos
Pérdida train: 0.01025, Pérdida test: 0.29506
Exactitud train: 99.75%
Exactitud test: 95.91%
Fila 42: Probabilidad=99.8199%, Predicción=Maligno, Real=Maligno
Fila 107: Probabilidad=0.2473%, Predicción=Benigno, Real=Benigno
Fila 485: Probabilidad=0.0421%, Predicción=Benigno, Real=Benigno
Fila 539: Probabilidad=0.0000%, Predicción=Benigno, Real=Benigno
