# importaciones

In [116]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, make_scorer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import mglearn
import pandas as pd
import numpy as np

# 1. Ejercicio 1

## 1.1. Con GridSearch y Pipelines

### 1.1.1. Breast Cancer

In [117]:
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3, random_state=10)

#### 1.1.1.1. K-NN

In [118]:
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

param_grid_knn = {
    'knn__n_neighbors': [3, 5, 7, 9],
    'knn__weights': ['uniform', 'distance']
}

grid_search_knn = GridSearchCV(knn_pipeline, param_grid_knn, scoring=make_scorer(recall_score), cv=5)
grid_search_knn.fit(X_train, y_train)
best_knn = grid_search_knn.best_estimator_

#### 1.1.1.2. Logistic Regression

In [119]:
logreg_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(solver='liblinear'))
])

param_grid_logreg = {
    'logreg__C': [0.01, 0.1, 1, 10, 100]
}

grid_search_logreg = GridSearchCV(logreg_pipeline, param_grid_logreg, scoring=make_scorer(recall_score), cv=5)
grid_search_logreg.fit(X_train, y_train)
best_logreg = grid_search_logreg.best_estimator_

#### 1.1.1.3. Resultados

In [120]:
# Evaluación de K-NN
y_pred_knn = best_knn.predict(X_test)
knn_precision = precision_score(y_test, y_pred_knn)
knn_recall = recall_score(y_test, y_pred_knn)
knn_f1 = f1_score(y_test, y_pred_knn)
knn_auc = roc_auc_score(y_test, best_knn.predict_proba(X_test)[:, 1])

# Evaluación de Logistic Regression
y_pred_logreg = best_logreg.predict(X_test)
logreg_precision = precision_score(y_test, y_pred_logreg)
logreg_recall = recall_score(y_test, y_pred_logreg)
logreg_f1 = f1_score(y_test, y_pred_logreg)
logreg_auc = roc_auc_score(y_test, best_logreg.predict_proba(X_test)[:, 1])

columns = ["Modelo", "precicion", "recall", "f1 score", "AUC"]
knnResults = ["K-NN", knn_precision, knn_recall, knn_f1, knn_auc]
logRegResults = ["Logistic Regression", logreg_precision, logreg_recall, logreg_f1, logreg_auc]

#Resultados dataframe
pd.DataFrame( [knnResults, logRegResults], columns=columns)



Unnamed: 0,Modelo,precicion,recall,f1 score,AUC
0,K-NN,0.991071,0.991071,0.991071,0.998108
1,Logistic Regression,0.990991,0.982143,0.986547,0.999243


### 1.1.2. Boston Housing

In [121]:
# Cargar el dataset
X, y = mglearn.datasets.load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### 1.1.2.1. K-NN

In [122]:
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor())
])

param_grid_knn = {
    'knn__n_neighbors': [3, 5, 7, 9],
    'knn__weights': ['uniform', 'distance']
}

grid_search_knn = GridSearchCV(knn_pipeline, param_grid_knn, scoring=make_scorer(recall_score), cv=5)
grid_search_knn.fit(X_train, y_train)
best_knn_bos = grid_search_knn.best_estimator_


#### 1.1.2.2. Linear Regression

In [123]:
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LinearRegression())
])

lr_manual = LinearRegression()
lr_manual.fit(X_train, y_train) #ojo
lr_pipeline.fit(X_train, y_train)
y_pred_lr_manual = lr_manual.predict(X_test)
lr_manual_r2 = r2_score(y_test, y_pred_lr_manual)
lr_manual_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr_manual))


#### 1.1.2.3. Resultados

In [124]:
# Función para calcular MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# K-NN
y_pred_knn_reg = best_knn_bos.predict(X_test)
knn_mape = mean_absolute_percentage_error(y_test, y_pred_knn_reg)
knn_mae = mean_absolute_error(y_test, y_pred_knn_reg)
knn_mse = mean_squared_error(y_test, y_pred_knn_reg)
knn_rmse = np.sqrt(knn_mse)
knn_r2 = r2_score(y_test, y_pred_knn_reg)

# Regresión Lineal
y_pred_lr = lr_pipeline.predict(X_test)
lr_mape = mean_absolute_percentage_error(y_test, y_pred_lr)
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)
lr_r2 = r2_score(y_test, y_pred_lr)

columns = ["Modelo", "MAPE", "MAE", "RMSE", "MSE", "R²"]
knnResults = ["K-NN", knn_mape, knn_mae, knn_mse, knn_rmse, knn_r2]
logRegResults = ["Linear Regression", lr_mape, lr_mae, lr_mse, lr_rmse, lr_r2 ]

#Resultados dataframe
pd.DataFrame( [knnResults, logRegResults], columns=columns)


Unnamed: 0,Modelo,MAPE,MAE,RMSE,MSE,R²
0,K-NN,13.718958,2.795395,18.257405,4.272868,0.754977
1,Linear Regression,17.112288,3.06495,25.25754,5.025688,0.661032


## 1.2. Manualmenteo con ciclos

### 1.2.1. Breast Cancer

#### 1.2.1.1. K-NN

In [125]:
best_score = 0
for n_neighbors in [3, 5, 7, 9]:
    knn = KNeighborsRegressor(n_neighbors=n_neighbors)        
    scores = cross_val_score(knn, X_train, y_train, cv=5) 
    score = np.mean(scores)
    if score > best_score:
        best_score = score
        best_parameters = {'n_neighbors': n_neighbors}

knn = KNeighborsRegressor(**best_parameters)
knn.fit(X_train, y_train)

#### 1.2.1.2. Logistic Regression

In [126]:
cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3, random_state=10)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

best_score_logreg = 0
best_C = 0

for C in [0.01, 0.1, 1, 10, 100]:
    logreg = LogisticRegression(C=C, solver='liblinear')
    scores = cross_val_score(logreg, X_train, y_train, cv=5, scoring='recall')
    if scores.mean() > best_score_logreg:
        best_score_logreg = scores.mean()
        best_C = C

logreg_manual = LogisticRegression(C=best_C, solver='liblinear')
logreg_manual.fit(X_train, y_train)


#### 1.2.1.3. Resultados

In [127]:
# Evaluación de K-NN
y_pred_knn = best_knn.predict(X_test)
knn_precision = precision_score(y_test, y_pred_knn)
knn_recall = recall_score(y_test, y_pred_knn)
knn_f1 = f1_score(y_test, y_pred_knn)
knn_auc = roc_auc_score(y_test, best_knn.predict_proba(X_test)[:, 1])

# Evaluación de Logistic Regression
y_pred_logreg = best_logreg.predict(X_test)
logreg_precision = precision_score(y_test, y_pred_logreg)
logreg_recall = recall_score(y_test, y_pred_logreg)
logreg_f1 = f1_score(y_test, y_pred_logreg)
logreg_auc = roc_auc_score(y_test, best_logreg.predict_proba(X_test)[:, 1])

columns = ["Modelo", "precicion", "recall", "f1 score", "AUC"]
knnResults = ["K-NN", knn_precision, knn_recall, knn_f1, knn_auc]
logRegResults = ["Logistic Regression", logreg_precision, logreg_recall, logreg_f1, logreg_auc]

#Resultados dataframe
pd.DataFrame( [knnResults, logRegResults], columns=columns)


Unnamed: 0,Modelo,precicion,recall,f1 score,AUC
0,K-NN,0.991071,0.991071,0.991071,0.998108
1,Logistic Regression,0.990991,0.982143,0.986547,0.999243


### 1.2.2. Boston Housing

#### 1.2.2.1. K-NN

In [135]:
best_k = 0
best_rmse = float('inf')

for k in [3, 5, 7, 9]:
    knn_reg = KNeighborsClassifier(n_neighbors=k)
    knn_reg.fit(X_train, y_train)
    y_pred = knn_reg.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    if rmse < best_rmse:
        best_rmse = rmse
        best_k = k

knn_manual_reg = KNeighborsClassifier(n_neighbors=best_k)
knn_manual_reg.fit(X_train, y_train)
knn_manual_r2 = r2_score(y_test, knn_manual_reg.predict(X_test))
knn_manual_rmse = np.sqrt(mean_squared_error(y_test, knn_manual_reg.predict(X_test)))

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [139]:
print(X_train.shape)
print(X_test.shape)
print(X_test_scaled.shape)

(398, 30)
(171, 30)
(171, 30)


#### 1.2.2.2 Linear Regression

In [129]:
lr_manual = LinearRegression()
lr_manual.fit(X_train, y_train)
y_pred_lr_manual = lr_manual.predict(X_test)
lr_manual_r2 = r2_score(y_test, y_pred_lr_manual)
lr_manual_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr_manual))


#### 1.2.2.3. Resultados

In [136]:
# Función para calcular MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# K-NN
y_pred_knn_reg = knn_manual_reg.predict(X_test_scaled)
knn_mape = mean_absolute_percentage_error(y_test, y_pred_knn_reg)
knn_mae = mean_absolute_error(y_test, y_pred_knn_reg)
knn_mse = mean_squared_error(y_test, y_pred_knn_reg)
knn_rmse = np.sqrt(knn_mse)
knn_r2 = r2_score(y_test, y_pred_knn_reg)

# Regresión Lineal
y_pred_lr = lr_pipeline.predict(X_test)
lr_mape = mean_absolute_percentage_error(y_test, y_pred_lr)
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)
lr_r2 = r2_score(y_test, y_pred_lr)



ValueError: X has 30 features, but StandardScaler is expecting 104 features as input.

# 2. Ejercicio 2

In [12]:
dfNeur = pd.read_csv("test.csv")

In [None]:
dfNeur