# penguins: body_mass_g

In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split #particionamiento de datos
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
df = sns.load_dataset('penguins')
df.dropna(inplace=True)
df.head()

In [None]:
sns.pairplot(df)

In [4]:
#Separar X e y
X= df[['flipper_length_mm', 'bill_depth_mm', 'bill_length_mm']]
y= df['body_mass_g']
#Dataframe de resultados
df_resultados = pd.DataFrame(columns=['Modelo', 'R2', 'MAE', 'RMSE', 'MAPE'])


In [None]:
#Regresion lineal multiple
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

y_pred = model.predict(X)
r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
rmse = root_mean_squared_error(y, y_pred)
mape = mean_absolute_percentage_error(y, y_pred) #(multiplicar * 100) error promedio de 7,7 % respecto del valor real de la masa corporal
df_resultados.loc[len(df_resultados)] = ['RLM 3col', r2, mae, rmse, mape]
df_resultados

In [None]:
df = pd.get_dummies(df)
df.head()

In [None]:
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr().round(2), annot=True)

In [None]:
#correlaciones de cada columna con body_mass_g
plt.figure(figsize=(12, 5))
sns.barplot(df.corr()['body_mass_g'].drop('body_mass_g').sort_values())
plt.xticks(rotation=45)

In [None]:
X= df.drop('body_mass_g', axis=1)
y= df['body_mass_g']

#Regresion lineal multiple
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

y_pred = model.predict(X)
r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
rmse = root_mean_squared_error(y, y_pred)
mape = mean_absolute_percentage_error(y, y_pred) # error promedio de 5,5 % respecto del valor real de la masa corporal
df_resultados.loc[len(df_resultados)] = ['RLM 11col', r2, mae, rmse, mape]
df_resultados

In [None]:
#quitar columna sexo para con first_drop para evitar la correlacion -1 o multicolinealidad 
# observamos que da los mismos resultados que un get_dummies normales, lo que sugiere 
# que es mejor hacerlo con drop_first porque da menos columnas y el modelo es mas liviano 
# para entrenar
df = sns.load_dataset('penguins')
df.dropna(inplace=True)
df = pd.get_dummies(df, drop_first=True)
X= df.drop('body_mass_g', axis=1)
y= df['body_mass_g']

#Regresion lineal multiple
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

y_pred = model.predict(X)
r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
rmse = root_mean_squared_error(y, y_pred)
mape = mean_absolute_percentage_error(y, y_pred) # error promedio de 5,5 % respecto del valor real de la masa corporal
df_resultados.loc[len(df_resultados)] = ['RLM 8col', r2, mae, rmse, mape]
df_resultados

KNN es un algoritmo computacionalmente costoso, ya que es lazy o perezoso, realiza los calculos en el predict, no existe un entrenamiento pesado como en otros metodos, el modelo simplemente almacenas los datos en el entrenamiento, y para hacer una prediccion (regresion o clasificacion) busca y calcula los K vecinos mas cercanos

In [None]:
# KNN - K Nearest Neighbors
df = sns.load_dataset('penguins')
df.dropna(inplace=True)
df = pd.get_dummies(df, drop_first=True)

X= df.drop('body_mass_g', axis=1)
y= df['body_mass_g']

#Regresion lineal multiple
model = KNeighborsRegressor()
model.fit(X, y)
y_pred = model.predict(X)

y_pred = model.predict(X)
r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
rmse = root_mean_squared_error(y, y_pred)
mape = mean_absolute_percentage_error(y, y_pred) # error promedio de 5,5 % respecto del valor real de la masa corporal
df_resultados.loc[len(df_resultados)] = ['Knn K = 5', r2, mae, rmse, mape]
df_resultados

In [None]:
for k in range(1, 2):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X, y)
    y_pred = model.predict(X)

    y_pred = model.predict(X)
    r2 = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    rmse = root_mean_squared_error(y, y_pred)
    mape = mean_absolute_percentage_error(y, y_pred) 
    
    df_resultados.loc[len(df_resultados)] = [f'Knn K = {k}', r2, mae, rmse, mape] 

df_resultados.sort_values('MAPE', inplace=True)
df_resultados

In [13]:
#escalado y volver a probar KNN
#KNN es sensible al escalado

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

for k in range(2, 3):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_scaled, y)

    y_pred = model.predict(X_scaled)

    r2 = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    rmse = root_mean_squared_error(y, y_pred)
    mape = mean_absolute_percentage_error(y, y_pred) 
    
    df_resultados.loc[len(df_resultados)] = [f'Knn scaled K = {k}', r2, mae, rmse, mape] 



In [None]:
df_resultados.sort_values('MAE', ascending=True)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

r2= r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

df_resultados.loc[len(df_resultados)] = ['RLM all test', r2, mae, rmse, mape]
df_resultados

In [None]:
df_resultados.sort_values('MAPE', ascending=True)


In [None]:
model = KNeighborsRegressor(n_neighbors=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

r2= r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

df_resultados.loc[len(df_resultados)] = ['RLM all test', r2, mae, rmse, mape]
df_resultados

In [None]:
# Particionamiento con escalado para KNN MinMAxScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

scaler = MinMaxScaler()
#introduce data leakage o fuga de datos
#scaler.fit_transform(X)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

for k in range(2, 10):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train_scaled, y_train)
    
    y_pred = model.predict(X_test_scaled)

    r2= r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    df_resultados.loc[len(df_resultados)] = [f'RLM KNN scaled k = {k} (test)', r2, mae, rmse, mape]
df_resultados    

In [None]:
df_resultados.sort_values('MAPE', ascending=False)[df_resultados['Modelo'].str.contains('(test)')]

In [None]:
df_resultados.sort_values('R2', ascending=False)


In [25]:
#STANDARDSCALER en regresion lineal multiple
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

#aplicar estandarizacion
scaler= StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred) # error del 5,5 % con respecto de valor real

df_resultados.loc[len(df_resultados)] = ['RLM 8 col scaled (test)', r2, mae, rmse, mape] # Está sería la mejor opción con 8 columnas nos da igual resultado que con 11 columnas


In [26]:
df_resultados.sort_values('R2', ascending=False)

Unnamed: 0,Modelo,R2,MAE,RMSE,MAPE
4,Knn K = 1,1.0,0.0,0.0,0.0
5,Knn scaled K = 2,0.938673,157.057057,199.106488,0.038662
16,RLM 8 col scaled (test),0.896169,196.208892,255.749079,0.048622
3,Knn K = 5,0.876481,220.42042,282.569718,0.054214
1,RLM 11col,0.875223,226.413652,284.005204,0.055982
2,RLM 8col,0.875223,226.413652,284.005204,0.055982
6,RLM all test,0.87328,208.168496,271.558568,0.052835
15,RLM KNN scaled k = 9 (test),0.867663,211.888889,277.512095,0.054886
9,RLM KNN scaled k = 3 (test),0.863136,216.666667,282.218395,0.055225
12,RLM KNN scaled k = 6 (test),0.857951,221.75,287.515096,0.056856


In [27]:
df_resultados.sort_values('R2', ascending=False)[df_resultados['Modelo'].str.contains('(test)')]

  df_resultados.sort_values('R2', ascending=False)[df_resultados['Modelo'].str.contains('(test)')]
  df_resultados.sort_values('R2', ascending=False)[df_resultados['Modelo'].str.contains('(test)')]


Unnamed: 0,Modelo,R2,MAE,RMSE,MAPE
16,RLM 8 col scaled (test),0.896169,196.208892,255.749079,0.048622
6,RLM all test,0.87328,208.168496,271.558568,0.052835
15,RLM KNN scaled k = 9 (test),0.867663,211.888889,277.512095,0.054886
9,RLM KNN scaled k = 3 (test),0.863136,216.666667,282.218395,0.055225
12,RLM KNN scaled k = 6 (test),0.857951,221.75,287.515096,0.056856
13,RLM KNN scaled k = 7 (test),0.856686,225.214286,288.791907,0.058042
11,RLM KNN scaled k = 5 (test),0.856384,225.7,289.096005,0.057532
14,RLM KNN scaled k = 8 (test),0.856043,225.0,289.439788,0.058026
10,RLM KNN scaled k = 4 (test),0.853799,226.75,291.686904,0.057737
8,RLM KNN scaled k = 2 (test),0.834784,250.25,310.075596,0.063747


CART Arbol de decision para regresion
* Facil interpretabilidad, scikit nos muestra el arbol resultante en textto o imagen
* No necesita el escalado
* no le afectan tanto los outliers, deberia afectarle menos que a un knn
* hace automaticamente los cortes umbrales para decidir los nodos
* Inportante: ajustar la profindidad para evitar el overfitting o sobreajuste
* Por sí solo puede ser débil, suele usarse en combinación de muchos arboles coreando asi un Random Forest
* Sencible a cambio en los umbrales, si hay pocos datos


In [None]:
#sin particionamiento
from sklearn.tree import DecisionTreeRegressor
# 2 a 10 niveles de profundidad

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

model = DecisionTreeRegressor()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred) 

df_resultados.loc[len(df_resultados)] = ['Tree (test)', r2, mae, rmse, mape] 
