###  Ejercicio en clase - Pipeline Experiment

#### Diana Arias

In [9]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import LinearRegression
import random
from sklearn.model_selection import train_test_split

In [5]:
def generar_datos(n=15, seed=0):
    """Build a small synthetic regression dataset.

    Two integer features are drawn uniformly from 1..15 and the target is
    computed from them without noise as ``y = 5 * x1 + 2 * log(x2)``.

    Parameters
    ----------
    n : int, default 15
        Number of rows to generate.
    seed : int or None, default 0
        Value passed to ``np.random.seed``. Note that this reseeds NumPy's
        global random generator, exactly as the original fixed seed did.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``x1``, ``x2`` and ``y`` and ``n`` rows.
    """
    np.random.seed(seed)

    # randint's upper bound is exclusive, so both features lie in 1..15;
    # the lower bound of 1 keeps log(x2) finite.
    x1 = np.random.randint(1, 16, n)
    x2 = np.random.randint(1, 16, n)

    # Target is an exact function of the features (no noise term).
    y = 5 * x1 + 2 * np.log(x2)

    return pd.DataFrame({'x1': x1, 'x2': x2, 'y': y})


df = generar_datos()
df

Unnamed: 0,x1,x2,y
0,13,9,69.394449
1,6,13,35.129899
2,1,11,9.795791
3,4,2,21.386294
4,12,7,63.89182
5,4,8,24.158883
6,8,8,44.158883
7,10,15,55.4161
8,4,9,24.394449
9,6,2,31.386294


In [10]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split repeatable between runs.
train_X, test_X, train_y, test_y = train_test_split(
    df.drop(columns='y'),   # features: every column except the target
    df['y'],                # target
    test_size=0.3,
    random_state=42,
)
test_X

Unnamed: 0,x1,x2
9,6,2
11,5,10
0,13,9
13,7,9
5,4,8


In [11]:
# Baseline check: fit a plain linear regression on the untransformed features
# and report its predictions and RMSE on the held-out rows.

m1 = LinearRegression()
fit1 = m1.fit(X=train_X, y=train_y)
preds = fit1.predict(X=test_X)
print(f"\n{preds}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_true=test_y, y_pred=preds))}\n")


[32.17372276 29.39243007 69.17120644 39.12845232 23.82884613]
RMSE: 0.4230408644107437



In [12]:
# Custom transformer class
class ExperimentalTransformer(BaseEstimator, TransformerMixin):
    """Demo transformer that replaces column ``x2`` with ``2 * log(x2)``.

    Every method prints a marker when it is called so that the order in
    which a pipeline invokes ``__init__``, ``fit`` and ``transform`` can be
    read from the cell output.
    """

    def __init__(self):
        print('\n>>>>>>>init() called.\n')

    def fit(self, X, y=None):
        """Learn nothing; print a trace marker and return ``self``."""
        print('\n>>>>>>>fit() called.\n')
        return self

    def transform(self, X, y=None):
        """Return a new frame whose ``x2`` column holds ``2 * log(x2)``.

        ``X`` must be a DataFrame containing an ``x2`` column. The input is
        left unmodified: ``assign`` builds a new frame, and the column is set
        by name instead of through attribute access.
        """
        print('\n>>>>>>>transform() called.\n')
        return X.assign(x2=2 * np.log(X['x2']))

In [13]:
# No input transformation: a pipeline that contains only the regressor, used to
# validate that it reproduces the results of the plain LinearRegression fit.


print("create pipeline 1")
pipe1 = Pipeline(steps=[
    ('linear_model', LinearRegression()),
])

print("fit pipeline 1")
pipe1.fit(train_X, train_y)

print("predict via pipeline 1")
preds1 = pipe1.predict(test_X)

# Executable version of the "same results as before" expectation: the
# predictions must match `preds` from the plain LinearRegression fit.
np.testing.assert_allclose(preds1, preds)

print(f"\n{preds1}")
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds1))}\n")

create pipeline 1
fit pipeline 1
predict via pipeline 1

[32.17372276 29.39243007 69.17120644 39.12845232 23.82884613]
RMSE: 0.4230408644107437



In [14]:
# With input transformation

print("create pipeline 2")
pipe2 = Pipeline(steps=[
    ('experimental_trans', ExperimentalTransformer()),    # this triggers a call to __init__
    ('linear_model', LinearRegression()),
])

# an alternative, shorter syntax for the above, without naming each step, is:
#pipe2 = make_pipeline(ExperimentalTransformer(), LinearRegression())

print("fit pipeline 2")
pipe2.fit(train_X, train_y)

print("predict via pipeline 2")
preds2 = pipe2.predict(test_X)

# The target is generated as 5 * x1 + 2 * log(x2), and the transformer turns x2
# into 2 * log(x2), so the target is exactly linear in the transformed features.
# The predictions should therefore match test_y up to floating-point error.
np.testing.assert_allclose(preds2, test_y)

print(f"\n{preds2}")
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds2))}\n")


create pipeline 2

>>>>>>>init() called.

fit pipeline 2

>>>>>>>fit() called.


>>>>>>>transform() called.

predict via pipeline 2

>>>>>>>transform() called.


[31.38629436 29.60517019 69.39444915 39.39444915 24.15888308]
RMSE: 1.2409095402129097e-14

