In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


import seaborn as sns

set_config(display='diagram')

In [2]:
df = sns.load_dataset('mpg')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [3]:
df.drop(labels=['name'], axis=1, inplace=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [5]:
imputer = SimpleImputer(missing_values=np.nan)
df.horsepower = imputer.fit_transform(np.array(df.horsepower).reshape(-1,1))

In [6]:
x = df.iloc[:,1:]
y = df.iloc[:,0]

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y,random_state=2021)

In [8]:
set_config(display='text')

In [9]:
pipeline1 = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('R_Forest', RandomForestRegressor())
    ])

In [10]:
pipeline1

Pipeline(steps=[('scaler', StandardScaler()),
                ('R_Forest', RandomForestRegressor())])

In [11]:
set_config(display='diagram')

In [12]:
x_train_little = x_train[['cylinders', 'displacement']]

In [13]:
pipeline1.fit(x_train_little, y_train)

In [14]:
y_pred_pipe1 = pipeline1.predict(x_train_little)

In [15]:
print(f'Train MAE:\n\t{mean_absolute_error(y_train, y_pred_pipe1)}')

Train MAE:
	2.375787090391159


In [16]:
df = sns.load_dataset('mpg')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [17]:
df.drop(labels=['name'], axis=1, inplace=True)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [19]:
x = df.iloc[:,1:]
y = df.iloc[:,0]

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x, y,random_state=2021)

In [21]:
numerical_cols = x_train.select_dtypes('number').columns.tolist()
categorical_cols = x_train.select_dtypes('object').columns.tolist()

In [22]:
categorical_cols

['origin']

In [23]:
num_pipe = Pipeline([
    ('Imputer', SimpleImputer(missing_values=np.nan)),
    ('Scaler' , StandardScaler())
])

In [24]:
cat_pipe = Pipeline([
    ('OHE', OneHotEncoder(handle_unknown='ignore'))
])

In [25]:
transformer = ColumnTransformer(transformers=[
    ('Numerical', num_pipe, numerical_cols),
    ('Categorical', cat_pipe, categorical_cols)
])

In [28]:
pipeline2 = Pipeline(steps=[
    ('Column Transformer', transformer),
    ('Regressor', RandomForestRegressor(random_state=2021))
    ])
pipeline2

In [314]:
print(pipeline2)

Pipeline(steps=[('Column Transformer',
                 ColumnTransformer(transformers=[('Numerical',
                                                  Pipeline(steps=[('Imputer',
                                                                   SimpleImputer()),
                                                                  ('Scaler',
                                                                   StandardScaler())]),
                                                  ['cylinders', 'displacement',
                                                   'horsepower', 'weight',
                                                   'acceleration',
                                                   'model_year']),
                                                 ('Categorical',
                                                  Pipeline(steps=[('OHE',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                

In [315]:
pipeline2.fit(x_train, y_train)

[Pipeline]  (step 1 of 2) Processing Column Transformer, total=   0.0s
[Pipeline] ......... (step 2 of 2) Processing Regressor, total=   0.1s


In [316]:
y_pred_pipeline2 = pipeline2.predict(x_train)

In [317]:
print(f'Train MAE:\n\t{mean_absolute_error(y_train, y_pred_pipeline2)}')

Train MAE:
	0.6937416107382557


In [319]:
transformer.named_transformers_

{'Numerical': Pipeline(steps=[('Imputer', SimpleImputer()), ('Scaler', StandardScaler())]),
 'Categorical': Pipeline(steps=[('OHE', OneHotEncoder(handle_unknown='ignore'))])}