<a href="https://colab.research.google.com/github/alishermutalov/ML-learning/blob/ml/Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [4]:
from sklearn.model_selection import train_test_split

# Load the housing dataset from the handson-ml2 GitHub repository
# (`?raw=true` makes GitHub serve the CSV itself rather than the HTML page).
df = pd.read_csv('https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true')

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across kernel restarts.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Separate the predictors from the target. The labels are copied so later
# changes to them cannot write back into train_set.
housing = train_set.drop(columns='median_house_value')
housing_labels = train_set['median_house_value'].copy()

# Predictors without 'ocean_proximity', which is treated as the categorical
# attribute elsewhere in this notebook.
housing_num = housing.drop(columns='ocean_proximity')

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns computed from fixed column positions of the input.

    The transformer always appends ``column 3 / column 6`` (named
    ``rooms_per_household`` in the code) and, when ``add_bedrooms_per_room``
    is true, also appends ``column 4 / column 3`` (``bedrooms_per_room``).

    NOTE(review): positions 3, 4 and 6 are presumably total rooms, total
    bedrooms and households of the housing data -- verify against the
    column order of whatever is fed into this transformer.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Whether to append the ``bedrooms_per_room`` column.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer learns no state."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio column(s) appended on the right.

        Parameters
        ----------
        X : array-like with at least 7 columns
            Input data; it is converted with ``np.asarray`` before indexing.

        Returns
        -------
        numpy.ndarray
            ``X`` plus one extra column, or two when
            ``add_bedrooms_per_room`` is true.
        """
        # Positional slicing (X[:, i]) only works on ndarrays; coercing here
        # lets DataFrames and nested lists through as well. An ndarray input
        # is passed through unchanged.
        X = np.asarray(X)

        rooms_ix, bedrooms_ix, households_ix = 3, 4, 6
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]

        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household]


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Numeric preprocessing chain; the steps run in the order listed:
#   imputer       -> SimpleImputer filling missing values with the median
#   attribs_adder -> CombinedAttributesAdder with its default settings
#   std_scaler    -> StandardScaler with its default settings
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Fit the chain on the numeric columns; as the last expression of the cell
# the transformed array is displayed.
num_pipeline.fit_transform(housing_num)

array([[ 1.27258656, -1.3728112 ,  0.34849025, ..., -0.326196  ,
        -0.17491646, -0.2117846 ],
       [ 0.70916212, -0.87669601,  1.61811813, ..., -0.03584338,
        -0.40283542,  0.34218528],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.14470145,
         0.08821601, -0.66165785],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ..., -0.49697313,
        -0.60675918,  0.99951387],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.96545045,
         0.40217517, -0.79086209],
       [-1.41489815,  0.99543676,  1.85617335, ..., -0.68544764,
        -0.85144571,  1.69520292]])

In [7]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the ColumnTransformer.
cat_attributes = ['ocean_proximity']
num_attributes = housing_num.columns.tolist()

# 'num' sends the numeric columns through num_pipeline; 'cat' one-hot
# encodes the categorical column.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attributes),
    ('cat', OneHotEncoder(), cat_attributes),
])

In [9]:
# Fit the combined ColumnTransformer on the training features and keep the
# transformed result for the cells that follow.
housing_prepared = full_pipeline.fit_transform(housing)
# Bare last expression so the notebook displays the transformed array.
housing_prepared

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

In [10]:
# Show rows 0-4 with every column, so each row is displayed in full rather
# than in the elided "..." form of the whole-array display.
housing_prepared[0:5, :]

array([[ 1.27258656, -1.3728112 ,  0.34849025,  0.22256942,  0.21122752,
         0.76827628,  0.32290591, -0.326196  , -0.17491646, -0.2117846 ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813,  0.34029326,  0.59309419,
        -0.09890135,  0.6720272 , -0.03584338, -0.40283542,  0.34218528,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, -0.34259695, -0.49522582,
        -0.44981806, -0.43046109,  0.14470145,  0.08821601, -0.66165785,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ],
       [ 1.23269811, -1.38217186,  0.58654547, -0.56148971, -0.40930582,
        -0.00743434, -0.38058662, -1.01786438, -0.60001532,  0.78303162,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ],
       [-0.10855122,  0.5320839 ,  1.14200767, -0.11956547, -0.25655915,
        -0.48587717, -0.31496232, -0.17148831, 