In [83]:
# learned about pipelines, one hot encoding, standardization
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
# Load the raw housing table from the local datasets folder.
df = pd.read_csv('./datasets/housing.csv', sep=',')

# Bucket median income into categories used only for stratification:
# divide by 1.5, round up, then cap every value that is not below 5.0 at 5.0.
# The capped Series is assigned back to the column instead of calling
# `Series.where(..., inplace=True)` on `df['income_cat']`; that chained in-place
# form does not reliably write through to `df` (and never does under pandas
# Copy-on-Write).
income_cat = np.ceil(df['median_income'] / 1.5)
df['income_cat'] = income_cat.where(income_cat < 5.0, 5.0)

# 80/20 split that keeps the income-category proportions in both parts.
strat_train, strat_test = train_test_split(
    df, test_size=0.2, random_state=5, stratify=df['income_cat']
)

# income_cat was only needed for stratifying. Rebind the names to the dropped
# frames rather than mutating the split results in place, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
strat_train = strat_train.drop(columns=['income_cat'])
strat_test = strat_test.drop(columns=['income_cat'])


In [84]:
# Separate the training set into the target column and the predictor columns.
housing_label = strat_train['median_house_value'].copy()
housing = strat_train.drop(columns=['median_house_value'])

In [85]:
# Preview the first few target values (median_house_value).
housing_label.head()

11397    273400.0
13036    121900.0
6055     310700.0
6754     280000.0
867      318300.0
Name: median_house_value, dtype: float64

In [86]:
# data cleaning and preprocessing

In [87]:
# Fill missing numeric values with each column's median.
# SimpleImputer handles every numeric column at once and replaces the manual
# single-column approach:
#   housing['total_bedrooms'].fillna(housing['total_bedrooms'].median(), inplace=True)
# ocean_proximity holds text labels, so it is left out of the numeric frame.
housing_num = housing.drop(columns=['ocean_proximity'])
imputer = SimpleImputer(strategy='median')
housing_num_tr = imputer.fit_transform(housing_num)

In [88]:
# Wrap the imputed values back into a DataFrame with the numeric column names.
# Only the column labels are passed, so the original row index is not carried over.
housing_num_tr = pd.DataFrame(
    data=housing_num_tr,
    columns=housing_num.columns,
)

In [91]:
# Keep the categorical column as a one-column DataFrame (2-D) rather than a
# Series, so it can be handed directly to the encoders.
housing_cat = housing.loc[:, ['ocean_proximity']]

housing_cat.head()

Unnamed: 0,ocean_proximity
11397,<1H OCEAN
13036,INLAND
6055,<1H OCEAN
6754,<1H OCEAN
867,NEAR BAY


In [92]:
# Map each ocean_proximity category to a numeric code.
oe = OrdinalEncoder()
oe.fit(housing_cat)
housing_cat_encoded = oe.transform(housing_cat)
# Peek at the codes for the first ten rows.
housing_cat_encoded[:10]

array([[0.],
       [1.],
       [0.],
       [0.],
       [3.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.]])

In [93]:
# Encoder that expands each category into its own 0/1 indicator column.
one_hot = OneHotEncoder()

In [94]:
# One-hot encode ocean_proximity. The encoder is fed the column straight from
# `housing` rather than the current value of `housing_cat`: this line rebinds
# `housing_cat` to the encoded sparse matrix, so reading `housing_cat` as the
# input would pass already-encoded data to the encoder if the cell is re-run.
housing_cat = one_hot.fit_transform(housing[['ocean_proximity']])

In [97]:
# housing_cat holds the encoder's sparse output here; convert it to a dense
# array to inspect the indicator columns.
housing_cat.toarray()

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.]])

In [99]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [100]:
# Numeric preprocessing: median imputation followed by standardization.
# StandardScaler is built with its defaults; the previous positional argument
# `feature` is not defined anywhere in the notebook (NameError on a fresh
# kernel), and the constructor only accepts keyword arguments.
# Note: the pipeline holds the existing `imputer` object, so fitting the
# pipeline refits that same instance.
our_pipeline = Pipeline([
    ('imputer', imputer),
    ('standardization', StandardScaler()),
])

In [101]:
# Same wrap as the one done right after imputation, repeated so the numeric
# frame carries the numeric column names before the pipeline is fitted.
housing_num_tr = pd.DataFrame(
    data=housing_num_tr,
    columns=housing_num.columns,
)

In [102]:
# Preview the imputed numeric features.
housing_num_tr.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-117.97,33.72,24.0,2991.0,500.0,1437.0,453.0,5.4286
1,-121.17,38.68,37.0,1252.0,267.0,686.0,256.0,3.0
2,-117.86,33.99,10.0,17820.0,2812.0,8686.0,2666.0,6.3875
3,-118.1,34.1,52.0,1788.0,313.0,792.0,294.0,3.75
4,-122.07,37.57,8.0,8647.0,1407.0,5019.0,1379.0,6.5615


In [103]:
# Fit the imputer + scaler pipeline on the numeric frame and transform it.
p  = our_pipeline.fit_transform(housing_num_tr)
# Check what container the pipeline returns.
type(p)

numpy.ndarray

In [104]:
# Display the standardized numeric feature array.
p

array([[ 0.79913265, -0.89463125, -0.36660224, ...,  0.01092415,
        -0.11833661,  0.81248212],
       [-0.79794901,  1.42510216,  0.66239208, ..., -0.64252718,
        -0.63470484, -0.45736025],
       [ 0.85403233, -0.76835544, -1.47474996, ...,  6.31833917,
         5.68228719,  1.31386228],
       ...,
       [ 0.85403233, -0.75900167, -0.99982951, ...,  1.40396753,
         0.59986083,  0.6805097 ],
       [ 0.30503551, -0.10423821,  0.02916481, ...,  0.10228552,
         0.42162205, -0.48507237],
       [-1.37190023,  0.89193762,  0.42493185, ...,  0.15971267,
         0.41375857,  0.62885013]])

In [105]:
from sklearn.compose import ColumnTransformer

In [112]:
# Column groups routed to the categorical and numeric branches of the
# combined preprocessing step.
cat_attribs = ['ocean_proximity']
num_attribs = housing_num.columns.tolist()


In [113]:
# Combined preprocessing: numeric columns go through the imputation + scaling
# pipeline, and the categorical column is one-hot encoded.
full_pipeline = ColumnTransformer(
    transformers=[
        ('num', our_pipeline, num_attribs),
        ('cat', OneHotEncoder(), cat_attribs),
    ]
)

In [116]:
# Fit both branches on the training predictors and build the feature matrix.
train = full_pipeline.fit_transform(housing)
# Rows x transformed feature columns.
train.shape

(16512, 13)

In [117]:
# Shape of the untransformed predictors, for comparison with train.shape.
print(housing.shape)

(16512, 9)
