In this part we're going to exploit Pipeline, a very useful class to automate transformations

In [77]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns

df = pd.read_csv('housing.csv')
label = df['median_house_value']
df = df.drop(['median_house_value'],axis=1)

%matplotlib inline

In [78]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY


In the previous section we got to add some numerical features. I know that this is much harder than that method, but we can automate a lot using this

In [52]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3,4,5,6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self,add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        rooms_per_household = X[:,population_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:,bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                       bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [53]:
df_cat = df['ocean_proximity']
df_num = df.drop(['ocean_proximity'],axis=1)

In [54]:
class Selector(BaseEstimator, TransformerMixin):
    def __init__(self,features):
        self.features = features
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        return X[self.features].values

In [55]:
cat_feat = list(df_cat)
num_feat = list(df_num)

['ocean_proximity']

In [56]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


num_pipeline = Pipeline([
        ('selector', Selector(num_feat)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

cat_pipeline = Pipeline([
        ('selector', Selector(cat_feat)),
        ('one_hot', OneHotEncoder()),
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])



In [57]:
df_transformed = full_pipeline.fit_transform(df)

# Training

In [79]:
#Split between training set and test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_transformed,label,test_size=0.33,random_state = 69)

In [81]:
def display_scores(score):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation", scores.std())

Now we're going to compare different models and see which performs the best without much tuning. 

In [84]:
from sklearn.model_selection import cross_val_score

#decision tree
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
scores = cross_val_score(tree_reg, X_train, y_train, scoring="neg_mean_squared_error",
                        cv=10)
display_scores(np.sqrt(-scores))

Scores: [-5.32590528e+09 -4.67295561e+09 -4.63813382e+09 -4.60787419e+09
 -5.18206508e+09 -5.19647035e+09 -4.91119775e+09 -4.88413494e+09
 -5.84548380e+09 -5.20510623e+09]
Mean: -5046932705.741349
Standard deviation 364118873.31106806


In [83]:
#linear regression

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
scores = cross_val_score(lin_reg, X_train, y_train, scoring="neg_mean_squared_error",
                        cv=10)
display_scores(np.sqrt(-scores))

Scores: [-4.43957578e+09 -4.81813025e+09 -5.16413347e+09 -4.31514990e+09
 -4.81150362e+09 -4.44398890e+09 -4.27968530e+09 -4.71693839e+09
 -4.60917347e+09 -4.26618025e+09]
Mean: -4586445933.887878
Standard deviation 277080292.1441019


In [85]:
#random forest regression

from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor()
scores = cross_val_score(rf_reg, X_train, y_train, scoring="neg_mean_squared_error",
                        cv=10)
display_scores(np.sqrt(-scores))



Scores: [-2.87863414e+09 -2.90490382e+09 -2.97708539e+09 -2.53815375e+09
 -2.73040782e+09 -2.75751980e+09 -2.60691759e+09 -2.78624946e+09
 -3.20468389e+09 -2.63690989e+09]
Mean: -2802146554.784707
Standard deviation 187496593.43304265


Random forest looks promising. Lets tweak it

In [88]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3,10,3], 'max_features':[2,4,6,8]},
    {'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [2,3,4]},
]

rf_reg = RandomForestRegressor()
grid_search = GridSearchCV(rf_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(X_train,y_train)
grid_search.best_estimator_



RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
           max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)