In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import pickle

In [2]:
# Load the housing dataset from datasets/housing.csv (a relative path, so it is
# resolved against the notebook's working directory) into a DataFrame.
df = pd.read_csv('datasets/housing.csv')

In [3]:
# Preview the first rows of the raw frame to check column names and value formats.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Show each column's dtype and non-null count; a non-null count below the total
# row count identifies a column with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
# Count how many rows fall into each ocean_proximity category.
df['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [6]:
# Very few rows are on an island, so they won't make much difference to the
# predictions; remove them without any significant effect on the results.
# .copy() makes the filtered result an independent frame: later cells modify it
# in place, and doing that on a slice of the raw frame can trigger pandas'
# SettingWithCopyWarning.
df = df[df["ocean_proximity"] != 'ISLAND'].copy()

In [7]:
# X is bound to the same DataFrame object as df (no copy is made), so any
# in-place change made through X is also visible through df.
X = df

## Total bedrooms has missing values
### Three ways to Deal with Missing Values
#### 1. Drop the rows containing missing values
#### 2. Drop the whole column that has missing values
#### 3. Impute (estimate) the values with mean, median, mode or some other statistic of the column

In [8]:
#X.dropna(subset=["total_bedrooms"],inplace=True) # option 1

In [9]:
# Row count of the feature frame after the ISLAND rows were removed.
len(X)

20635

In [10]:
# Separate the target from the features. drop() is used without inplace=True so
# that it returns a new frame: X is the same object as df, and an in-place drop
# would also remove the target column from df, making this cell raise a KeyError
# the next time it is run.
y = df['median_house_value']
X = df.drop(['median_house_value'],axis=1)

In [11]:
#X.drop("total_bedrooms", axis=1,inplace=True) # option 2

In [12]:
# Split into train and validation sets: 20% of the rows are held out for
# validation, and the fixed random_state makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2,random_state=100)

In [13]:
# Sanity check: features and targets must have matching lengths in each split.
len(X_train),len(y_train),len(X_valid),len(y_valid)

(16508, 16508, 4127, 4127)

In [14]:
# Carve a test set out of the remaining training rows (10% of them), using the
# same seed; X_train / y_train are rebound to the smaller training portion.
X_train, X_test, y_train, y_test = train_test_split(X_train,y_train,test_size=0.1,random_state=100)

In [15]:
# Sanity check: lengths of the final training split and the test split.
len(X_train),len(y_train),len(X_test),len(y_test)

(14857, 14857, 1651, 1651)

In [16]:
# Category counts in the training split, to confirm every category is represented.
X_train['ocean_proximity'].value_counts()

<1H OCEAN     6567
INLAND        4644
NEAR OCEAN    1967
NEAR BAY      1679
Name: ocean_proximity, dtype: int64

In [17]:
# Category counts in the validation split.
X_valid['ocean_proximity'].value_counts()

<1H OCEAN     1833
INLAND        1367
NEAR OCEAN     493
NEAR BAY       434
Name: ocean_proximity, dtype: int64

In [18]:
# Category counts in the test split.
X_test['ocean_proximity'].value_counts()

<1H OCEAN     736
INLAND        540
NEAR OCEAN    198
NEAR BAY      177
Name: ocean_proximity, dtype: int64

In [19]:
# One-hot encode the training features: the text column ocean_proximity becomes
# one indicator column per category, while the numeric columns are kept as they are.
X_train = pd.get_dummies(X_train)

In [20]:
# One-hot encode the validation split, then align it to the (already encoded)
# training columns. Encoding each split on its own yields different columns
# whenever a category is absent from one split; reindex adds any missing
# indicator column as all zeros, drops columns the training frame does not have,
# and puts the columns in the training order.
X_valid = pd.get_dummies(X_valid).reindex(columns=X_train.columns, fill_value=0)

In [21]:
# One-hot encode the test split, then align it to the (already encoded) training
# columns. Encoding each split on its own yields different columns whenever a
# category is absent from one split; reindex adds any missing indicator column as
# all zeros, drops columns the training frame does not have, and puts the columns
# in the training order.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [22]:
# Inspect the encoded training frame: ocean_proximity is now a set of indicator columns.
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
14813,-117.17,32.69,45.0,3168.0,598.0,1341.0,562.0,4.5189,0,0,0,1
2757,-115.5,32.67,35.0,2159.0,492.0,1694.0,475.0,2.1776,0,1,0,0
3016,-118.93,34.82,8.0,508.0,111.0,229.0,84.0,4.0332,0,1,0,0
12703,-121.41,38.59,17.0,12355.0,3630.0,5692.0,3073.0,2.5245,0,1,0,0
5499,-118.42,34.0,14.0,3771.0,1180.0,2355.0,978.0,3.1603,1,0,0,0


In [23]:
#option 3
#one way to do that is like this
#median = df["total_bedrooms"].median()
#X_train["total_bedrooms"].fillna(median)

#However, using an Imputer is more flexible
def impute_data(X_train,X_valid,X_test,strategy='median'):
    """Fill missing values in each split using statistics learned from X_train only.

    The imputer is fitted on the training frame and then applied unchanged to
    the validation (and optional test) frame, so no information from the
    held-out data influences the fill values.

    Parameters
    ----------
    X_train, X_valid : DataFrame
        All-numeric feature frames; X_train is used to fit the imputer.
    X_test : DataFrame or None
        Optional third split. Pass None when there is no test frame.
    strategy : str
        Statistic passed through to the imputer (default 'median').

    Returns
    -------
    tuple
        (X_train_imputed, X_valid_imputed, X_test_imputed). The third element
        is None when X_test is None. Each returned frame keeps the column names
        of its input but is built from the imputer's array output, so it has a
        fresh 0..n-1 index rather than the input's row labels.
    """
    # NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn
    # 0.22 in favour of sklearn.impute.SimpleImputer; confirm the version this
    # notebook is pinned to before upgrading.
    my_imputer = Imputer(strategy=strategy)

    X_train_imputed = pd.DataFrame(my_imputer.fit_transform(X_train))
    X_train_imputed.columns = X_train.columns

    X_valid_imputed = pd.DataFrame(my_imputer.transform(X_valid))
    X_valid_imputed.columns = X_valid.columns

    # Bind the name up front so the return below is valid when no test frame
    # is supplied (it was previously unbound on that path).
    X_test_imputed = None
    if X_test is not None:
        X_test_imputed = pd.DataFrame(my_imputer.transform(X_test))
        X_test_imputed.columns = X_test.columns

    return X_train_imputed,X_valid_imputed,X_test_imputed


In [24]:
# Impute all three splits with the default (median) strategy; the three names
# are rebound to the imputed frames returned by impute_data.
X_train, X_valid, X_test = impute_data(X_train,X_valid,X_test)



In [25]:
# Confirm that no column of the imputed training frame still has missing values.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14857 entries, 0 to 14856
Data columns (total 12 columns):
longitude                     14857 non-null float64
latitude                      14857 non-null float64
housing_median_age            14857 non-null float64
total_rooms                   14857 non-null float64
total_bedrooms                14857 non-null float64
population                    14857 non-null float64
households                    14857 non-null float64
median_income                 14857 non-null float64
ocean_proximity_<1H OCEAN     14857 non-null float64
ocean_proximity_INLAND        14857 non-null float64
ocean_proximity_NEAR BAY      14857 non-null float64
ocean_proximity_NEAR OCEAN    14857 non-null float64
dtypes: float64(12)
memory usage: 1.4 MB


In [26]:
# Confirm that no column of the imputed validation frame still has missing values.
X_valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4127 entries, 0 to 4126
Data columns (total 12 columns):
longitude                     4127 non-null float64
latitude                      4127 non-null float64
housing_median_age            4127 non-null float64
total_rooms                   4127 non-null float64
total_bedrooms                4127 non-null float64
population                    4127 non-null float64
households                    4127 non-null float64
median_income                 4127 non-null float64
ocean_proximity_<1H OCEAN     4127 non-null float64
ocean_proximity_INLAND        4127 non-null float64
ocean_proximity_NEAR BAY      4127 non-null float64
ocean_proximity_NEAR OCEAN    4127 non-null float64
dtypes: float64(12)
memory usage: 387.0 KB


In [27]:
# Confirm that no column of the imputed test frame still has missing values.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1651 entries, 0 to 1650
Data columns (total 12 columns):
longitude                     1651 non-null float64
latitude                      1651 non-null float64
housing_median_age            1651 non-null float64
total_rooms                   1651 non-null float64
total_bedrooms                1651 non-null float64
population                    1651 non-null float64
households                    1651 non-null float64
median_income                 1651 non-null float64
ocean_proximity_<1H OCEAN     1651 non-null float64
ocean_proximity_INLAND        1651 non-null float64
ocean_proximity_NEAR BAY      1651 non-null float64
ocean_proximity_NEAR OCEAN    1651 non-null float64
dtypes: float64(12)
memory usage: 154.9 KB


In [29]:
# Count missing target values in the training split (isnull and isna are two
# names for the same check, so the two counts should agree).
y_train.isnull().sum(),y_train.isna().sum()

(0, 0)

In [30]:
# Count missing target values in the validation split. Both counts are taken on
# y_valid so the check actually covers this split.
y_valid.isnull().sum(),y_valid.isna().sum()

(0, 0)

In [31]:
# Count missing target values in the test split. Both counts are taken on
# y_test so the check actually covers this split.
y_test.isnull().sum(),y_test.isna().sum()

(0, 0)

In [32]:
def regression_score(model,X_train, X_valid, y_train, y_valid,metric=mean_absolute_error):
    """Train `model` on the training split and return its validation score.

    The estimator is fitted via ``model.fit(X_train, y_train)``, so the object
    passed in by the caller is trained as a side effect. Its predictions for
    ``X_valid`` are then compared with ``y_valid`` using ``metric``.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : training features and targets.
    X_valid, y_valid : validation features and targets.
    metric : callable invoked as ``metric(y_true, y_pred)``; defaults to
        mean_absolute_error.

    Returns
    -------
    Whatever ``metric`` returns for the validation predictions.
    """
    model.fit(X_train, y_train)
    validation_predictions = model.predict(X_valid)
    score = metric(y_valid, validation_predictions)
    return score

In [33]:
# Baseline: a single decision tree with default hyperparameters, scored with the
# default metric (mean absolute error). random_state is fixed so the reported
# validation score is reproducible from run to run.
dt_reg = DecisionTreeRegressor(random_state=100)
regression_score(dt_reg,X_train,X_valid,y_train,y_valid)

44070.50739035619

In [34]:
# Baseline: linear regression, scored with the default metric (mean absolute
# error) on the validation split.
lin_reg = LinearRegression(n_jobs=-1)
regression_score(lin_reg,X_train,X_valid,y_train,y_valid)

49425.44545673858

In [35]:
# Baseline: k-nearest-neighbours regression with k=5, scored with the default
# metric (mean absolute error) on the validation split.
# NOTE(review): the features are passed in unscaled (StandardScaler is imported
# but never applied); a distance-based model is likely to be dominated by the
# large-valued columns, so consider standardizing before judging this score.
knn_reg = KNeighborsRegressor(n_neighbors=5)
regression_score(knn_reg,X_train,X_valid,y_train,y_valid)

77770.88286891204

In [36]:
# Baseline: a random forest of 100 trees, scored with the default metric (mean
# absolute error). random_state is fixed so the bootstrap samples and feature
# choices, and therefore the reported validation score, are reproducible.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=100)
regression_score(rf_reg,X_train,X_valid,y_train,y_valid)

32126.958992003878

In [None]:
DecisionTreeReg