In [None]:
%matplotlib inline
import pickle

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22.
# Fall back to its replacement under the same name so later cells keep working.
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    from sklearn.impute import SimpleImputer as Imputer

In [None]:
#Read the file housing.csv from folder datasets
df = pd.read_csv('datasets/housing.csv')

In [None]:
# display the df head
df.head()

In [None]:
# display df info
df.info()

In [None]:
df['ocean_proximity'].value_counts()

In [None]:
# Very few houses are on an island, so they won't make much difference in predictions.
# We may well remove these rows without any significant effect on results.
# .copy() makes the filtered frame an independent object, so the in-place edits
# applied to it in later cells do not raise SettingWithCopyWarning.
df = df[df["ocean_proximity"] != 'ISLAND'].copy()

In [None]:
# X is bound to the same DataFrame object as df (no copy is made here),
# so any in-place operation on X also changes df.
X = df

## Total bedrooms has missing values
### Three ways to Deal with Missing Values
#### 1. Drop the rows containing missing values
#### 2. Drop the whole column that has missing values
#### 3. Impute (estimate) the values with mean, median, mode or some other statistic of the column

In [None]:
#X.dropna(subset=["total_bedrooms"],inplace=True) # option 1

In [None]:
# Separate the target from the features.
# The feature frame is built as a new object instead of dropping in place:
# on a fresh run X is the same object as df, so an in-place drop would also
# strip the target column from df and make this cell fail with a KeyError
# when it is run a second time.
y = df['median_house_value']
X = df.drop(['median_house_value'],axis=1)

In [None]:
#X.drop("total_bedrooms", axis=1,inplace=True) # option 2

In [None]:
# split into train and valid sets
X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2,random_state=100)

In [None]:
len(X_train),len(y_train),len(X_valid),len(y_valid)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_train,y_train,test_size=0.1,random_state=100)

In [None]:
len(X_train),len(y_train),len(X_test),len(y_test)

In [None]:
X_train['ocean_proximity'].value_counts()

In [None]:
X_valid['ocean_proximity'].value_counts()

In [None]:
X_test['ocean_proximity'].value_counts()

In [None]:
X_train = pd.get_dummies(X_train)

In [None]:
X_valid = pd.get_dummies(X_valid)

In [None]:
X_test = pd.get_dummies(X_test)

In [None]:
X_train.head()

In [None]:
#option 3
#one way to do that is like this
#median = df["total_bedrooms"].median()
#X_train["total_bedrooms"].fillna(median)

#However, using an Imputer is more flexible
def impute_data(X_train,X_valid,X_test,strategy='median'):
    """Fill missing values in the splits using statistics learned from X_train.

    The imputer is fitted on ``X_train`` only and then applied unchanged to
    ``X_valid`` and ``X_test``.

    Parameters
    ----------
    X_train, X_valid : DataFrame
        Training and validation features.
    X_test : DataFrame or None
        Test features; pass ``None`` when there is no test split.
    strategy : str
        Imputation strategy forwarded to the imputer (default ``'median'``).

    Returns
    -------
    tuple
        ``(X_train_imputed, X_valid_imputed, X_test_imputed)``. Each frame keeps
        the column names and the row index of its input; the third element is
        ``None`` when ``X_test`` is ``None``.
    """
    my_imputer = Imputer(strategy=strategy)

    def _as_frame(values, like):
        # The imputer returns a bare array; restore the labels of the input frame
        # so the result still lines up with the matching target series by index.
        return pd.DataFrame(values, columns=like.columns, index=like.index)

    X_train_imputed = _as_frame(my_imputer.fit_transform(X_train), X_train)
    X_valid_imputed = _as_frame(my_imputer.transform(X_valid), X_valid)

    # Bind the name up front so the return below is valid without a test split.
    X_test_imputed = None
    if X_test is not None:
        X_test_imputed = _as_frame(my_imputer.transform(X_test), X_test)

    return X_train_imputed,X_valid_imputed,X_test_imputed


In [None]:
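# Option 3: enable this call (or one of options 1/2 above) so that the missing
# total_bedrooms values are handled before the models below are fitted.
#X_train, X_valid, X_test = impute_data(X_train,X_valid,X_test)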

In [None]:
# Summary of the training split; the non-null counts show whether any column still has gaps
X_train.info()

In [None]:
# Same summary for the validation split
X_valid.info()

In [None]:
# Same summary for the test split
X_test.info()

In [None]:
y_train.isnull().sum(),y_train.isna().sum()

In [None]:
y_valid.isnull().sum(),y_train.isna().sum()

In [None]:
y_test.isnull().sum(),y_train.isna().sum()

In [None]:
def regression_score(model,X_train, X_valid, y_train, y_valid,metric=mean_absolute_error):
    """Train `model` on the training split and score it on the validation split.

    The model is fitted in place on ``(X_train, y_train)``. Its predictions for
    ``X_valid`` are then handed to `metric` as ``metric(y_valid, predictions)``,
    and whatever `metric` returns is returned to the caller.
    """
    model.fit(X_train, y_train)
    return metric(y_valid, model.predict(X_valid))

In [None]:
dt_reg = DecisionTreeRegressor()
regression_score(dt_reg,X_train,X_valid,y_train,y_valid)

In [None]:
lin_reg = LinearRegression(n_jobs=-1)
regression_score(lin_reg,X_train,X_valid,y_train,y_valid)

In [None]:
knn_reg = KNeighborsRegressor(n_neighbors=5)
regression_score(knn_reg,X_train,X_valid,y_train,y_valid)

In [None]:
rf_reg = RandomForestRegressor()
regression_score(rf_reg,X_train,X_valid,y_train,y_valid)