In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Problem 1: Predicting House Prices

In [21]:
# load the data
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/kc_house_data.csv'
sales = pd.read_csv(url, index_col='id')
sales.head()

Unnamed: 0_level_0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


This dataset contains house sale prices for King County, which includes Seattle. 
It includes homes sold between May 2014 and May 2015.

There are 21 columns:

| Column | Description |
| :- | -: |
| id | Unique ID for each home sold |
| date | Date of the home sale |
| price | Price of each home sold |
| bedrooms | Number of bedrooms |
| bathrooms | Number of bathrooms, where .5 accounts for a room with a toilet but no shower |
| sqft_living | Square footage of the apartments interior living space |
| sqft_lot | Square footage of the land space |
| floors | Number of floors |
| waterfront | A dummy variable for whether the apartment was overlooking the waterfront or not |
| view | An index from 0 to 4 of how good the view of the property was |
| condition | An index from 1 to 5 on the condition of the apartment |
| grade | An index from 1 to 13, where 1-3 falls short of building construction and design, 7 has an average level of construction and design, and 11-13 have a high quality level of construction and design |
| sqft_above | The square footage of the interior housing space hat is above ground level |
| sqft_basement | The square footage of the interior housing space that is below ground level |
| yr_built | The year the house was initially built |
| yr_renovated | The year of the house's last renovation |
| zipcode | What zipcode area the house is in |
| lat | Latitude |
| long | Longitude |
| sqft_living15 | The square footage of interior housing living space for the nearest 15 neighbors |
| sqft_lot15 | The square footage of the land lots of the nearest 15 neighbors |

In [22]:
sales.dtypes

date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

In [23]:
sales_numeric_cols = sales.select_dtypes(include=[np.number]).columns
# drop lat and long - should be irrelevant
sales_numeric_cols = sales_numeric_cols.drop(['lat','long', 'price'])
sales = sales.drop(['lat','long'],axis=1)

Your **goal** is to **train** a regression pipeline that predicts house prices, **tune** the pipeline hyperparameters, and **test** its performance.

In [24]:
# import model
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
# pipeline
from sklearn.pipeline import Pipeline
# preprocessing
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
# model selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# metrics
from sklearn.metrics import accuracy_score, mean_squared_error

In [25]:
# split data into training and testing sets
X = sales.drop('price', axis=1)
y = sales['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [26]:
# create pipeline
numeric_features = sales_numeric_cols
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),])

pipe_model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('poly', PolynomialFeatures(degree=1)),
                        ('reg', RandomForestRegressor(n_estimators=100, random_state=42))])

# fit the model
pipe_model.fit(X_train, y_train)

# predict
y_pred = pipe_model.predict(X_test)

# evaluate the model
print('RMSE', mean_squared_error(y_test, y_pred, squared=False))

# accuracy
print('Accuracy:', pipe_model.score(X_test, y_test))

RMSE 177890.30711952999
Accuracy: 0.7884520092408993


In [31]:
numeric_features.shape

(16,)

In [28]:
sales_numeric_colss = sales.select_dtypes(include=[np.number]).columns

In [43]:
# feature importance
feature_importances = pipe_model.named_steps['reg'].feature_importances_
# remove price from the list
feature_importances_df = pd.DataFrame({'feature':sales_numeric_colss,'importance':feature_importances})
feature_importances_df.sort_values(by='importance', ascending=False)

ValueError: All arrays must be of the same length

In [None]:
top11_features = feature_importances_df.sort_values(by='importance', ascending=False).head(11)['feature'].values

In [42]:
# retrain the model with the top 11 features
X = sales[top11_features]
y = sales['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# create pipeline
numeric_features = top11_features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),])

pipe_model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('poly', PolynomialFeatures(degree=1)),
                        ('reg', RandomForestRegressor(n_estimators=100, random_state=42))])

# fit the model
pipe_model.fit(X_train, y_train)

# predict
y_pred = pipe_model.predict(X_test)

# evaluate the model
print('RMSE', mean_squared_error(y_test, y_pred, squared=False))

# accuracy
print('Accuracy:', pipe_model.score(X_test, y_test))

# feature importance
feature_importances = pipe_model.named_steps['reg'].feature_importances_

feature_importances_df = pd.DataFrame({'feature':top11_features,'importance':feature_importances[1:12]})
feature_importances_df.sort_values(by='importance', ascending=False)

RMSE 177031.94608263616
Accuracy: 0.7904886177816295


Unnamed: 0,feature,importance
0,grade,0.372908
1,sqft_living,0.25274
2,yr_built,0.092567
3,zipcode,0.086627
4,sqft_living15,0.05208
5,sqft_lot15,0.028963
6,sqft_above,0.027333
7,sqft_lot,0.025126
9,bathrooms,0.021437
8,waterfront,0.020805


In [30]:
# # define parameter grid
# param_grid = {
#     'poly__degree': [1, 2],
#     'reg__alpha': [0.001, 0.01, 0.1, 1, 10]
# }

# # create grid search
# grid = GridSearchCV(pipe_model, param_grid, cv=5, n_jobs=-1, verbose=1, scoring='neg_mean_squared_error') # scoring='neg_mean_absolute_error', we want to minimize the MSE, so negative MAE

# # fit grid search
# grid.fit(X_train, y_train)

# # best parameters
# print(grid.best_params_)

# # best model
# best_model = grid.best_estimator_

**Plot** the observed prices against the predicted prices.