In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Problem 1: Predicting House Prices

In [None]:
# Read the King County house-sales CSV straight from GitHub; each row is keyed by its sale `id`
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/kc_house_data.csv'
sales = pd.read_csv(filepath_or_buffer=url, index_col='id')
# preview the first five rows
sales.head(n=5)

This dataset contains house sale prices for King County, which includes Seattle. 
It includes homes sold between May 2014 and May 2015.

There are 21 columns:

| Column | Description |
| :- | -: |
| id | Unique ID for each home sold |
| date | Date of the home sale |
| price | Price of each home sold |
| bedrooms | Number of bedrooms |
| bathrooms | Number of bathrooms, where .5 accounts for a room with a toilet but no shower |
| sqft_living | Square footage of the apartment's interior living space |
| sqft_lot | Square footage of the land space |
| floors | Number of floors |
| waterfront | A dummy variable for whether the apartment was overlooking the waterfront or not |
| view | An index from 0 to 4 of how good the view of the property was |
| condition | An index from 1 to 5 on the condition of the apartment |
| grade | An index from 1 to 13, where 1-3 falls short of building construction and design, 7 has an average level of construction and design, and 11-13 have a high quality level of construction and design |
| sqft_above | The square footage of the interior housing space that is above ground level |
| sqft_basement | The square footage of the interior housing space that is below ground level |
| yr_built | The year the house was initially built |
| yr_renovated | The year of the house's last renovation |
| zipcode | What zipcode area the house is in |
| lat | Latitude |
| long | Longitude |
| sqft_living15 | The square footage of interior housing living space for the nearest 15 neighbors |
| sqft_lot15 | The square footage of the land lots of the nearest 15 neighbors |

In [None]:
# Show the dtype of every column in `sales` (useful for spotting the non-numeric columns)
sales.dtypes

In [None]:
# Build the list of numeric predictor columns and remove the location columns from `sales`.
#
# `errors='ignore'` keeps this cell idempotent: without it, re-running the cell raises a
# KeyError because 'lat'/'long' were already removed from `sales` by the first run.
location_cols = ['lat', 'long']
sales_numeric_cols = (
    sales.select_dtypes(include=[np.number])
    .columns
    # 'price' is the regression target, so it must not appear among the features
    .drop(location_cols + ['price'], errors='ignore')
)
# NOTE(review): lat/long are dropped on the assumption that they are irrelevant. That
# assumption is not validated anywhere in this notebook and is worth checking.
sales = sales.drop(columns=location_cols, errors='ignore')

Your **goal** is to **train** a regression pipeline that predicts house prices, **tune** the pipeline hyperparameters, and **test** its performance.

In [None]:
# import model
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
# pipeline
from sklearn.pipeline import Pipeline
# preprocessing
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
# model selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# metrics
from sklearn.metrics import accuracy_score, mean_squared_error

In [None]:
# Separate the target from the predictors, then hold out a test set.
# No split size is given, so train_test_split's default applies; the seed fixes the shuffle.
y = sales['price']
X = sales.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [21]:
# Baseline pipeline: impute and scale the numeric columns, pass them through
# PolynomialFeatures, then fit a random forest regressor on the result.
numeric_features = sales_numeric_cols
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())])

# Only the columns named in `numeric_features` are routed to the numeric transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),])

# degree=1 leaves the inputs unchanged apart from a prepended constant (bias) column;
# the step exists so that `poly__degree` can be tuned by the grid search further down.
pipe_model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('poly', PolynomialFeatures(degree=1)),
                        ('reg', RandomForestRegressor(n_estimators=100, random_state=42))])

# fit the model
pipe_model.fit(X_train, y_train)

# predict on the held-out set
y_pred = pipe_model.predict(X_test)

# evaluate the model
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut is deprecated in
# scikit-learn 1.4 and removed in 1.6, while this form works on every version.
print('RMSE', np.sqrt(mean_squared_error(y_test, y_pred)))

# For a regressor, Pipeline.score reports the coefficient of determination (R^2),
# not a classification accuracy, so label it accordingly.
print('R^2:', pipe_model.score(X_test, y_test))

KeyboardInterrupt: 

In [None]:
# Sanity check: how many feature names are currently held in `numeric_features`
numeric_features.shape

In [None]:
# All numeric columns of `sales` as it stands at this point (lat/long were dropped earlier).
# NOTE: unlike `sales_numeric_cols`, this index still includes the target column 'price'.
sales_numeric_colss = sales.select_dtypes(include=[np.number]).columns

In [None]:
# feature importance
feature_importances = pipe_model.named_steps['reg'].feature_importances_

# With degree=1, PolynomialFeatures outputs [bias, x1, ..., xn] when `include_bias` is set,
# so the forest sees one more column than there are input features. Skip that constant
# column so every importance is paired with the feature it belongs to. (Previously the
# names came from a list that still contained 'price', which only lined up by coincidence
# and labelled the bias column as 'price'.)
bias_offset = int(pipe_model.named_steps['poly'].include_bias)
feature_importances_df = pd.DataFrame({
    'feature': sales_numeric_cols,
    'importance': feature_importances[bias_offset:],
})
feature_importances_df.sort_values(by='importance', ascending=False)

In [None]:
# Rank the features from most to least important and keep the names of the best 11
_ranked_importances = feature_importances_df.sort_values(by='importance', ascending=False)
top11_features = _ranked_importances.head(11)['feature'].values

In [None]:
# retrain the model with the top 11 features
X = sales[top11_features]
y = sales['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# create pipeline (same structure as the baseline, restricted to the selected features)
numeric_features = top11_features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),])

# degree=1 only prepends a constant (bias) column; the step is kept so that
# `poly__degree` can be tuned by the grid search.
pipe_model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('poly', PolynomialFeatures(degree=1)),
                        ('reg', RandomForestRegressor(n_estimators=100, random_state=42))])

# fit the model
pipe_model.fit(X_train, y_train)

# predict on the held-out set
y_pred = pipe_model.predict(X_test)

# evaluate the model
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut is deprecated in
# scikit-learn 1.4 and removed in 1.6, while this form works on every version.
print('RMSE', np.sqrt(mean_squared_error(y_test, y_pred)))

# For a regressor, Pipeline.score reports R^2, not a classification accuracy.
print('R^2:', pipe_model.score(X_test, y_test))

# feature importance
feature_importances = pipe_model.named_steps['reg'].feature_importances_

# Skip the constant bias column that PolynomialFeatures prepends, so that the remaining
# importances line up one-to-one with `top11_features` (replaces the magic slice [1:12]).
bias_offset = int(pipe_model.named_steps['poly'].include_bias)
feature_importances_df = pd.DataFrame({
    'feature': top11_features,
    'importance': feature_importances[bias_offset:],
})
feature_importances_df.sort_values(by='importance', ascending=False)

In [None]:
# define parameter grid for random forest
# NOTE: this grid has 2 * 2 * 5 * 4 * 5 = 400 candidates, i.e. 2000 fits with cv=5.
param_grid = {
    'poly__degree': [1, 2],
    'reg__n_estimators': [100, 200],
    'reg__max_depth': [2, 3, 4, 5, 10],
    'reg__min_samples_split': [2, 3, 4, 5],
    'reg__min_samples_leaf': [1, 2, 3, 4, 5],
}

# create grid search for random forest
# 'accuracy' is a classification metric and cannot score a continuous target such as
# price, so the search could never rank candidates. Use (negated) RMSE instead, which
# matches the metric reported on the test set; scikit-learn maximises the score, hence
# the "neg_" prefix.
grid = GridSearchCV(pipe_model, param_grid, cv=5, n_jobs=-1, verbose=1,
                    scoring='neg_root_mean_squared_error')

# fit grid search
grid.fit(X_train, y_train)

# best parameters
print(grid.best_params_)

# cross-validated RMSE of the best candidate (sign flipped back to a positive error)
print('Best CV RMSE:', -grid.best_score_)

# best model (refit on the whole training set by GridSearchCV)
best_model = grid.best_estimator_

**Plot** the observed prices against the predicted prices.