<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Modelling-with-Polynomial-Features-and-Select-Kbest" data-toc-modified-id="Modelling-with-Polynomial-Features-and-Select-Kbest-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Modelling with Polynomial Features and Select Kbest</a></span></li><li><span><a href="#Imports" data-toc-modified-id="Imports-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Useful-Scripts" data-toc-modified-id="Useful-Scripts-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Useful Scripts</a></span></li><li><span><a href="#Load-the-data" data-toc-modified-id="Load-the-data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load the data</a></span></li><li><span><a href="#Select-features-from-cleaned-data" data-toc-modified-id="Select-features-from-cleaned-data-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Select features from cleaned data</a></span></li><li><span><a href="#Add-polynomial-interaction-features" data-toc-modified-id="Add-polynomial-interaction-features-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Add polynomial interaction features</a></span></li><li><span><a href="#Train-Test-split" data-toc-modified-id="Train-Test-split-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Train Test split</a></span></li><li><span><a href="#Select-k-best-features" data-toc-modified-id="Select-k-best-features-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Select k best features</a></span></li><li><span><a href="#Regression-Modelling" data-toc-modified-id="Regression-Modelling-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Regression Modelling</a></span></li><li><span><a href="#Single-Script" data-toc-modified-id="Single-Script-10"><span class="toc-item-num">10&nbsp;&nbsp;</span>Single Script</a></span></li></ul></div>

# Modelling with Polynomial Features and Select Kbest

# Imports

In [42]:
import numpy as np
import pandas as pd

import os
import time
import collections
import itertools

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# random state
random_state=100
np.random.seed=random_state
np.random.set_state=random_state

# Useful Scripts

In [27]:
def adjustedR2(rsquared,nrows,kcols):
    return rsquared- (kcols-1)/(nrows-kcols) * (1-rsquared)

# Load the data

In [3]:
df = pd.read_csv('../data/processed/data_cleaned_encoded.csv')

print(df.shape)
df.head(2)

(21613, 92)


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,age_after_renovation_cat_7,age_after_renovation_cat_8,age_after_renovation_cat_9,log1p_price,log1p_sqft_living,log1p_sqft_lot,log1p_sqft_above,log1p_sqft_basement,log1p_sqft_living15,log1p_sqft_lot15
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,...,0,0,0,12.309987,7.074117,8.639588,7.074117,0.0,7.201171,8.639588
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,...,0,0,0,13.195616,7.85205,8.887791,7.682943,5.993961,7.433075,8.941153


# Select features from cleaned data

In [23]:
print(df.columns.to_numpy())

['id' 'date' 'price' 'bedrooms' 'bathrooms' 'sqft_living' 'sqft_lot'
 'floors' 'waterfront' 'view' 'condition' 'grade' 'sqft_above'
 'sqft_basement' 'yr_built' 'yr_renovated' 'zipcode' 'lat' 'long'
 'sqft_living15' 'sqft_lot15' 'yr_sales' 'age' 'yr_renovated2'
 'age_after_renovation' 'zipcode_top10' 'zipcode_houses' 'basement_bool'
 'renovation_bool' 'age_cat' 'age_after_renovation_cat' 'waterfront_0'
 'waterfront_1' 'view_0' 'view_1' 'view_2' 'view_3' 'view_4' 'condition_1'
 'condition_2' 'condition_3' 'condition_4' 'condition_5' 'grade_1'
 'grade_10' 'grade_11' 'grade_12' 'grade_13' 'grade_3' 'grade_4' 'grade_5'
 'grade_6' 'grade_7' 'grade_8' 'grade_9' 'zipcode_top10_98004'
 'zipcode_top10_98006' 'zipcode_top10_98033' 'zipcode_top10_98039'
 'zipcode_top10_98040' 'zipcode_top10_98102' 'zipcode_top10_98105'
 'zipcode_top10_98155' 'zipcode_top10_98177' 'zipcode_top10_others'
 'age_cat_0' 'age_cat_1' 'age_cat_2' 'age_cat_3' 'age_cat_4' 'age_cat_5'
 'age_cat_6' 'age_cat_7' 'age_cat_8' 'ag

In [28]:
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'basement_bool', 'renovation_bool', 'zipcode_houses', 'waterfront_0', 'waterfront_1', 'view_0', 'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10', 'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9', 'zipcode_top10_98004', 'zipcode_top10_98006', 'zipcode_top10_98033', 'zipcode_top10_98039', 'zipcode_top10_98040', 'zipcode_top10_98102', 'zipcode_top10_98105', 'zipcode_top10_98155', 'zipcode_top10_98177', 'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0', 'age_after_renovation_cat_1', 'age_after_renovation_cat_2', 'age_after_renovation_cat_3', 'age_after_renovation_cat_4', 'age_after_renovation_cat_5', 'age_after_renovation_cat_6', 'age_after_renovation_cat_7', 'age_after_renovation_cat_8', 'age_after_renovation_cat_9']

print(features)

['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'basement_bool', 'renovation_bool', 'zipcode_houses', 'waterfront_0', 'waterfront_1', 'view_0', 'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10', 'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9', 'zipcode_top10_98004', 'zipcode_top10_98006', 'zipcode_top10_98033', 'zipcode_top10_98039', 'zipcode_top10_98040', 'zipcode_top10_98102', 'zipcode_top10_98105', 'zipcode_top10_98155', 'zipcode_top10_98177', 'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0', 'age_after_renovation_cat_1', 'age_after_renovation_cat_2', 'age_after_renovation_cat_3', 'age_after_renovation_cat_

In [29]:
target = ['price']
df_selected = df[features + target]
df_selected.head(2)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,...,age_after_renovation_cat_1,age_after_renovation_cat_2,age_after_renovation_cat_3,age_after_renovation_cat_4,age_after_renovation_cat_5,age_after_renovation_cat_6,age_after_renovation_cat_7,age_after_renovation_cat_8,age_after_renovation_cat_9,price
0,3,1.0,1180,5650,1180,0,1955,0,47.5112,-122.257,...,0,0,0,0,1,0,0,0,0,221900.0
1,3,2.25,2570,7242,2170,400,1951,1991,47.721,-122.319,...,0,1,0,0,0,0,0,0,0,538000.0


# Add polynomial interaction features

In [30]:
def add_interactions(df):
    from itertools import combinations
    from sklearn.preprocessing import PolynomialFeatures

    # Get feature names
    combos = list(combinations(list(df.columns), 2))
    colnames = list(df.columns) + ['_'.join(x) for x in combos]
    
    # Find interactions
    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    df = poly.fit_transform(df)
    df = pd.DataFrame(df)
    df.columns = colnames
    
    # Remove interaction terms with all 0 values            
    noint_indicies = [i for i, x in enumerate(list((df == 0).all())) if x]
    df = df.drop(df.columns[noint_indicies], axis=1)
    
    return df

In [31]:
df_large = add_interactions(df_selected)
print(df_large.shape)
df_large.head()

(21613, 2047)


Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,...,age_after_renovation_cat_0_price,age_after_renovation_cat_1_price,age_after_renovation_cat_2_price,age_after_renovation_cat_3_price,age_after_renovation_cat_4_price,age_after_renovation_cat_5_price,age_after_renovation_cat_6_price,age_after_renovation_cat_7_price,age_after_renovation_cat_8_price,age_after_renovation_cat_9_price
0,3.0,1.0,1180.0,5650.0,1180.0,0.0,1955.0,0.0,47.5112,-122.257,...,0.0,0.0,0.0,0.0,0.0,221900.0,0.0,0.0,0.0,0.0
1,3.0,2.25,2570.0,7242.0,2170.0,400.0,1951.0,1991.0,47.721,-122.319,...,0.0,0.0,538000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,1.0,770.0,10000.0,770.0,0.0,1933.0,0.0,47.7379,-122.233,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180000.0,0.0,0.0
3,4.0,3.0,1960.0,5000.0,1050.0,910.0,1965.0,0.0,47.5208,-122.393,...,0.0,0.0,0.0,0.0,604000.0,0.0,0.0,0.0,0.0,0.0
4,3.0,2.0,1680.0,8080.0,1680.0,0.0,1987.0,0.0,47.6168,-122.045,...,0.0,0.0,510000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train Test split

In [7]:
# train test split
df_train_large, df_test_large = train_test_split(df_large, test_size=0.2,
                                                 random_state=random_state)
print(df_train_large.shape)
df_train_large.head(2)

(17290, 2047)


Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,...,age_after_renovation_cat_0_price,age_after_renovation_cat_1_price,age_after_renovation_cat_2_price,age_after_renovation_cat_3_price,age_after_renovation_cat_4_price,age_after_renovation_cat_5_price,age_after_renovation_cat_6_price,age_after_renovation_cat_7_price,age_after_renovation_cat_8_price,age_after_renovation_cat_9_price
16000,3.0,1.75,1780.0,11096.0,1210.0,570.0,1979.0,0.0,47.617,-122.051,...,0.0,0.0,0.0,325000.0,0.0,0.0,0.0,0.0,0.0,0.0
11286,2.0,2.5,1420.0,2229.0,1420.0,0.0,2004.0,0.0,47.4871,-122.165,...,278000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
df_Xtrain_large = df_train_large.drop(target,axis=1)
df_ytrain_large = df_train_large[target]

df_Xtest_large = df_test_large.drop(target,axis=1)
df_ytest_large = df_test_large[target]

df_ytrain_large.head()

Unnamed: 0,price
16000,325000.0
11286,278000.0
3201,710000.0
11049,389900.0
9716,489000.0


In [32]:
# numpy arrays
Xtrain_large = df_Xtrain_large.to_numpy()
ytrain_large = df_ytrain_large.to_numpy().ravel()

Xtrain_large.shape, ytrain_large.shape

((17290, 2046), (17290,))

# Select k best features

In [41]:
import sklearn.feature_selection

k = 300

model_kbest = sklearn.feature_selection.SelectKBest(k=k)
model_kbest = model_kbest.fit(Xtrain_large, ytrain_large)

idx_kbest = model_kbest.get_support(indices=True)

cols_kbest = [df_train_large.columns[i] for i in idx_kbest]

Xtrain_kbest = df_Xtrain_large[cols_kbest].to_numpy()
ytrain_kbest = df_ytrain_large.to_numpy().ravel()

Xtest_kbest = df_Xtest_large[cols_kbest].to_numpy()
ytest_kbest = df_ytest_large.to_numpy().ravel()

Xtrain_kbest.shape, Xtest_kbest.shape

  993 1045 1087 1092 1104 1107 1111 1138 1181 1274 1326 1328 1353 1362
 1369 1377 1386 1389 1391 1414 1530 1531 1532 1596 1599 1600 1607 1614
 1619 1620 1621 1622 1626 1628 1630 1631 1634 1636 1638 1641 1653 1662
 1663 1812 1862 1868 1892 1897 1898 1999] are constant.
  f = msb / msw
  f = msb / msw


((17290, 300), (4323, 300))

# Regression Modelling

In [43]:
# model = RandomForestRegressor()
model = LinearRegression()

# fitting
model.fit(Xtrain_kbest,ytrain_kbest)

# prediction
ypreds = model.predict(Xtest_kbest)

from sklearn.metrics import r2_score

r2 = r2_score(ytest_kbest,ypreds)

ar2 = adjustedR2(r2,Xtest_kbest.shape[0],Xtest_kbest.shape[1])

print('R-squared Value = ', r2)
print('Adjusted R-squared Value = ', ar2)

R-squared Value =  0.9999625814755158
Adjusted R-squared Value =  0.9999598004318119


# Single Script

In [45]:
Xtrain_large.shape, ytrain_large.shape

((17290, 2046), (17290,))

In [40]:
import sklearn.feature_selection

# fit the select KBest on training data to get best features
k = 300
model_kbest = sklearn.feature_selection.SelectKBest(k=k)
model_kbest = model_kbest.fit(Xtrain_large, ytrain_large)

idx_kbest = model_kbest.get_support(indices=True)
cols_kbest = [df_train_large.columns[i] for i in idx_kbest]


# get numpy arrays using best features
Xtrain_kbest = df_Xtrain_large[cols_kbest].to_numpy()
ytrain_kbest = df_ytrain_large.to_numpy().ravel()

Xtest_kbest = df_Xtest_large[cols_kbest].to_numpy()
ytest_kbest = df_ytest_large.to_numpy().ravel()


# from sklearn.ensemble import RandomForestRegressor
# model = RandomForestRegressor()

from sklearn.linear_model import LinearRegression
model = LinearRegression()

# fitting
model.fit(Xtrain_kbest,ytrain_kbest)

# prediction
ypreds = model.predict(Xtest_kbest)

from sklearn.metrics import r2_score

r2 = r2_score(ytest_kbest,ypreds)

ar2 = adjustedR2(r2,Xtest_kbest.shape[0],Xtest_kbest.shape[1])

print('R-squared Value = ', r2)
print('Adjusted R-squared Value = ', ar2)

  993 1045 1087 1092 1104 1107 1111 1138 1181 1274 1326 1328 1353 1362
 1369 1377 1386 1389 1391 1414 1530 1531 1532 1596 1599 1600 1607 1614
 1619 1620 1621 1622 1626 1628 1630 1631 1634 1636 1638 1641 1653 1662
 1663 1812 1862 1868 1892 1897 1898 1999] are constant.
  f = msb / msw
  f = msb / msw


R-squared Value =  0.9999625814755158
Adjusted R-squared Value =  0.9999598004318119
