In [2]:
#Import Initial Libraries and Data Set
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_csv('data/kc_house_data.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  object 
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     19221 non-null  float64
 9   view           21534 non-null  float64
 10  condition      21597 non-null  int64  
 11  grade          21597 non-null  int64  
 12  sqft_above     21597 non-null  int64  
 13  sqft_basement  21597 non-null  object 
 14  yr_built       21597 non-null  int64  
 15  yr_renovated   17755 non-null  float64
 16  zipcode        21597 non-null  int64  
 17  lat            21597 non-null  float64
 18  long  

In [4]:
#Convert NA values in 'waterfront' to 0

data['view'] = data['view'].replace(np.nan, 0)

#Convert NA values in 'waterfront' to 0

data['waterfront'] = data['waterfront'].replace(np.nan, 0)

#Convert NA values in 'yr_renovated' to 0

data['yr_renovated'] = data['yr_renovated'].replace(np.nan, 0)

In [5]:
data.isna().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  object 
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     21597 non-null  float64
 9   view           21597 non-null  float64
 10  condition      21597 non-null  int64  
 11  grade          21597 non-null  int64  
 12  sqft_above     21597 non-null  int64  
 13  sqft_basement  21597 non-null  object 
 14  yr_built       21597 non-null  int64  
 15  yr_renovated   21597 non-null  float64
 16  zipcode        21597 non-null  int64  
 17  lat            21597 non-null  float64
 18  long  

In [7]:
#Convert zipcode to a str

data['zipcode_str'] = data['zipcode'].apply(lambda x: str(x))

In [8]:
#Restructure yr_built into a continuous variable called 'age'

data['age'] = data['yr_built'].apply(lambda x: 2021 - x)

In [9]:
#create 'ever_refurbished' to reflect whether or not a house has even been refurbished

data['refurbished'] = np.where((data['yr_renovated'] > 0),"Yes","No")

In [10]:
#Establish new Cont and Cat data

cont_data = ['price',
 'sqft_living',
 'sqft_lot',
 'grade',
 'sqft_above',
 'sqft_living15',
 'sqft_lot15',
 'age']

cat_data = [
 'bedrooms',
 'bathrooms',
 'floors',
 'waterfront',
 'condition',
 'zipcode_str',
 'refurbished', 'view']

In [11]:
#Recreate Features Data Set Model 5

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

#One Hot (So Hawt) Encode

data_ohe = pd.get_dummies(data[cat_data])

#Create Cont Dataset

data_cont = data[cont_data]

# #Log Data

logged_features = []

for item in list(data_cont.columns):
    data_cont[f'{item}_log'] = np.log(data_cont[item])
    logged_features.append(f'{item}_log')
    
scaled_features = []
for item in logged_features:
    data_cont[f'{item}_scaled'] = StandardScaler().fit_transform(data_cont[item].values.reshape(-1, 1))
    scaled_features.append(f'{item}_scaled')

#Create Features DataFrame
    
features_df = pd.concat([data_cont[scaled_features], data_ohe], axis = 1)

#Establish X and y

X = features_df
X = X.drop(['price_log_scaled'], axis = 1)
y = features_df['price_log_scaled']

#Instantiate Linear Regression and execute Cross Val Score for Model Validation

linreg = LinearRegression()

linreg.fit(X, y)

linreg.coef_

array([ 2.17925043e-01,  1.37093240e-01,  1.81110116e-01,  1.39407756e-01,
        1.00682102e-01, -2.77627237e-02, -3.43461166e-02, -2.73509983e-02,
        6.67635369e-02, -6.65372304e-02,  9.26980351e-01,  1.11390282e-01,
        1.25592154e-01, -1.08538967e+10, -1.08538967e+10, -1.08538967e+10,
       -1.08538967e+10, -1.08538967e+10, -1.08538967e+10, -1.08538967e+10,
       -1.08538967e+10, -1.08538967e+10, -1.08538967e+10, -1.08538967e+10,
       -1.08538967e+10, -1.08538967e+10, -1.08538967e+10, -1.08538967e+10,
       -1.08538967e+10, -1.08538967e+10, -1.08538967e+10, -1.08538967e+10,
       -1.08538967e+10, -1.08538967e+10, -1.08538967e+10, -1.08538967e+10,
       -1.08538967e+10, -1.08538967e+10, -1.08538967e+10, -1.08538967e+10,
       -1.08538967e+10, -1.08538967e+10, -1.08538967e+10, -1.08538967e+10,
       -1.08538967e+10, -1.08538967e+10, -1.08538967e+10, -1.08538967e+10,
       -1.08538967e+10, -1.08538967e+10, -1.08538967e+10, -1.08538967e+10,
       -1.08538967e+10, -

In [11]:
#Create Polynomial Dataset

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(2)

#Create a copy of X

X2 = X.copy()

#take out zipcodes to lessen iteration

non_zipcode_features = []

for item in X2.columns:
    if 'zip' not in item:
        non_zipcode_features.append(item)

#Create polynomial features

X2_poly = poly.fit_transform(X2[non_zipcode_features])

#Get polynomial column names

X2_poly_column_names = poly.get_feature_names(input_features = X.columns)

#create polynomial features Data Frame

X2_poly_df = pd.DataFrame(X2_poly)

#add column names to polynomial features Data Frame

X2_poly_df.columns = list(X2_poly_column_names)

#Isolate New Features

new_features = []

for item in X2_poly_df.columns:
    if item not in X.columns:
        new_features.append(item)

In [12]:
#Locate Useful Features from Polynomial Data Set

useful_features = []
useful_scores = []

for item in new_features:
    
    x_improve = pd.concat([X, X2_poly_df[item]], axis = 1)

    #Instantiate Linear Regression and execute Cross Val Score for Model Validation to establish baseline
    
    new_score = np.mean(cross_val_score(linreg, x_improve, y, cv = 10, scoring='r2'))

    if new_score > baseline:
        useful_features.append(item)
        useful_scores.append(new_score)
    
    print(item, new_score)


1 0.8790571010201799
sqft_living_log_scaled^2 0.8811306274109677
sqft_living_log_scaled sqft_lot_log_scaled 0.8792222080721113
sqft_living_log_scaled grade_log_scaled 0.8827146764907257
sqft_living_log_scaled sqft_above_log_scaled 0.8815795688229325
sqft_living_log_scaled sqft_living15_log_scaled 0.8800274931651556
sqft_living_log_scaled sqft_lot15_log_scaled 0.8791590219492511
sqft_living_log_scaled age_log_scaled 0.8823515652215642
sqft_living_log_scaled bedrooms 0.8796140058586351
sqft_living_log_scaled bathrooms 0.8813605812339199
sqft_living_log_scaled floors 0.8819768798491376
sqft_living_log_scaled waterfront 0.8791123510859189
sqft_living_log_scaled condition 0.8794448962620922
sqft_living_log_scaled view 0.8790420552016783
sqft_living_log_scaled zipcode_str_98001 0.87912395437923
sqft_living_log_scaled zipcode_str_98002 0.8791244743615934
sqft_lot_log_scaled^2 0.879319170247882
sqft_lot_log_scaled grade_log_scaled 0.8790575548223337
sqft_lot_log_scaled sqft_above_log_scaled 0.

In [13]:
#Sorting out top_20_features

potential_features = pd.DataFrame(useful_features)

potential_features['useful_scores'] = useful_scores

potential_features.columns =['potential_features', 'useful_scores']

top_features = list(potential_features.sort_values(by = 'useful_scores', ascending = False).head(1)
                       ['potential_features'])

top_features

['grade_log_scaled sqft_above_log_scaled']

In [14]:
#Establish improved baseline score with all new features

X_improved = pd.concat([X, X2_poly_df[useful_features]], axis = 1)

np.mean(cross_val_score(linreg, x_improve, y, cv = 10, scoring='r2'))


0.8790568123558534

In [15]:
#Estimate effect of only using the top features

X_improved = pd.concat([X, X2_poly_df[top_features]], axis = 1)

np.mean(cross_val_score(linreg, X_improved, y, cv = 10, scoring='r2'))


0.8827504703134753

## Interactions and Polynomial Analysis

After having iterated through the model several times, it is clear that the only meaningful iteration is
the relationship between Grade and Square Footage. This absolutely makes sense as the quality and size of the
house (apart from its location) is a meaningful interaction.

In [16]:
#Weeding out useless features

X_improved.columns

#Isolating non Zip Code Columns

non_zip_features = []
zip_features = []

for item in X_improved.columns:
    if 'zip' not in item:
        non_zip_features.append(item)
    else:
        zip_features.append(item)
        
non_zip_features

#Determining the effect of removing each feature from the model

isolated_item = []
score_without_item = []


for item in non_zip_features:
    isolated_features = list(X_improved.columns)
    isolated_features.remove(item)
    isolated_score = np.mean(cross_val_score(linreg, X_improved[isolated_features], y, cv = 10, scoring= 'r2'))
    isolated_item.append(item)
    score_without_item.append(isolated_score)

    
isolated_item = pd.DataFrame(isolated_item)
score_without_item = pd.DataFrame(score_without_item)
isolation_trial_df = pd.concat([isolated_item, score_without_item], axis = 1)
isolation_trial_df.columns = ['Feature', 'Model_Score_Without_Feature']

In [17]:
#Isolate Core Features With Real Effect on Model

isolation_trial_df['Keep'] = np.where(isolation_trial_df['Model_Score_Without_Feature'] <=0.882, 'Keep', 'Discard')

core_features = list(isolation_trial_df[isolation_trial_df['Keep'] == 'Keep']['Feature'])

core_features.extend(zip_features)

print(np.mean(cross_val_score(linreg, X_improved[core_features], y, cv = 10)))

print(isolation_trial_df[isolation_trial_df['Keep'] == 'Keep'])


0.8802775734439958
                                   Feature  Model_Score_Without_Feature  Keep
0                   sqft_living_log_scaled                     0.875372  Keep
1                      sqft_lot_log_scaled                     0.880496  Keep
2                         grade_log_scaled                     0.874439  Keep
3                    sqft_above_log_scaled                     0.880359  Keep
4                 sqft_living15_log_scaled                     0.879960  Keep
10                              waterfront                     0.878093  Keep
11                               condition                     0.877999  Keep
12                                    view                     0.876692  Keep
15  grade_log_scaled sqft_above_log_scaled                     0.879057  Keep


In [18]:
# Evaluate Model

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

linreg.fit(X_improved[core_features], y)

y_train_pred = linreg.predict(X_improved[core_features])

print("Training Scores:")
print(f"R2: {r2_score(y, y_train_pred)}")
print(f"Mean Absolute Error: {mean_absolute_error(y, y_train_pred)}")
# print("---")
# print("Testing Scores:")
# print(f"R2: {r2_score(y_test_zip, y_test_pred)}")
# print(f"Mean Absolute Error: {mean_absolute_error(y_test_zip, y_test_pred)}")

Training Scores:
R2: 0.8826441207194121
Mean Absolute Error: 0.25194560230926816


In [19]:
forecast_sample = data.head(1)

In [26]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

forecast_cont = forecast_sample[cont_data]

# log features

log_names = [f'{column}_log' for column in forecast_cont.columns]

forecast_cont_log = np.log(forecast_cont.astype(float)) # won't work unless float
forecast_cont_log.columns = log_names

# normalizing... can just use our scaler

forecast_cont_log_scaled = scaler.transform(forecast_cont_log)

forecast_cont_log_scaled

NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
#Recreate Features Data Set Model 5

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

#One Hot (So Hawt) Encode (for forecast)

forecast_ohe = pd.get_dummies(forecast_sample[cat_data])

#Create Cont Dataset (for forecast)

forecast_cont = forecast_sample[cont_data]

# #Log Data

forecast_logged_features = []

for item in list(forecast_cont.columns):
    forecast_cont[f'{item}_log'] = np.log(forecast_cont[item])
    forecast_logged_features.append(f'{item}_log')
    
forecast_scaled_features = []
for item in forecast_logged_features:
    forecast_cont[f'{item}_scaled'] = StandardScaler().fit_transform(forecast_cont[item].values.reshape(-1, 1))
    forecast_scaled_features.append(f'{item}_scaled')

#Create Features DataFrame
    
forecast_features_df = pd.concat([forecast_cont[scaled_features], forecast_ohe], axis = 1)

# #Establish X and y

# X = forecast_features_df
# X = X.drop(['price_log_scaled'], axis = 1)
# y = features_df['price_log_scaled']

# #Instantiate Linear Regression and execute Cross Val Score for Model Validation

# linreg = LinearRegression()

# baseline = np.mean(cross_val_score(linreg, X, y, cv = 10, scoring='r2'))

# baseline

forecast_features_df