In [174]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import Imputer, scale
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, accuracy_score
import statsmodels.api as sm

  from pandas.core import datetools


# 1. Prepare regression functions

In [143]:
def split(var, res, t_size, r_state):
    X_train, X_test, y_train, y_test = train_test_split(var, res, test_size=t_size, random_state=r_state)
    return X_train, X_test, y_train, y_test

In [171]:
def linear_regression(X_train, X_test, y_train, y_test):
    reg = LinearRegression()
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    print('Simple Linear Regression')
    print('R^2: {}'.format(reg.score(X_train, y_train)))
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print('Root Mean Squared Error: {}'.format(rmse))

In [172]:
def ridge(X_train, X_test, y_train, y_test, alpha):
    regr_cv = RidgeCV(alphas=alpha, normalize=True)
    ridge = regr_cv.fit(X_train, y_train)
    ridge_pred=ridge.predict(X_test)
    print('Ridge')
    print('R^2: {}'.format(ridge.score(X_train, y_train)))
    rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
    print('Root Mean Squared Error: {}'.format(rmse))
    print('Alpha: {}'.format(ridge.alpha_))

In [173]:
def lasso(X_train, X_test, y_train, y_test, alpha):
    regr_cv = LassoCV(alphas=alpha, normalize=True)
    lasso = regr_cv.fit(X_train, y_train)
    lasso_pred=lasso.predict(X_test)
    print('Lasso')
    print('R^2: {}'.format(lasso.score(X_train, y_train)))
    rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
    print('Root Mean Squared Error: {}'.format(rmse))
    print('Alpha: {}'.format(lasso.alpha_))

In [147]:
## def lasso_regression(X_train, X_test, y_train, y_test, alpha):
#     lasso=Lasso(alpha, normalize=True)
#     lasso.fit(X_train, y_train)
#     lasso=lasso.predict(X_test)
#     print('R^2: {}'.format(lasso.score(X_train, y_train)))
#     rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
#     print('Root Mean Squared Error: {}'.format(rmse))

In [148]:
## Hyperparameter tuning using GridSearchCV
# def gridsearch(model, alphas, cv, X_train, X_test, y_train, y_test):
#     grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
#     grid.fit(X_train, y_train)
#     y_pred = grid.predict(X_test)
#     rmse = np.sqrt(mean_squared_error(y_test, y_pred))
#     print('Root Mean Squared Error: {}'.format(rmse))
#     print(grid.best_score_)
#     print(grid.best_estimator_.alpha)

# 2. Preprocessing the data

In [149]:
filename = 'Preprocessing Data/cleaned_data.csv'
df = pd.read_csv(filename, index_col=0)

## 2.1 Seperate property type

In [150]:
df_sfh = df.loc[df['PROPERTY TYPE'] == 'SFH']
df_th = df.loc[df['PROPERTY TYPE'] == 'TH']
df_condo = df.loc[df['PROPERTY TYPE'] == 'Condo']

df_sfh = df_sfh.drop(['PROPERTY TYPE'], axis=1)
df_th = df_th.drop(['PROPERTY TYPE'], axis=1)
df_condo = df_condo.drop(['PROPERTY TYPE'], axis=1)

In [152]:
#delete some columns are not available for the specific property type
df_sfh = df_sfh.drop(['HOA/MONTH', 'COUNTY', 'CITY', 'LOT', 'HOA'], axis=1)
df_th = df_th.drop(['LOT SIZE', 'COUNTY', 'CITY', 'LOT', 'HOA'], axis=1)
df_condo = df_condo.drop(['LOT SIZE', 'COUNTY', 'CITY', 'LOT', 'HOA'], axis=1)

In [153]:
#encoding dummy variables
df_sfh_dummies = pd.get_dummies(df_sfh)
df_th_dummies = pd.get_dummies(df_th)
df_condo_dummies = pd.get_dummies(df_condo)

In [154]:
#split dataset into feature variables and results
df_sfh_y = df_sfh_dummies['PRICE IN K'].values.ravel() 
df_sfh_X = df_sfh_dummies.drop(['PRICE IN K'], axis=1)

df_th_y = df_th_dummies['PRICE IN K'].values.ravel()
df_th_X = df_th_dummies.drop(['PRICE IN K'], axis=1)

df_condo_y = df_condo_dummies['PRICE IN K'].values.ravel()
df_condo_X = df_condo_dummies.drop(['PRICE IN K'], axis=1)

In [155]:
# scaling the data
df_sfh_X = scale(df_sfh_X)
df_th_X = scale(df_th_X)
df_condo_X = scale(df_condo_X)

## 2.2 Use different location info but keep three property types together

2.2.1 Using county as the only location info

In [162]:
df_county = df.drop(['CITY', 'ZIP'], axis=1)
df_county_dummies = pd.get_dummies(df_county)
#split dataset into feature variables and results
df_county_y = df_county_dummies['PRICE IN K']
df_county_X = df_county_dummies.drop(['PRICE IN K'], axis=1)
# scaling the data
df_county_X = scale(df_county_X)

2.2.2 Using city as the only location info

In [163]:
df_city = df.drop(['COUNTY', 'ZIP'], axis=1)
df_city_dummies = pd.get_dummies(df_city)
#split dataset into feature variables and results
df_city_y = df_city_dummies['PRICE IN K']
df_city_X = df_city_dummies.drop(['PRICE IN K'], axis=1)
# scaling the data
df_city_X = scale(df_city_X)

2.2.3 Using zip as the only location info

In [165]:
df_zip = df.drop(['COUNTY', 'CITY'], axis=1)
df_zip_dummies = pd.get_dummies(df_zip)
#split dataset into feature variables and results
df_zip_y = df_zip_dummies['PRICE IN K']
df_zip_X = df_zip_dummies.drop(['PRICE IN K'], axis=1)
# scaling the data
df_zip_X = scale(df_zip_X)

## 2.3 Keep all location info

In [169]:
df_dummies = pd.get_dummies(df)
df_y = df_dummies['PRICE IN K']
df_X = df_dummies.drop(['PRICE IN K'], axis=1)
df_X = scale(df_X)

# 3. Regression

In [None]:
alpha = 10**np.linspace(10,-2,100)*0.5

## 3.1 Based on different property type, no city or county, only zip is included for location

3.1.1 Single family house

In [156]:
X_train, X_test, y_train, y_test = split(df_sfh_X, df_sfh_y, 0.2, 42)
linear_regression(X_train, X_test, y_train, y_test)
ridge(X_train, X_test, y_train, y_test, alpha)
lasso(X_train, X_test, y_train, y_test, alpha)

linear regression
R^2: 0.7020237723213877
Root Mean Squared Error: 9.911459575141237e+16
ridge
R^2: 0.7079371942714945
Root Mean Squared Error: 70035661.00897849
Alpha: 0.005
lasso
R^2: 0.7073935750906402
Root Mean Squared Error: 612.6002262318809
Alpha: 0.01155064850041579


In [175]:
res = sm.OLS(df_sfh_y, df_sfh_X).fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.335
Model:                            OLS   Adj. R-squared:                  0.320
Method:                 Least Squares   F-statistic:                     22.49
Date:                Wed, 23 May 2018   Prob (F-statistic):               0.00
Time:                        22:01:19   Log-Likelihood:            -2.3553e+05
No. Observations:               26792   AIC:                         4.722e+05
Df Residuals:                   26204   BIC:                         4.771e+05
Df Model:                         588                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1          -113.8612     14.941     -7.621      0.0

3.1.2 Townhouse

In [157]:
X_train, X_test, y_train, y_test = split(df_th_X, df_th_y, 0.2, 42)
linear_regression(X_train, X_test, y_train, y_test)
ridge(X_train, X_test, y_train, y_test, alpha)
lasso(X_train, X_test, y_train, y_test, alpha)

linear regression
R^2: 0.8514082669860659
Root Mean Squared Error: 8597578450466940.0
ridge
R^2: 0.8514946669865797
Root Mean Squared Error: 94857.60996166918
Alpha: 0.020185086292982747
lasso
R^2: 0.8507158230823169
Root Mean Squared Error: 166.86458882950083
Alpha: 0.026683496156031508


3.1.3 Condo

In [158]:
X_train, X_test, y_train, y_test = split(df_condo_X, df_condo_y, 0.2, 42)
linear_regression(X_train, X_test, y_train, y_test)
ridge(X_train, X_test, y_train, y_test, alpha)
lasso(X_train, X_test, y_train, y_test, alpha)

linear regression
R^2: 0.8529052802847329
Root Mean Squared Error: 7182094952388228.0
ridge
R^2: 0.8528972695304778
Root Mean Squared Error: 111508.0472641165
Alpha: 0.026683496156031508
lasso
R^2: 0.8523873447917717
Root Mean Squared Error: 144.34533029316972
Alpha: 0.015269277544167062


##  3.2 Use different location info but keep three property types together

3.2.1 Using county as the only location info

In [166]:
X_train, X_test, y_train, y_test = split(df_county_X, df_county_y, 0.2, 42)
linear_regression(X_train, X_test, y_train, y_test)
ridge(X_train, X_test, y_train, y_test, alpha)
lasso(X_train, X_test, y_train, y_test, alpha)

linear regression
R^2: 0.584933141516951
Root Mean Squared Error: 768.6122138314541
ridge
R^2: 0.5851022030546079
Root Mean Squared Error: 768.3835109861017
Alpha: 0.005
lasso
R^2: 0.5850483288352191
Root Mean Squared Error: 768.3360457241216
Alpha: 0.015269277544167062


3.2.2 Using city as the only location info

In [167]:
X_train, X_test, y_train, y_test = split(df_city_X, df_city_y, 0.2, 42)
linear_regression(X_train, X_test, y_train, y_test)
ridge(X_train, X_test, y_train, y_test, alpha)
lasso(X_train, X_test, y_train, y_test, alpha)

linear regression
R^2: 0.7190211565770122
Root Mean Squared Error: 642.769510768626
ridge
R^2: 0.719060409992221
Root Mean Squared Error: 642.5486671243328
Alpha: 0.005
lasso
R^2: 0.7190024742201006
Root Mean Squared Error: 642.4307932826421
Alpha: 0.008737642000038414


3.2.3 Using zip as the only location info

In [168]:
X_train, X_test, y_train, y_test = split(df_zip_X, df_zip_y, 0.2, 42)
linear_regression(X_train, X_test, y_train, y_test)
ridge(X_train, X_test, y_train, y_test, alpha)
lasso(X_train, X_test, y_train, y_test, alpha)

linear regression
R^2: 0.7073571449520233
Root Mean Squared Error: 1.1033945722667346e+17
ridge
R^2: 0.7190989862183899
Root Mean Squared Error: 1073909.527087215
Alpha: 0.005
lasso
R^2: 0.718584687045869
Root Mean Squared Error: 647.597088865871
Alpha: 0.008737642000038414


# 3.3 Keep all location info

In [170]:
X_train, X_test, y_train, y_test = split(df_X, df_y, 0.2, 42)
linear_regression(X_train, X_test, y_train, y_test)
ridge(X_train, X_test, y_train, y_test, alpha)
lasso(X_train, X_test, y_train, y_test, alpha)

linear regression
R^2: 0.7205459437044159
Root Mean Squared Error: 9.963008720309867e+16
ridge
R^2: 0.7348853396946371
Root Mean Squared Error: 342360.67443367
Alpha: 0.005
lasso
R^2: 0.7340892840033086
Root Mean Squared Error: 634.7018294548823
Alpha: 0.01155064850041579
