In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import Imputer, scale
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, accuracy_score
import statsmodels.api as sm

  from pandas.core import datetools


# 1. Prepare regression functions

In [2]:
def split(var, res, t_size, r_state):
    X_train, X_test, y_train, y_test = train_test_split(var, res, test_size=t_size, random_state=r_state)
    return X_train, X_test, y_train, y_test

In [3]:
def linear_regression(X_train, X_test, y_train, y_test):
    reg = LinearRegression()
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    print('Simple Linear Regression')
    print('R^2: {}'.format(reg.score(X_train, y_train)))
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print('Root Mean Squared Error: {}'.format(rmse))
    return reg.coef_

In [4]:
def ridge(X_train, X_test, y_train, y_test, alpha):
    regr_cv = RidgeCV(alphas=alpha, normalize=True)
    ridge = regr_cv.fit(X_train, y_train)
    ridge_pred=ridge.predict(X_test)
    print('Ridge')
    print('R^2: {}'.format(ridge.score(X_train, y_train)))
    rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
    print('Root Mean Squared Error: {}'.format(rmse))
    print('Alpha: {}'.format(ridge.alpha_))
    return ridge.coef_

In [5]:
def lasso(X_train, X_test, y_train, y_test, alpha):
    regr_cv = LassoCV(alphas=alpha, normalize=True)
    lasso = regr_cv.fit(X_train, y_train)
    lasso_pred=lasso.predict(X_test)
    print('Lasso')
    print('R^2: {}'.format(lasso.score(X_train, y_train)))
    rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
    print('Root Mean Squared Error: {}'.format(rmse))
    print('Alpha: {}'.format(lasso.alpha_))
    return lasso.coef_

# 2. Preprocessing the data

In [8]:
filename = 'Preprocessing Data/cleaned_data.csv'
df = pd.read_csv(filename, index_col=0)
df['ZIP'] = df['ZIP'].astype(str)


## 2.1 Seperate property type

In [9]:
df_sfh = df.loc[df['PROPERTY TYPE'] == 'SFH']
df_th = df.loc[df['PROPERTY TYPE'] == 'TH']
df_condo = df.loc[df['PROPERTY TYPE'] == 'Condo']

df_sfh = df_sfh.drop(['PROPERTY TYPE'], axis=1)
df_th = df_th.drop(['PROPERTY TYPE'], axis=1)
df_condo = df_condo.drop(['PROPERTY TYPE'], axis=1)

In [10]:
#delete some columns are not available for the specific property type
df_sfh = df_sfh.drop(['HOA/MONTH', 'COUNTY', 'CITY', 'LOT', 'HOA'], axis=1)
df_th = df_th.drop(['LOT SIZE', 'COUNTY', 'CITY', 'LOT', 'HOA'], axis=1)
df_condo = df_condo.drop(['LOT SIZE', 'COUNTY', 'CITY', 'LOT', 'HOA'], axis=1)

In [11]:
#encoding dummy variables
df_sfh_dummies = pd.get_dummies(df_sfh)
df_th_dummies = pd.get_dummies(df_th)
df_condo_dummies = pd.get_dummies(df_condo)
df_sfh.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26792 entries, 0 to 32415
Data columns (total 7 columns):
ZIP            26792 non-null object
PRICE IN K     26792 non-null float64
BEDS           26792 non-null float64
BATHS          26792 non-null float64
SQUARE FEET    26792 non-null float64
LOT SIZE       26792 non-null float64
YEAR BUILT     26792 non-null float64
dtypes: float64(6), object(1)
memory usage: 1.6+ MB


In [21]:
#split dataset into feature variables and results
df_sfh_y = df_sfh_dummies['PRICE IN K'].values.ravel() 
df_sfh_X = df_sfh_dummies.drop(['PRICE IN K'], axis=1)
sfh_col = df_sfh_X.columns

df_th_y = df_th_dummies['PRICE IN K'].values.ravel()
df_th_X = df_th_dummies.drop(['PRICE IN K'], axis=1)
th_col = df_th_X.columns

df_condo_y = df_condo_dummies['PRICE IN K'].values.ravel()
df_condo_X = df_condo_dummies.drop(['PRICE IN K'], axis=1)
condo_col = df_condo_X.columns

In [13]:
# scaling the data
df_sfh_X = scale(df_sfh_X)
df_th_X = scale(df_th_X)
df_condo_X = scale(df_condo_X)

## 2.2 Use different location info but keep three property types together

2.2.1 Using county as the only location info

In [27]:
df_county = df.drop(['CITY', 'ZIP'], axis=1)
df_county_dummies = pd.get_dummies(df_county)
#split dataset into feature variables and results
df_county_y = df_county_dummies['PRICE IN K']
df_county_X = df_county_dummies.drop(['PRICE IN K'], axis=1)
county_col = df_county_X.columns
# scaling the data
df_county_X = scale(df_county_X)

2.2.2 Using city as the only location info

In [26]:
df_city = df.drop(['COUNTY', 'ZIP'], axis=1)
df_city_dummies = pd.get_dummies(df_city)
#split dataset into feature variables and results
df_city_y = df_city_dummies['PRICE IN K']
df_city_X = df_city_dummies.drop(['PRICE IN K'], axis=1)
city_col = df_city_X.columns
# scaling the data
df_city_X = scale(df_city_X)

2.2.3 Using zip as the only location info

In [28]:
df_zip = df.drop(['COUNTY', 'CITY'], axis=1)
df_zip_dummies = pd.get_dummies(df_zip)
#split dataset into feature variables and results
df_zip_y = df_zip_dummies['PRICE IN K']
df_zip_X = df_zip_dummies.drop(['PRICE IN K'], axis=1)
zip_col = df_zip_X.columns
# scaling the data
df_zip_X = scale(df_zip_X)

## 2.3 Keep all location info

In [50]:
df_dummies = pd.get_dummies(df)
df_y = df_dummies['PRICE IN K']
df_X = df_dummies.drop(['PRICE IN K'], axis=1)
df_col = df_X.columns
df_X = scale(df_X)

# 3. Regression

In [18]:
alpha = 10**np.linspace(10,-2,100)*0.5

## 3.1 Based on different property type, no city or county, only zip is included for location

3.1.1 Single family house

In [32]:
X_train, X_test, y_train, y_test = split(df_sfh_X, df_sfh_y, 0.2, 42)
linear_coef = linear_regression(X_train, X_test, y_train, y_test)
coefficients = pd.DataFrame({"Feature":sfh_col,"Coefficients":np.transpose(linear_coef)})
print(coefficients.to_string())

Simple Linear Regression
R^2: 0.7041927195308912
Root Mean Squared Error: 615.9830443294132
     Coefficients      Feature
0   -1.307868e+02         BEDS
1   -2.362748e-01        BATHS
2    8.433540e-01  SQUARE FEET
3    5.599712e-04     LOT SIZE
4   -3.217444e+00   YEAR BUILT
5    2.270818e+02    ZIP_94002
6   -3.033014e+02    ZIP_94005
7    7.847612e+02    ZIP_94010
8   -3.405534e+02    ZIP_94014
9   -1.622065e+02    ZIP_94015
10  -3.890893e+02    ZIP_94018
11  -4.197681e+02    ZIP_94019
12  -4.878419e+02    ZIP_94020
13  -4.681790e+02    ZIP_94021
14   1.403537e+03    ZIP_94022
15   1.111887e+03    ZIP_94024
16   8.815454e+02    ZIP_94025
17   3.533076e+03    ZIP_94027
18   9.001956e+02    ZIP_94028
19   1.320037e+02    ZIP_94030
20  -3.133331e+02    ZIP_94037
21  -2.798800e+02    ZIP_94038
22   8.710829e+02    ZIP_94040
23   5.447080e+02    ZIP_94041
24   6.032451e+02    ZIP_94043
25  -1.715597e+02    ZIP_94044
26  -1.749677e+02    ZIP_94060
27   2.430548e+02    ZIP_94061
28   8.41

In [33]:
ridge_coef=ridge(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":sfh_col,"Coefficients":np.transpose(ridge_coef)})
print(coefficients.to_string())

Ridge
R^2: 0.7041525091991406
Root Mean Squared Error: 615.1894093126468
Alpha: 0.005
     Coefficients      Feature
0     -126.347067         BEDS
1        9.561111        BATHS
2        0.827860  SQUARE FEET
3        0.000564     LOT SIZE
4       -3.185660   YEAR BUILT
5      218.657736    ZIP_94002
6     -308.726934    ZIP_94005
7      781.217894    ZIP_94010
8     -346.744321    ZIP_94014
9     -172.592256    ZIP_94015
10    -394.040929    ZIP_94018
11    -422.492572    ZIP_94019
12    -489.834850    ZIP_94020
13    -475.742770    ZIP_94021
14    1396.401402    ZIP_94022
15    1103.318951    ZIP_94024
16     870.787297    ZIP_94025
17    3530.217881    ZIP_94027
18     897.603180    ZIP_94028
19     125.455576    ZIP_94030
20    -317.921088    ZIP_94037
21    -284.429348    ZIP_94038
22     859.442242    ZIP_94040
23     538.025808    ZIP_94041
24     590.753094    ZIP_94043
25    -180.592548    ZIP_94044
26    -179.009062    ZIP_94060
27     233.739013    ZIP_94061
28     838.4754

In [34]:
lasso_coef=lasso(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":sfh_col,"Coefficients":np.transpose(lasso_coef)})
print(coefficients.to_string())

Lasso
R^2: 0.7041161604290995
Root Mean Squared Error: 615.5369161199044
Alpha: 0.006609705742330144
     Coefficients      Feature
0     -127.777663         BEDS
1        0.000000        BATHS
2        0.840700  SQUARE FEET
3        0.000550     LOT SIZE
4       -3.204548   YEAR BUILT
5      388.071574    ZIP_94002
6     -119.743271    ZIP_94005
7      948.642429    ZIP_94010
8     -162.744450    ZIP_94014
9        0.000000    ZIP_94015
10    -202.500065    ZIP_94018
11    -243.823722    ZIP_94019
12    -296.667740    ZIP_94020
13    -240.081224    ZIP_94021
14    1565.766813    ZIP_94022
15    1272.215293    ZIP_94024
16    1044.085201    ZIP_94025
17    3695.077168    ZIP_94027
18    1060.350696    ZIP_94028
19     292.352178    ZIP_94030
20    -129.795097    ZIP_94037
21     -94.450532    ZIP_94038
22    1023.590806    ZIP_94040
23     694.348836    ZIP_94041
24     755.330009    ZIP_94043
25      -0.000000    ZIP_94044
26      -0.000000    ZIP_94060
27     404.016212    ZIP_94061


3.1.2 Townhouse

In [35]:
X_train, X_test, y_train, y_test = split(df_th_X, df_th_y, 0.2, 42)
linear_coef = linear_regression(X_train, X_test, y_train, y_test)
coefficients = pd.DataFrame({"Feature":th_col,"Coefficients":np.transpose(linear_coef)})
print(coefficients.to_string())

Simple Linear Regression
R^2: 0.8461298173827818
Root Mean Squared Error: 163.76773608514355
     Coefficients      Feature
0    2.227403e+01         BEDS
1    8.950949e+01        BATHS
2    3.536231e-01  SQUARE FEET
3    2.646931e+00   YEAR BUILT
4    8.098575e-02    HOA/MONTH
5    1.266337e+02    ZIP_94002
6   -1.481966e+02    ZIP_94005
7    2.169163e+02    ZIP_94010
8   -4.803867e+01    ZIP_94014
9   -1.849877e+02    ZIP_94015
10  -1.259130e+02    ZIP_94019
11   8.109871e+02    ZIP_94022
12   4.735172e+02    ZIP_94024
13   6.897360e+02    ZIP_94025
14   4.780338e+02    ZIP_94040
15   5.211858e+02    ZIP_94041
16   3.312900e+02    ZIP_94043
17  -1.050380e+02    ZIP_94044
18   1.040715e+02    ZIP_94061
19  -3.664839e-13    ZIP_94062
20   2.178049e+02    ZIP_94063
21   3.348052e+02    ZIP_94065
22   1.652288e+02    ZIP_94070
23  -1.140416e+02    ZIP_94080
24   1.607444e+02    ZIP_94085
25   2.202550e+02    ZIP_94086
26   2.609736e+02    ZIP_94087
27   1.806021e+02    ZIP_94089
28   1.1

In [36]:
ridge_coef=ridge(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":th_col,"Coefficients":np.transpose(ridge_coef)})
print(coefficients.to_string())

Ridge
R^2: 0.8460431758761436
Root Mean Squared Error: 163.23280234129493
Alpha: 0.01155064850041579
     Coefficients      Feature
0       22.951711         BEDS
1       90.709954        BATHS
2        0.347337  SQUARE FEET
3        2.621867   YEAR BUILT
4        0.098677    HOA/MONTH
5      122.703372    ZIP_94002
6     -146.970489    ZIP_94005
7      211.787923    ZIP_94010
8      -50.084589    ZIP_94014
9     -185.170098    ZIP_94015
10    -127.067480    ZIP_94019
11     800.332094    ZIP_94022
12     461.936847    ZIP_94024
13     681.274265    ZIP_94025
14     469.207459    ZIP_94040
15     512.963756    ZIP_94041
16     324.553164    ZIP_94043
17    -107.061911    ZIP_94044
18     101.418024    ZIP_94061
19       0.000000    ZIP_94062
20     214.430542    ZIP_94063
21     328.771617    ZIP_94065
22     162.905139    ZIP_94070
23    -115.580394    ZIP_94080
24     157.343107    ZIP_94085
25     214.764877    ZIP_94086
26     254.673685    ZIP_94087
27     179.080512    ZIP_94089


In [37]:
lasso_coef=lasso(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":th_col,"Coefficients":np.transpose(lasso_coef)})
print(coefficients.to_string())

Lasso
R^2: 0.845664953990548
Root Mean Squared Error: 163.42760530068205
Alpha: 0.020185086292982747
     Coefficients      Feature
0       16.984955         BEDS
1       90.781542        BATHS
2        0.360664  SQUARE FEET
3        2.494722   YEAR BUILT
4        0.085348    HOA/MONTH
5      221.771185    ZIP_94002
6      -30.773445    ZIP_94005
7      310.829101    ZIP_94010
8       44.040712    ZIP_94014
9      -73.193540    ZIP_94015
10     -14.788339    ZIP_94019
11     902.409008    ZIP_94022
12     557.967632    ZIP_94024
13     784.928938    ZIP_94025
14     570.645760    ZIP_94040
15     607.524425    ZIP_94041
16     431.746586    ZIP_94043
17      -0.000000    ZIP_94044
18     197.987574    ZIP_94061
19       0.000000    ZIP_94062
20     311.927018    ZIP_94063
21     430.026193    ZIP_94065
22     255.502236    ZIP_94070
23      -1.771583    ZIP_94080
24     255.019898    ZIP_94085
25     316.731573    ZIP_94086
26     359.095817    ZIP_94087
27     267.716642    ZIP_94089


3.1.3 Condo

In [38]:
X_train, X_test, y_train, y_test = split(df_condo_X, df_condo_y, 0.2, 42)
linear_coef = linear_regression(X_train, X_test, y_train, y_test)
coefficients = pd.DataFrame({"Feature":condo_col,"Coefficients":np.transpose(linear_coef)})
print(coefficients.to_string())

Simple Linear Regression
R^2: 0.8499255076777208
Root Mean Squared Error: 143.74497762364484
     Coefficients      Feature
0    1.620376e+01         BEDS
1    6.805300e+01        BATHS
2    4.324132e-01  SQUARE FEET
3    1.306473e+00   YEAR BUILT
4    2.688309e-02    HOA/MONTH
5    1.106271e+02    ZIP_94002
6   -1.248945e+02    ZIP_94005
7    1.696394e+02    ZIP_94010
8   -5.338251e+01    ZIP_94014
9   -5.842531e+01    ZIP_94015
10  -4.927922e+01    ZIP_94019
11   6.247019e+02    ZIP_94022
12   3.937690e+02    ZIP_94024
13   3.642356e+02    ZIP_94025
14   1.951158e+02    ZIP_94030
15   1.929418e+02    ZIP_94040
16   4.818867e+02    ZIP_94041
17   1.795322e+02    ZIP_94043
18  -8.972986e+01    ZIP_94044
19   2.307646e+01    ZIP_94061
20  -1.169228e+02    ZIP_94062
21   2.225284e+02    ZIP_94063
22   2.478651e+02    ZIP_94065
23   7.036329e+00    ZIP_94066
24   1.469025e+02    ZIP_94070
25  -4.275478e+01    ZIP_94080
26   9.293712e+01    ZIP_94085
27   8.533854e+01    ZIP_94086
28   1.7

In [39]:
ridge_coef=ridge(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":condo_col,"Coefficients":np.transpose(ridge_coef)})
print(coefficients.to_string())

Ridge
R^2: 0.8496215978131221
Root Mean Squared Error: 143.7823414571578
Alpha: 0.020185086292982747
     Coefficients      Feature
0       20.214531         BEDS
1       72.710304        BATHS
2        0.411159  SQUARE FEET
3        1.252959   YEAR BUILT
4        0.034129    HOA/MONTH
5       79.117127    ZIP_94002
6     -149.443255    ZIP_94005
7      139.572661    ZIP_94010
8      -82.478824    ZIP_94014
9      -88.860834    ZIP_94015
10     -80.377086    ZIP_94019
11     586.287830    ZIP_94022
12     357.039084    ZIP_94024
13     328.475822    ZIP_94025
14     163.843086    ZIP_94030
15     159.355123    ZIP_94040
16     445.583059    ZIP_94041
17     145.035081    ZIP_94043
18    -115.965075    ZIP_94044
19      -7.649827    ZIP_94061
20    -127.326280    ZIP_94062
21     193.771883    ZIP_94063
22     213.809976    ZIP_94065
23     -25.275797    ZIP_94066
24     114.495747    ZIP_94070
25     -72.601045    ZIP_94080
26      60.890471    ZIP_94085
27      52.668944    ZIP_94086


In [40]:
lasso_coef=lasso(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":condo_col,"Coefficients":np.transpose(lasso_coef)})
print(coefficients.to_string())

Lasso
R^2: 0.848567261494281
Root Mean Squared Error: 144.02890723514878
Alpha: 0.020185086292982747
     Coefficients      Feature
0        9.832738         BEDS
1       68.601663        BATHS
2        0.440653  SQUARE FEET
3        1.009166   YEAR BUILT
4        0.029694    HOA/MONTH
5      115.982531    ZIP_94002
6      -83.385406    ZIP_94005
7      187.536292    ZIP_94010
8      -14.336059    ZIP_94014
9      -26.947858    ZIP_94015
10     -11.041793    ZIP_94019
11     641.020584    ZIP_94022
12     384.428393    ZIP_94024
13     375.478939    ZIP_94025
14     213.237431    ZIP_94030
15     208.033363    ZIP_94040
16     487.562069    ZIP_94041
17     196.725425    ZIP_94043
18     -46.642689    ZIP_94044
19      34.786529    ZIP_94061
20     -79.238312    ZIP_94062
21     235.323727    ZIP_94063
22     258.747572    ZIP_94065
23      18.220107    ZIP_94066
24     163.797814    ZIP_94070
25      -8.611090    ZIP_94080
26     110.790093    ZIP_94085
27     102.824261    ZIP_94086


##  3.2 Use different location info but keep three property types together

3.2.1 Using county as the only location info

In [41]:
X_train, X_test, y_train, y_test = split(df_county_X, df_county_y, 0.2, 42)
linear_coef = linear_regression(X_train, X_test, y_train, y_test)
coefficients = pd.DataFrame({"Feature":county_col,"Coefficients":np.transpose(linear_coef)})
print(coefficients.to_string())

Simple Linear Regression
R^2: 0.5849331210933051
Root Mean Squared Error: 768.6122851666054
    Coefficients               Feature
0  -1.757964e+02                  BEDS
1   7.137616e+01                 BATHS
2   9.770633e+02           SQUARE FEET
3   3.857218e+01              LOT SIZE
4  -1.779170e+02            YEAR BUILT
5  -2.313948e+01             HOA/MONTH
6   5.219218e+12                   LOT
7  -1.102825e+02                   HOA
8  -2.643202e+15   PROPERTY TYPE_Condo
9  -3.278285e+15     PROPERTY TYPE_SFH
10 -2.195970e+15      PROPERTY TYPE_TH
11 -1.075667e+15        COUNTY_ALAMEDA
12 -3.311006e+14  COUNTY_SAN FRANCISCO
13 -1.068087e+15      COUNTY_SAN MATEO
14 -1.065757e+15    COUNTY_SANTA CLARA


In [42]:
ridge_coef=ridge(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":county_col,"Coefficients":np.transpose(ridge_coef)})
print(coefficients.to_string())

Ridge
R^2: 0.5851022030546079
Root Mean Squared Error: 768.3835109861017
Alpha: 0.005
    Coefficients               Feature
0    -173.596116                  BEDS
1      96.638714                 BATHS
2     963.257711           SQUARE FEET
3      39.128928              LOT SIZE
4    -177.410357            YEAR BUILT
5     -20.993383             HOA/MONTH
6       0.000000                   LOT
7    -108.282272                   HOA
8      31.573817   PROPERTY TYPE_Condo
9     -40.265234     PROPERTY TYPE_SFH
10     22.112727      PROPERTY TYPE_TH
11   -192.958054        COUNTY_ALAMEDA
12      7.940489  COUNTY_SAN FRANCISCO
13     88.866926      COUNTY_SAN MATEO
14    102.427076    COUNTY_SANTA CLARA


In [43]:
lasso_coef=lasso(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":county_col,"Coefficients":np.transpose(lasso_coef)})
print(coefficients.to_string())

Lasso
R^2: 0.5850483288352191
Root Mean Squared Error: 768.3360457241216
Alpha: 0.015269277544167062
    Coefficients               Feature
0    -170.019510                  BEDS
1      79.443534                 BATHS
2     975.316525           SQUARE FEET
3      36.867942              LOT SIZE
4    -174.243780            YEAR BUILT
5     -10.746063             HOA/MONTH
6       0.000000                   LOT
7    -103.326630                   HOA
8       1.318424   PROPERTY TYPE_Condo
9     -62.795382     PROPERTY TYPE_SFH
10      0.000000      PROPERTY TYPE_TH
11   -281.595592        COUNTY_ALAMEDA
12    -16.529802  COUNTY_SAN FRANCISCO
13      0.000000      COUNTY_SAN MATEO
14     12.139720    COUNTY_SANTA CLARA


3.2.2 Using city as the only location info

In [44]:
X_train, X_test, y_train, y_test = split(df_city_X, df_city_y, 0.2, 42)
linear_coef = linear_regression(X_train, X_test, y_train, y_test)
coefficients = pd.DataFrame({"Feature":city_col,"Coefficients":np.transpose(linear_coef)})
print(coefficients.to_string())

Simple Linear Regression
R^2: 0.7190211519265912
Root Mean Squared Error: 642.7695276667723
    Coefficients                   Feature
0  -9.676342e+01                      BEDS
1   2.020482e+01                     BATHS
2   7.615612e+02               SQUARE FEET
3   4.532207e+01                  LOT SIZE
4  -6.795604e+01                YEAR BUILT
5  -5.775225e+01                 HOA/MONTH
6  -4.085878e+13                       LOT
7  -3.120107e+01                       HOA
8  -1.407309e+15       PROPERTY TYPE_Condo
9  -1.745443e+15         PROPERTY TYPE_SFH
10 -1.169191e+15          PROPERTY TYPE_TH
11 -4.562557e+13              CITY_ALAMEDA
12 -2.659628e+13               CITY_ALBANY
13 -2.239736e+13             CITY_ATHERTON
14 -3.509088e+13              CITY_BELMONT
15 -4.372389e+13             CITY_BERKELEY
16 -1.971104e+13             CITY_BRISBANE
17 -2.230073e+12            CITY_BROADMOOR
18 -3.347204e+13           CITY_BURLINGAME
19 -3.769916e+13             CITY_CAMPBELL
20 -1

In [45]:
ridge_coef=ridge(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":city_col,"Coefficients":np.transpose(ridge_coef)})
print(coefficients.to_string())

Ridge
R^2: 0.7190604099922211
Root Mean Squared Error: 642.548667124333
Alpha: 0.005
    Coefficients                   Feature
0     -99.944605                      BEDS
1      39.605322                     BATHS
2     748.415376               SQUARE FEET
3      46.243613                  LOT SIZE
4     -67.550961                YEAR BUILT
5     -55.882533                 HOA/MONTH
6       0.000000                       LOT
7     -31.011278                       HOA
8      -6.471388       PROPERTY TYPE_Condo
9       0.616498         PROPERTY TYPE_SFH
10      6.706848          PROPERTY TYPE_TH
11    -56.657155              CITY_ALAMEDA
12    -14.707446               CITY_ALBANY
13    299.302479             CITY_ATHERTON
14     23.580602              CITY_BELMONT
15    -43.193133             CITY_BERKELEY
16    -19.274773             CITY_BRISBANE
17     -3.764049            CITY_BROADMOOR
18     52.768046           CITY_BURLINGAME
19     -0.460344             CITY_CAMPBELL
20    -23.08

In [46]:
lasso_coef=lasso(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":city_col,"Coefficients":np.transpose(lasso_coef)})
print(coefficients.to_string())

Lasso
R^2: 0.7190024742201006
Root Mean Squared Error: 642.4307932826421
Alpha: 0.008737642000038414
    Coefficients                   Feature
0     -98.281092                      BEDS
1      27.582496                     BATHS
2     762.652135               SQUARE FEET
3      44.451523                  LOT SIZE
4     -68.507639                YEAR BUILT
5     -53.546124                 HOA/MONTH
6       0.000000                       LOT
7     -30.961668                       HOA
8      -4.355346       PROPERTY TYPE_Condo
9       0.000000         PROPERTY TYPE_SFH
10      5.813026          PROPERTY TYPE_TH
11    -31.590207              CITY_ALAMEDA
12     -0.000000               CITY_ALBANY
13    309.878102             CITY_ATHERTON
14     41.042944              CITY_BELMONT
15    -19.124983             CITY_BERKELEY
16     -7.480894             CITY_BRISBANE
17     -1.391086            CITY_BROADMOOR
18     69.213580           CITY_BURLINGAME
19     18.209681             CITY_CAMPB

3.2.3 Using zip as the only location info

In [47]:
X_train, X_test, y_train, y_test = split(df_zip_X, df_zip_y, 0.2, 42)
linear_coef = linear_regression(X_train, X_test, y_train, y_test)
coefficients = pd.DataFrame({"Feature":zip_col,"Coefficients":np.transpose(linear_coef)})
print(coefficients.to_string())

Simple Linear Regression
R^2: 0.7147095123373166
Root Mean Squared Error: 318280779212354.1
     Coefficients              Feature
0   -1.042169e+02                 BEDS
1   -1.814021e+00                BATHS
2    8.015086e+02          SQUARE FEET
3    5.487754e+01             LOT SIZE
4   -5.911551e+01           YEAR BUILT
5   -6.647507e+01            HOA/MONTH
6   -1.114012e+14                  LOT
7   -2.835888e+01                  HOA
8   -1.931480e+15  PROPERTY TYPE_Condo
9   -2.395557e+15    PROPERTY TYPE_SFH
10  -1.604672e+15     PROPERTY TYPE_TH
11   2.439508e+14            ZIP_94002
12   1.370306e+14            ZIP_94005
13   2.973050e+14            ZIP_94010
14   1.942290e+14            ZIP_94014
15   2.432337e+14            ZIP_94015
16   9.854235e+13            ZIP_94018
17   2.539925e+14            ZIP_94019
18   8.693045e+13            ZIP_94020
19   2.685102e+13            ZIP_94021
20   2.425143e+14            ZIP_94022
21   2.042712e+14            ZIP_94024
22   3.0434

In [48]:
ridge_coef=ridge(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":zip_col,"Coefficients":np.transpose(ridge_coef)})
print(coefficients.to_string())

Ridge
R^2: 0.714785983235428
Root Mean Squared Error: 94969.29942411854
Alpha: 0.005
     Coefficients              Feature
0     -112.446912                 BEDS
1       24.609894                BATHS
2      786.956097          SQUARE FEET
3       54.822379             LOT SIZE
4      -58.744433           YEAR BUILT
5      -65.685207            HOA/MONTH
6        0.000000                  LOT
7      -32.216921                  HOA
8       -4.626358  PROPERTY TYPE_Condo
9       -0.617695    PROPERTY TYPE_SFH
10       6.352478     PROPERTY TYPE_TH
11      23.782947            ZIP_94002
12     -18.846113            ZIP_94005
13     100.955246            ZIP_94010
14     -27.161810            ZIP_94014
15     -23.339921            ZIP_94015
16     -20.233665            ZIP_94018
17     -50.669087            ZIP_94019
18     -14.657490            ZIP_94020
19      -7.053761            ZIP_94021
20     152.389985            ZIP_94022
21     110.908090            ZIP_94024
22     123.719046 

In [49]:
lasso_coef=lasso(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":zip_col,"Coefficients":np.transpose(lasso_coef)})
print(coefficients.to_string())

Lasso
R^2: 0.7146164830124967
Root Mean Squared Error: 644.7071412717312
Alpha: 0.008737642000038414
     Coefficients              Feature
0     -110.780028                 BEDS
1       11.353506                BATHS
2      801.314259          SQUARE FEET
3       52.934638             LOT SIZE
4      -58.830850           YEAR BUILT
5      -63.201966            HOA/MONTH
6        0.000000                  LOT
7      -32.901438                  HOA
8       -0.134651  PROPERTY TYPE_Condo
9        0.000000    PROPERTY TYPE_SFH
10       6.565889     PROPERTY TYPE_TH
11      42.724374            ZIP_94002
12      -6.267621            ZIP_94005
13     123.917801            ZIP_94010
14      -9.813043            ZIP_94014
15      -1.654568            ZIP_94015
16     -10.707746            ZIP_94018
17     -28.662591            ZIP_94019
18      -6.017762            ZIP_94020
19      -3.515715            ZIP_94021
20     171.064914            ZIP_94022
21     126.581911            ZIP_94024
22

# 3.3 Keep all location info

In [51]:
X_train, X_test, y_train, y_test = split(df_X, df_y, 0.2, 42)
linear_coef = linear_regression(X_train, X_test, y_train, y_test)
coefficients = pd.DataFrame({"Feature":df_col,"Coefficients":np.transpose(linear_coef)})
print(coefficients.to_string())

Simple Linear Regression
R^2: 0.729749579356009
Root Mean Squared Error: 3505353926924246.0
     Coefficients                   Feature
0   -9.164715e+01                      BEDS
1    3.966209e+00                     BATHS
2    7.596230e+02               SQUARE FEET
3    4.876433e+01                  LOT SIZE
4   -5.183713e+01                YEAR BUILT
5   -6.801736e+01                 HOA/MONTH
6   -5.337201e+14                       LOT
7   -3.340126e+01                       HOA
8   -8.687974e+14       PROPERTY TYPE_Condo
9   -1.077543e+15         PROPERTY TYPE_SFH
10  -7.217960e+14          PROPERTY TYPE_TH
11   2.975133e+15              CITY_ALAMEDA
12  -6.936487e+14               CITY_ALBANY
13   3.407825e+14             CITY_ATHERTON
14   1.906112e+15              CITY_BELMONT
15  -1.140348e+15             CITY_BERKELEY
16  -1.809380e+15             CITY_BRISBANE
17   7.833172e+13            CITY_BROADMOOR
18  -2.099246e+15           CITY_BURLINGAME
19  -2.227255e+15           

In [52]:
ridge_coef=ridge(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":df_col,"Coefficients":np.transpose(ridge_coef)})
print(coefficients.to_string())

Ridge
R^2: 0.7304875652379814
Root Mean Squared Error: 102539.55302553985
Alpha: 0.005
     Coefficients                   Feature
0     -101.056738                      BEDS
1       22.244475                     BATHS
2      748.487517               SQUARE FEET
3       48.931907                  LOT SIZE
4      -51.285160                YEAR BUILT
5      -66.435573                 HOA/MONTH
6        0.000000                       LOT
7      -27.196459                       HOA
8       -7.330644       PROPERTY TYPE_Condo
9        2.814905         PROPERTY TYPE_SFH
10       4.482230          PROPERTY TYPE_TH
11     -13.403181              CITY_ALAMEDA
12       6.691355               CITY_ALBANY
13     220.563868             CITY_ATHERTON
14       6.477242              CITY_BELMONT
15     -15.835281             CITY_BERKELEY
16     -12.680289             CITY_BRISBANE
17      -3.716877            CITY_BROADMOOR
18       7.279785           CITY_BURLINGAME
19      -6.546735             CIT

In [53]:
lasso_coef=lasso(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":df_col,"Coefficients":np.transpose(lasso_coef)})
print(coefficients.to_string())

Lasso
R^2: 0.7301107813287581
Root Mean Squared Error: 633.8348203768114
Alpha: 0.01155064850041579
     Coefficients                   Feature
0      -97.611482                      BEDS
1       11.957595                     BATHS
2      762.691686               SQUARE FEET
3       46.417516                  LOT SIZE
4      -54.701831                YEAR BUILT
5      -62.118518                 HOA/MONTH
6        0.000000                       LOT
7      -26.358544                       HOA
8       -7.806785       PROPERTY TYPE_Condo
9        0.000000         PROPERTY TYPE_SFH
10       1.294893          PROPERTY TYPE_TH
11      -0.000000              CITY_ALAMEDA
12      18.132876               CITY_ALBANY
13     297.254187             CITY_ATHERTON
14       7.151117              CITY_BELMONT
15       3.664621             CITY_BERKELEY
16     -18.865349             CITY_BRISBANE
17      -2.639909            CITY_BROADMOOR
18       0.000000           CITY_BURLINGAME
19      -1.554021   

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32309 entries, 115 to 32415
Data columns (total 13 columns):
PROPERTY TYPE    32309 non-null object
CITY             32309 non-null object
ZIP              32309 non-null object
PRICE IN K       32309 non-null float64
BEDS             32309 non-null float64
BATHS            32309 non-null float64
SQUARE FEET      32309 non-null float64
LOT SIZE         32309 non-null float64
YEAR BUILT       32309 non-null float64
HOA/MONTH        32309 non-null float64
COUNTY           32309 non-null object
LOT              32309 non-null int64
HOA              32309 non-null int64
dtypes: float64(7), int64(2), object(4)
memory usage: 4.7+ MB
