In [357]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import Imputer, scale
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, accuracy_score
import statsmodels.api as sm

# 1. Prepare regression functions

In [358]:
def split(var, res, t_size, r_state):
    X_train, X_test, y_train, y_test = train_test_split(var, res, test_size=t_size, random_state=r_state)
    return X_train, X_test, y_train, y_test

In [359]:
def linear_regression(X_train, X_test, y_train, y_test):
    reg = LinearRegression()
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    print('Simple Linear Regression')
    print('R^2: {}'.format(reg.score(X_train, y_train)))
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print('Root Mean Squared Error: {}'.format(rmse))
    return reg.coef_

In [360]:
def ridge(X_train, X_test, y_train, y_test, alpha):
    regr_cv = RidgeCV(alphas=alpha, normalize=True)
    ridge = regr_cv.fit(X_train, y_train)
    ridge_pred=ridge.predict(X_test)
    print('Ridge')
    print('R^2: {}'.format(ridge.score(X_train, y_train)))
    rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
    print('Root Mean Squared Error: {}'.format(rmse))
    print('Alpha: {}'.format(ridge.alpha_))
    return ridge.coef_

In [361]:
def lasso(X_train, X_test, y_train, y_test, alpha):
    regr_cv = LassoCV(alphas=alpha, normalize=True)
    lasso = regr_cv.fit(X_train, y_train)
    lasso_pred=lasso.predict(X_test)
    print('Lasso')
    print('R^2: {}'.format(lasso.score(X_train, y_train)))
    rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
    print('Root Mean Squared Error: {}'.format(rmse))
    print('Alpha: {}'.format(lasso.alpha_))
    return lasso.coef_

# 2. Preprocessing the data

In [362]:
filename = 'Preprocessing Data/cleaned_data.csv'
df = pd.read_csv(filename, index_col=0)
df['ZIP'] = df['ZIP'].astype(str)

## 2.1 Seperate property type

In [363]:
df_sfh = df.loc[df['PROPERTY TYPE'] == 'SFH']
df_th = df.loc[df['PROPERTY TYPE'] == 'TH']
df_condo = df.loc[df['PROPERTY TYPE'] == 'Condo']

df_sfh = df_sfh.drop(['PROPERTY TYPE'], axis=1)
df_th = df_th.drop(['PROPERTY TYPE'], axis=1)
df_condo = df_condo.drop(['PROPERTY TYPE'], axis=1)

In [399]:
df_condo.loc[df_condo.ZIP=='95148']

Unnamed: 0,ZIP,PRICE IN K,BEDS,BATHS,SQUARE FEET,YEAR BUILT,HOA/MONTH
1397,95148,372.0,2.0,1.0,841.0,1979.0,225.0


In [364]:
#delete some columns are not available for the specific property type
df_sfh = df_sfh.drop(['HOA/MONTH', 'COUNTY', 'CITY', 'LOT', 'HOA'], axis=1)
df_th = df_th.drop(['LOT SIZE', 'COUNTY', 'CITY', 'LOT', 'HOA'], axis=1)
df_condo = df_condo.drop(['LOT SIZE', 'COUNTY', 'CITY', 'LOT', 'HOA'], axis=1)

In [365]:
#encoding dummy variables
df_sfh_dummies = pd.get_dummies(df_sfh)
df_th_dummies = pd.get_dummies(df_th)
df_condo_dummies = pd.get_dummies(df_condo)
df_sfh.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26790 entries, 0 to 32413
Data columns (total 7 columns):
ZIP            26790 non-null object
PRICE IN K     26790 non-null float64
BEDS           26790 non-null float64
BATHS          26790 non-null float64
SQUARE FEET    26790 non-null float64
LOT SIZE       26790 non-null float64
YEAR BUILT     26790 non-null float64
dtypes: float64(6), object(1)
memory usage: 1.6+ MB


In [366]:
#split dataset into feature variables and results
df_sfh_y = df_sfh_dummies['PRICE IN K'].values.ravel() 
df_sfh_X = df_sfh_dummies.drop(['PRICE IN K'], axis=1)
sfh_col = df_sfh_X.columns

df_th_y = df_th_dummies['PRICE IN K'].values.ravel()
df_th_X = df_th_dummies.drop(['PRICE IN K'], axis=1)
th_col = df_th_X.columns

df_condo_y = df_condo_dummies['PRICE IN K'].values.ravel()
df_condo_X = df_condo_dummies.drop(['PRICE IN K'], axis=1)
condo_col = df_condo_X.columns

In [367]:
# scaling the data
df_sfh_X = scale(df_sfh_X)
df_th_X = scale(df_th_X)
df_condo_X = scale(df_condo_X)

## 2.2 Use different location info but keep three property types together

2.2.1 Using county as the only location info

In [368]:
df_county = df.drop(['CITY', 'ZIP'], axis=1)
df_county_dummies = pd.get_dummies(df_county)
#split dataset into feature variables and results
df_county_y = df_county_dummies['PRICE IN K']
df_county_X = df_county_dummies.drop(['PRICE IN K'], axis=1)
county_col = df_county_X.columns
# scaling the data
df_county_X = scale(df_county_X)

2.2.2 Using city as the only location info

In [369]:
df_city = df.drop(['COUNTY', 'ZIP'], axis=1)
df_city_dummies = pd.get_dummies(df_city)
#split dataset into feature variables and results
df_city_y = df_city_dummies['PRICE IN K']
df_city_X = df_city_dummies.drop(['PRICE IN K'], axis=1)
city_col = df_city_X.columns
# scaling the data
df_city_X = scale(df_city_X)

2.2.3 Using zip as the only location info

In [370]:
df_zip = df.drop(['COUNTY', 'CITY'], axis=1)
df_zip_dummies = pd.get_dummies(df_zip)
#split dataset into feature variables and results
df_zip_y = df_zip_dummies['PRICE IN K']
df_zip_X = df_zip_dummies.drop(['PRICE IN K'], axis=1)
zip_col = df_zip_X.columns
# scaling the data
df_zip_X = scale(df_zip_X)

## 2.3 Keep all location info

In [371]:
df_dummies = pd.get_dummies(df)
df_y = df_dummies['PRICE IN K']
df_X = df_dummies.drop(['PRICE IN K'], axis=1)
df_col = df_X.columns
df_X = scale(df_X)

# 3. Regression

In [372]:
alpha = 10**np.linspace(10,-2,100)*0.5

## 3.1 Based on different property type, no city or county, only zip is included for location

3.1.1 Single family house

In [373]:
X_train, X_test, y_train, y_test = split(df_sfh_X, df_sfh_y, 0.2, 42)
linear_coef = linear_regression(X_train, X_test, y_train, y_test)
coefficients = pd.DataFrame({"Feature":sfh_col,"Coefficients":np.transpose(linear_coef)})
print(coefficients.to_string())

Simple Linear Regression
R^2: 0.7107405373123307
Root Mean Squared Error: 635.1811327822406
     Coefficients      Feature
0   -1.233617e+02         BEDS
1   -1.784841e+00        BATHS
2    8.719384e+02  SQUARE FEET
3    6.223245e+01     LOT SIZE
4   -9.086694e+01   YEAR BUILT
5    9.428849e+13    ZIP_94002
6    4.800715e+13    ZIP_94005
7    1.110637e+14    ZIP_94010
8    7.332316e+13    ZIP_94014
9    9.048697e+13    ZIP_94015
10   3.930803e+13    ZIP_94018
11   9.603295e+13    ZIP_94019
12   3.467810e+13    ZIP_94020
13   1.071330e+13    ZIP_94021
14   8.767865e+13    ZIP_94022
15   8.037470e+13    ZIP_94024
16   1.130100e+14    ZIP_94025
17   6.268983e+13    ZIP_94027
18   6.605229e+13    ZIP_94028
19   8.002594e+13    ZIP_94030
20   4.557474e+13    ZIP_94037
21   4.002709e+13    ZIP_94038
22   4.557474e+13    ZIP_94040
23   3.356167e+13    ZIP_94041
24   4.255854e+13    ZIP_94043
25   9.923806e+13    ZIP_94044
26   2.354291e+13    ZIP_94060
27   9.130138e+13    ZIP_94061
28   9.84

In [374]:
ridge_coef=ridge(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":sfh_col,"Coefficients":np.transpose(ridge_coef)})
print(coefficients.to_string())

Ridge
R^2: 0.7107016083096023
Root Mean Squared Error: 634.4700482990442
Alpha: 0.005
     Coefficients      Feature
0     -119.932414         BEDS
1        9.080075        BATHS
2      856.247734  SQUARE FEET
3       62.661550     LOT SIZE
4      -89.922958   YEAR BUILT
5       29.535828    ZIP_94002
6      -21.025422    ZIP_94005
7      119.475792    ZIP_94010
8      -31.698351    ZIP_94014
9      -23.422644    ZIP_94015
10     -21.640423    ZIP_94018
11     -52.865100    ZIP_94019
12     -25.410860    ZIP_94020
13      -7.300918    ZIP_94021
14     167.073072    ZIP_94022
15     121.353910    ZIP_94024
16     134.106685    ZIP_94025
17     316.646199    ZIP_94027
18      84.914109    ZIP_94028
19      14.606943    ZIP_94030
20     -17.909046    ZIP_94037
21     -14.229805    ZIP_94038
22      54.119043    ZIP_94040
23      24.850119    ZIP_94041
24      36.075377    ZIP_94043
25     -26.640033    ZIP_94044
26      -7.004421    ZIP_94060
27      31.050077    ZIP_94061
28     117.6466

In [375]:
def coef_df(coefficients):
    other_coef = coefficients[:5]
    other_coef = other_coef.sort_values(by=['Coefficients'], ascending = False)
    zip_coef = coefficients[5:]
    zip_coef = zip_coef.sort_values(by=['Coefficients'], ascending = False)
    coef = pd.concat([other_coef, zip_coef])
    return coef

In [376]:
coef_df(coefficients)
coef_df(coefficients).to_csv('Coef results/sfh_ridge_coef.csv')

In [377]:
lasso_coef=lasso(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":sfh_col,"Coefficients":np.transpose(lasso_coef)})
print(coefficients.to_string())

Lasso
R^2: 0.7106659003436058
Root Mean Squared Error: 634.8508099403866
Alpha: 0.006609705742330144
     Coefficients      Feature
0     -121.319300         BEDS
1        0.000000        BATHS
2      869.364880  SQUARE FEET
3       61.176652     LOT SIZE
4      -90.509318   YEAR BUILT
5       53.442352    ZIP_94002
6       -7.573918    ZIP_94005
7      147.313398    ZIP_94010
8      -11.610079    ZIP_94014
9        0.000000    ZIP_94015
10     -10.422977    ZIP_94018
11     -27.220202    ZIP_94019
12     -15.502542    ZIP_94020
13      -3.591074    ZIP_94021
14     189.273834    ZIP_94022
15     141.609337    ZIP_94024
16     163.352991    ZIP_94025
17     332.136469    ZIP_94027
18     101.153558    ZIP_94028
19      34.594917    ZIP_94030
20      -5.094361    ZIP_94037
21      -2.895654    ZIP_94038
22      65.329562    ZIP_94040
23      32.734106    ZIP_94041
24      46.541700    ZIP_94043
25      -0.000000    ZIP_94044
26      -0.000000    ZIP_94060
27      54.314672    ZIP_94061


3.1.2 Townhouse

In [378]:
X_train, X_test, y_train, y_test = split(df_th_X, df_th_y, 0.2, 42)
linear_coef = linear_regression(X_train, X_test, y_train, y_test)
coefficients = pd.DataFrame({"Feature":th_col,"Coefficients":np.transpose(linear_coef)})
print(coefficients.to_string())

Simple Linear Regression
R^2: 0.8460596238597562
Root Mean Squared Error: 571077151924571.6
     Coefficients      Feature
0    1.330961e+01         BEDS
1    5.425633e+01        BATHS
2    1.341800e+02  SQUARE FEET
3    3.583562e+01   YEAR BUILT
4    8.911092e+00    HOA/MONTH
5    3.349869e+14    ZIP_94002
6    2.094262e+14    ZIP_94005
7    3.615787e+14    ZIP_94010
8    2.238351e+14    ZIP_94014
9    3.862781e+14    ZIP_94015
10   4.241258e+14    ZIP_94019
11   2.957010e+14    ZIP_94022
12   1.770780e+14    ZIP_94024
13   6.481681e+14    ZIP_94025
14   3.615787e+14    ZIP_94040
15   1.939351e+14    ZIP_94041
16   7.207349e+14    ZIP_94043
17   5.805831e+14    ZIP_94044
18   4.094277e+14    ZIP_94061
19   3.111060e+14    ZIP_94062
20   2.094262e+14    ZIP_94063
21   3.700028e+14    ZIP_94065
22   3.782318e+14    ZIP_94070
23   3.862781e+14    ZIP_94080
24   2.622905e+14    ZIP_94085
25   4.168449e+14    ZIP_94086
26   5.147863e+14    ZIP_94087
27   1.584194e+14    ZIP_94089
28   1.86

In [379]:
ridge_coef=ridge(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":th_col,"Coefficients":np.transpose(ridge_coef)})
print(coefficients.to_string())

Ridge
R^2: 0.8460431758761433
Root Mean Squared Error: 141001.67967930826
Alpha: 0.01155064850041579
     Coefficients      Feature
0       14.600450         BEDS
1       54.425817        BATHS
2      131.734608  SQUARE FEET
3       36.334099   YEAR BUILT
4       10.658488    HOA/MONTH
5       11.045973    ZIP_94002
6       -8.271434    ZIP_94005
7       20.578974    ZIP_94010
8       -3.012672    ZIP_94014
9      -19.221647    ZIP_94015
10     -14.482672    ZIP_94019
11      63.597879    ZIP_94022
12      21.981987    ZIP_94024
13     118.666802    ZIP_94025
14      45.591873    ZIP_94040
15      26.733932    ZIP_94041
16      62.860959    ZIP_94043
17     -16.703942    ZIP_94044
18      11.158658    ZIP_94061
19  -44599.931977    ZIP_94062
20      12.068056    ZIP_94063
21      32.690307    ZIP_94065
22      16.558174    ZIP_94070
23     -11.997863    ZIP_94080
24      11.090467    ZIP_94085
25      24.057883    ZIP_94086
26      35.231463    ZIP_94087
27       7.623887    ZIP_94089


In [380]:
coef_df(coefficients)
coef_df(coefficients).to_csv('Coef results/th_ridge_coef.csv')

In [381]:
lasso_coef=lasso(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":th_col,"Coefficients":np.transpose(lasso_coef)})
print(coefficients.to_string())

Lasso
R^2: 0.845664953990548
Root Mean Squared Error: 163.42760530068205
Alpha: 0.020185086292982747
     Coefficients      Feature
0       10.804771         BEDS
1       54.468769        BATHS
2      136.789477  SQUARE FEET
3       34.572114   YEAR BUILT
4        9.218730    HOA/MONTH
5       19.964231    ZIP_94002
6       -1.731916    ZIP_94005
7       30.202591    ZIP_94010
8        2.649123    ZIP_94014
9       -7.597881    ZIP_94015
10      -1.685519    ZIP_94019
11      71.709356    ZIP_94022
12      26.551763    ZIP_94024
13     136.721746    ZIP_94025
14      55.448413    ZIP_94040
15      31.662113    ZIP_94041
16      83.622678    ZIP_94043
17      -0.000000    ZIP_94044
18      21.783856    ZIP_94061
19       0.000000    ZIP_94062
20      17.555115    ZIP_94063
21      42.758217    ZIP_94065
22      25.970025    ZIP_94070
23      -0.183900    ZIP_94080
24      17.975302    ZIP_94085
25      35.480155    ZIP_94086
26      49.677182    ZIP_94087
27      11.397339    ZIP_94089


3.1.3 Condo

In [382]:
X_train, X_test, y_train, y_test = split(df_condo_X, df_condo_y, 0.2, 42)
linear_coef = linear_regression(X_train, X_test, y_train, y_test)
coefficients = pd.DataFrame({"Feature":condo_col,"Coefficients":np.transpose(linear_coef)})
print(coefficients.to_string())

Simple Linear Regression
R^2: 0.8496948460291343
Root Mean Squared Error: 3644034738581744.0
     Coefficients      Feature
0    1.036645e+01         BEDS
1    3.884442e+01        BATHS
2    1.446641e+02  SQUARE FEET
3    2.052097e+01   YEAR BUILT
4    6.677334e+00    HOA/MONTH
5   -1.935417e+14    ZIP_94002
6   -3.000973e+14    ZIP_94005
7   -4.665498e+14    ZIP_94010
8   -2.788029e+14    ZIP_94014
9   -3.561724e+14    ZIP_94015
10  -2.842811e+14    ZIP_94019
11  -4.792161e+14    ZIP_94022
12  -1.118766e+14    ZIP_94024
13  -3.340035e+14    ZIP_94025
14  -3.604347e+14    ZIP_94030
15  -5.921934e+14    ZIP_94040
16  -1.581218e+14    ZIP_94041
17  -6.346895e+14    ZIP_94043
18  -2.014141e+14    ZIP_94044
19  -2.788029e+14    ZIP_94061
20  -1.369789e+14    ZIP_94062
21  -1.479318e+14    ZIP_94063
22  -2.233473e+14    ZIP_94065
23  -4.331141e+14    ZIP_94066
24  -4.567979e+14    ZIP_94070
25  -4.665498e+14    ZIP_94080
26  -3.688039e+14    ZIP_94085
27  -4.116412e+14    ZIP_94086
28  -2.7

In [383]:
ridge_coef=ridge(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":condo_col,"Coefficients":np.transpose(ridge_coef)})
print(coefficients.to_string())

Ridge
R^2: 0.8496215978131213
Root Mean Squared Error: 760143.627684709
Alpha: 0.020185086292982747
      Coefficients      Feature
0        13.832355         BEDS
1        40.741630        BATHS
2       137.109573  SQUARE FEET
3        19.527121   YEAR BUILT
4         8.555912    HOA/MONTH
5         4.752227    ZIP_94002
6       -13.918438    ZIP_94005
7        20.209264    ZIP_94010
8        -7.136609    ZIP_94014
9        -9.822517    ZIP_94015
10       -7.091406    ZIP_94019
11       87.195582    ZIP_94022
12       12.396733    ZIP_94024
13       34.049178    ZIP_94025
14       18.327655    ZIP_94030
15       29.287463    ZIP_94040
16       21.866163    ZIP_94041
17       28.568439    ZIP_94043
18       -7.248851    ZIP_94044
19       -0.661913    ZIP_94061
20       -5.412824    ZIP_94062
21        8.896197    ZIP_94063
22       14.820429    ZIP_94065
23       -3.397499    ZIP_94066
24       16.231758    ZIP_94070
25      -10.512185    ZIP_94080
26        6.969426    ZIP_94085
27  

In [384]:
coef_df(coefficients)
coef_df(coefficients).to_csv('Coef results/condo_ridge_coef.csv')

In [385]:
lasso_coef=lasso(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":condo_col,"Coefficients":np.transpose(lasso_coef)})
print(coefficients.to_string())

Lasso
R^2: 0.8485672614942809
Root Mean Squared Error: 144.0289072351488
Alpha: 0.020185086292982747
     Coefficients      Feature
0        6.728324         BEDS
1       38.439442        BATHS
2      146.945118  SQUARE FEET
3       15.727651   YEAR BUILT
4        7.443958    HOA/MONTH
5        6.966574    ZIP_94002
6       -7.766122    ZIP_94005
7       27.154104    ZIP_94010
8       -1.240450    ZIP_94014
9       -2.978768    ZIP_94015
10      -0.974181    ZIP_94019
11      95.335704    ZIP_94022
12      13.347716    ZIP_94024
13      38.921431    ZIP_94025
14      23.852957    ZIP_94030
15      38.233910    ZIP_94040
16      23.926205    ZIP_94041
17      38.750200    ZIP_94043
18      -2.915584    ZIP_94044
19       3.009959    ZIP_94061
20      -3.368535    ZIP_94062
21      10.803870    ZIP_94063
22      17.935319    ZIP_94065
23       2.449094    ZIP_94066
24      23.221181    ZIP_94070
25      -1.246833    ZIP_94080
26      12.680858    ZIP_94085
27      13.136105    ZIP_94086


##  3.2 Use different location info but keep three property types together

3.2.1 Using county as the only location info

In [386]:
X_train, X_test, y_train, y_test = split(df_county_X, df_county_y, 0.2, 42)
linear_coef = linear_regression(X_train, X_test, y_train, y_test)
coefficients = pd.DataFrame({"Feature":county_col,"Coefficients":np.transpose(linear_coef)})
print(coefficients.to_string())

Simple Linear Regression
R^2: 0.5851520349151621
Root Mean Squared Error: 765.0968022095226
    Coefficients               Feature
0  -1.765056e+02                  BEDS
1   1.060228e+02                 BATHS
2   9.600034e+02           SQUARE FEET
3   4.802615e+01              LOT SIZE
4  -1.851334e+02            YEAR BUILT
5  -2.059250e+01             HOA/MONTH
6   3.910678e+10                   LOT
7  -1.112986e+02                   HOA
8  -6.128437e+13   PROPERTY TYPE_Condo
9  -7.600896e+13     PROPERTY TYPE_SFH
10 -5.091506e+13      PROPERTY TYPE_TH
11  1.098049e+14        COUNTY_ALAMEDA
12  3.380108e+13  COUNTY_SAN FRANCISCO
13  1.090363e+14      COUNTY_SAN MATEO
14  1.087984e+14    COUNTY_SANTA CLARA


In [387]:
ridge_coef=ridge(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":county_col,"Coefficients":np.transpose(ridge_coef)})
print(coefficients.to_string())

Ridge
R^2: 0.585105582997862
Root Mean Squared Error: 765.8331897241667
Alpha: 0.005
    Coefficients               Feature
0    -170.127741                  BEDS
1     114.977471                 BATHS
2     944.245164           SQUARE FEET
3      48.712810              LOT SIZE
4    -183.924129            YEAR BUILT
5     -19.749787             HOA/MONTH
6       0.000000                   LOT
7    -110.334505                   HOA
8      33.315795   PROPERTY TYPE_Condo
9     -41.664836     PROPERTY TYPE_SFH
10     22.122749      PROPERTY TYPE_TH
11   -193.674915        COUNTY_ALAMEDA
12      7.480368  COUNTY_SAN FRANCISCO
13     88.545106      COUNTY_SAN MATEO
14    104.165951    COUNTY_SANTA CLARA


In [388]:
lasso_coef=lasso(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":county_col,"Coefficients":np.transpose(lasso_coef)})
print(coefficients.to_string())

Lasso
R^2: 0.5851346658427818
Root Mean Squared Error: 765.0708774329421
Alpha: 0.006609705742330144
    Coefficients               Feature
0    -171.855657                  BEDS
1     103.415231                 BATHS
2     958.186597           SQUARE FEET
3      47.347250              LOT SIZE
4    -183.190383            YEAR BUILT
5     -15.865885             HOA/MONTH
6       0.000000                   LOT
7    -108.701302                   HOA
8       4.757650   PROPERTY TYPE_Condo
9     -71.606231     PROPERTY TYPE_SFH
10      0.000000      PROPERTY TYPE_TH
11   -282.453731        COUNTY_ALAMEDA
12    -18.728239  COUNTY_SAN FRANCISCO
13      0.000000      COUNTY_SAN MATEO
14     15.343432    COUNTY_SANTA CLARA


3.2.2 Using city as the only location info

In [389]:
X_train, X_test, y_train, y_test = split(df_city_X, df_city_y, 0.2, 42)
linear_coef = linear_regression(X_train, X_test, y_train, y_test)
coefficients = pd.DataFrame({"Feature":city_col,"Coefficients":np.transpose(linear_coef)})
print(coefficients.to_string())

Simple Linear Regression
R^2: 0.7180658626116141
Root Mean Squared Error: 7525830022314.006
    Coefficients                   Feature
0  -1.051020e+02                      BEDS
1   4.418583e+01                     BATHS
2   7.459662e+02               SQUARE FEET
3   5.478715e+01                  LOT SIZE
4  -6.952767e+01                YEAR BUILT
5  -5.634454e+01                 HOA/MONTH
6   2.481204e+12                       LOT
7  -3.086323e+01                       HOA
8  -3.490498e+13       PROPERTY TYPE_Condo
9  -4.329148e+13         PROPERTY TYPE_SFH
10 -2.899906e+13          PROPERTY TYPE_TH
11 -3.472799e+13              CITY_ALAMEDA
12 -2.024382e+13               CITY_ALBANY
13 -1.704780e+13             CITY_ATHERTON
14 -2.670950e+13              CITY_BELMONT
15 -3.328053e+13             CITY_BERKELEY
16 -1.500311e+13             CITY_BRISBANE
17 -1.697425e+12            CITY_BROADMOOR
18 -2.547732e+13           CITY_BURLINGAME
19 -2.869480e+13             CITY_CAMPBELL
20 -9

In [390]:
ridge_coef=ridge(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":city_col,"Coefficients":np.transpose(ridge_coef)})
print(coefficients.to_string())

Ridge
R^2: 0.7180331347175828
Root Mean Squared Error: 537738.2516947543
Alpha: 0.005
     Coefficients                   Feature
0      -99.892865                      BEDS
1       51.218471                     BATHS
2      732.568660               SQUARE FEET
3       55.098070                  LOT SIZE
4      -68.829297                YEAR BUILT
5      -54.815072                 HOA/MONTH
6        0.000000                       LOT
7      -31.604363                       HOA
8       -6.895872       PROPERTY TYPE_Condo
9        1.463955         PROPERTY TYPE_SFH
10       5.959007          PROPERTY TYPE_TH
11     -55.883245              CITY_ALAMEDA
12     -14.268473               CITY_ALBANY
13     287.176256             CITY_ATHERTON
14      21.990370              CITY_BELMONT
15     -45.053143             CITY_BERKELEY
16     -20.154658             CITY_BRISBANE
17      -3.682889            CITY_BROADMOOR
18      52.686607           CITY_BURLINGAME
19       1.160133             CITY

In [391]:
lasso_coef=lasso(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":city_col,"Coefficients":np.transpose(lasso_coef)})
print(coefficients.to_string())

Lasso
R^2: 0.7180362512589095
Root Mean Squared Error: 633.3845667903529
Alpha: 0.005
    Coefficients                   Feature
0    -100.817034                      BEDS
1      41.351065                     BATHS
2     746.368168               SQUARE FEET
3      53.949119                  LOT SIZE
4     -69.745118                YEAR BUILT
5     -54.085885                 HOA/MONTH
6       0.000000                       LOT
7     -31.285434                       HOA
8      -6.393752       PROPERTY TYPE_Condo
9       0.000000         PROPERTY TYPE_SFH
10      5.114289          PROPERTY TYPE_TH
11    -30.945792              CITY_ALAMEDA
12     -0.000000               CITY_ALBANY
13    298.521517             CITY_ATHERTON
14     40.316749              CITY_BELMONT
15    -21.243943             CITY_BERKELEY
16     -8.810169             CITY_BRISBANE
17     -1.835482            CITY_BROADMOOR
18     70.007890           CITY_BURLINGAME
19     20.804858             CITY_CAMPBELL
20    -15.2

3.2.3 Using zip as the only location info

In [392]:
X_train, X_test, y_train, y_test = split(df_zip_X, df_zip_y, 0.2, 42)
linear_coef = linear_regression(X_train, X_test, y_train, y_test)
coefficients = pd.DataFrame({"Feature":zip_col,"Coefficients":np.transpose(linear_coef)})
print(coefficients.to_string())

Simple Linear Regression
R^2: 0.7140411619740954
Root Mean Squared Error: 260989893189831.78
     Coefficients              Feature
0   -1.166116e+02                 BEDS
1    2.743136e+01                BATHS
2    7.893284e+02          SQUARE FEET
3    6.370197e+01             LOT SIZE
4   -6.363184e+01           YEAR BUILT
5   -6.701204e+01            HOA/MONTH
6   -1.025510e+13                  LOT
7   -3.224295e+01                  HOA
8   -1.359123e+14  PROPERTY TYPE_Condo
9   -1.685674e+14    PROPERTY TYPE_SFH
10  -1.129159e+14     PROPERTY TYPE_TH
11  -2.432805e+14            ZIP_94002
12  -1.366541e+14            ZIP_94005
13  -2.964880e+14            ZIP_94010
14  -1.936954e+14            ZIP_94014
15  -2.425654e+14            ZIP_94015
16  -9.827163e+13            ZIP_94018
17  -2.532946e+14            ZIP_94019
18  -8.669163e+13            ZIP_94020
19  -2.677726e+13            ZIP_94021
20  -2.418480e+14            ZIP_94022
21  -2.037099e+14            ZIP_94024
22  -3.035

In [393]:
ridge_coef=ridge(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":zip_col,"Coefficients":np.transpose(ridge_coef)})
print(coefficients.to_string())

Ridge
R^2: 0.7140129817866487
Root Mean Squared Error: 821823.7473672488
Alpha: 0.005
      Coefficients              Feature
0      -112.924683                 BEDS
1        37.388588                BATHS
2       774.430317          SQUARE FEET
3        64.244162             LOT SIZE
4       -62.784575           YEAR BUILT
5       -65.582100            HOA/MONTH
6         0.000000                  LOT
7       -32.450933                  HOA
8        -4.534887  PROPERTY TYPE_Condo
9        -0.362000    PROPERTY TYPE_SFH
10        5.868509     PROPERTY TYPE_TH
11       22.172860            ZIP_94002
12      -19.722189            ZIP_94005
13      107.861323            ZIP_94010
14      -27.443264            ZIP_94014
15      -23.131777            ZIP_94015
16      -20.530486            ZIP_94018
17      -49.567282            ZIP_94019
18      -15.905312            ZIP_94020
19       -8.523133            ZIP_94021
20      160.605448            ZIP_94022
21      111.740644            ZIP_

In [394]:
lasso_coef=lasso(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":zip_col,"Coefficients":np.transpose(lasso_coef)})
print(coefficients.to_string())

Lasso
R^2: 0.7139832267842356
Root Mean Squared Error: 637.5154535154204
Alpha: 0.005
     Coefficients              Feature
0     -114.098371                 BEDS
1       26.351572                BATHS
2      788.981255          SQUARE FEET
3       63.015901             LOT SIZE
4      -63.125442           YEAR BUILT
5      -64.824932            HOA/MONTH
6        0.000000                  LOT
7      -32.551681                  HOA
8       -1.745932  PROPERTY TYPE_Condo
9        0.000000    PROPERTY TYPE_SFH
10       6.381497     PROPERTY TYPE_TH
11      41.028403            ZIP_94002
12      -8.110362            ZIP_94005
13     130.544156            ZIP_94010
14     -11.156468            ZIP_94014
15      -2.653761            ZIP_94015
16     -11.952844            ZIP_94018
17     -28.864907            ZIP_94019
18      -8.207409            ZIP_94020
19      -5.458877            ZIP_94021
20     179.261868            ZIP_94022
21     127.458659            ZIP_94024
22     150.146281

# 3.3 Keep all location info

In [395]:
X_train, X_test, y_train, y_test = split(df_X, df_y, 0.2, 42)
linear_coef = linear_regression(X_train, X_test, y_train, y_test)
coefficients = pd.DataFrame({"Feature":df_col,"Coefficients":np.transpose(linear_coef)})
print(coefficients.to_string())

Simple Linear Regression
R^2: 0.7283258875255255
Root Mean Squared Error: 3326962357014995.5
     Coefficients                   Feature
0   -1.037627e+02                      BEDS
1    3.013020e+01                     BATHS
2    7.445445e+02               SQUARE FEET
3    5.451400e+01                  LOT SIZE
4   -5.127678e+01                YEAR BUILT
5   -6.763155e+01                 HOA/MONTH
6    3.218683e+15                       LOT
7   -2.734041e+01                       HOA
8   -8.644702e+11       PROPERTY TYPE_Condo
9   -1.072173e+12         PROPERTY TYPE_SFH
10  -7.182019e+11          PROPERTY TYPE_TH
11  -1.792864e+15              CITY_ALAMEDA
12  -2.593313e+14               CITY_ALBANY
13   4.897258e+14             CITY_ATHERTON
14  -4.607005e+14              CITY_BELMONT
15  -4.263367e+14             CITY_BERKELEY
16  -2.752852e+15             CITY_BRISBANE
17   1.577213e+13            CITY_BROADMOOR
18  -6.621527e+14           CITY_BURLINGAME
19   1.126868e+15          

In [396]:
ridge_coef=ridge(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":df_col,"Coefficients":np.transpose(ridge_coef)})
print(coefficients.to_string())

Ridge
R^2: 0.7290660976394299
Root Mean Squared Error: 1529530.6961629551
Alpha: 0.005
      Coefficients                   Feature
0      -101.198845                      BEDS
1        34.674826                     BATHS
2       733.237741               SQUARE FEET
3        58.176293                  LOT SIZE
4       -53.823630                YEAR BUILT
5       -65.621312                 HOA/MONTH
6         0.000000                       LOT
7       -26.915140                       HOA
8        -7.736343       PROPERTY TYPE_Condo
9         3.625015         PROPERTY TYPE_SFH
10        3.767271          PROPERTY TYPE_TH
11      -13.040266              CITY_ALAMEDA
12        8.636604               CITY_ALBANY
13      200.664853             CITY_ATHERTON
14        5.681109              CITY_BELMONT
15      -13.973174             CITY_BERKELEY
16      -13.157458             CITY_BRISBANE
17       -3.634506            CITY_BROADMOOR
18        4.170181           CITY_BURLINGAME
19       -5.7

In [397]:
lasso_coef=lasso(X_train, X_test, y_train, y_test, alpha)
coefficients = pd.DataFrame({"Feature":df_col,"Coefficients":np.transpose(lasso_coef)})
print(coefficients.to_string())

Lasso
R^2: 0.728830691102647
Root Mean Squared Error: 622.8640551619656
Alpha: 0.008737642000038414
     Coefficients                   Feature
0      -99.668533                      BEDS
1       25.295539                     BATHS
2      747.181082               SQUARE FEET
3       56.185538                  LOT SIZE
4      -56.644316                YEAR BUILT
5      -62.647432                 HOA/MONTH
6        0.000000                       LOT
7      -26.304741                       HOA
8       -9.031371       PROPERTY TYPE_Condo
9        0.000000         PROPERTY TYPE_SFH
10       0.611470          PROPERTY TYPE_TH
11       0.000000              CITY_ALAMEDA
12      18.827265               CITY_ALBANY
13     273.845805             CITY_ATHERTON
14       5.962463              CITY_BELMONT
15       0.509331             CITY_BERKELEY
16     -20.189776             CITY_BRISBANE
17      -2.968429            CITY_BROADMOOR
18       0.000000           CITY_BURLINGAME
19      -0.957770   

In [398]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32307 entries, 115 to 32413
Data columns (total 13 columns):
PROPERTY TYPE    32307 non-null object
CITY             32307 non-null object
ZIP              32307 non-null object
PRICE IN K       32307 non-null float64
BEDS             32307 non-null float64
BATHS            32307 non-null float64
SQUARE FEET      32307 non-null float64
LOT SIZE         32307 non-null float64
YEAR BUILT       32307 non-null float64
HOA/MONTH        32307 non-null float64
COUNTY           32307 non-null object
LOT              32307 non-null int64
HOA              32307 non-null int64
dtypes: float64(7), int64(2), object(4)
memory usage: 4.7+ MB
