In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import Imputer, scale
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score

# Preprocessing the Data

In [62]:
filename = 'Preprocessing Data/cleaned_data.csv'
df = pd.read_csv(filename, index_col=0)

In [63]:
#seperate property type
df_sfh = df.loc[df['PROPERTY TYPE'] == 'SFH']
df_th = df.loc[df['PROPERTY TYPE'] == 'TH']
df_condo = df.loc[df['PROPERTY TYPE'] == 'Condo']

df_sfh = df_sfh.drop(['PROPERTY TYPE'], axis=1)
df_th = df_th.drop(['PROPERTY TYPE'], axis=1)
df_condo = df_condo.drop(['PROPERTY TYPE'], axis=1)

In [64]:
#delete some columns are not available for the specific property type
# df_sfh = df_sfh.drop(['HOA/MONTH'], axis=1)
df_th = df_th.drop(['LOT SIZE'], axis=1)
df_condo = df_condo.drop(['LOT SIZE'], axis=1)

In [65]:
df_sfh.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26792 entries, 0 to 32415
Data columns (total 12 columns):
CITY           26792 non-null object
ZIP            26792 non-null object
PRICE IN K     26792 non-null float64
BEDS           26792 non-null float64
BATHS          26792 non-null float64
SQUARE FEET    26792 non-null float64
LOT SIZE       26792 non-null float64
YEAR BUILT     26792 non-null float64
HOA/MONTH      26792 non-null float64
COUNTY         26792 non-null object
LOT            26792 non-null int64
HOA            26792 non-null int64
dtypes: float64(7), int64(2), object(3)
memory usage: 2.7+ MB


In [66]:
#encoding dummy variables
df_sfh_dummies = pd.get_dummies(df_sfh)
df_th_dummies = pd.get_dummies(df_th)
df_condo_dummies = pd.get_dummies(df_condo)

In [67]:
df_sfh_dummies.head()

Unnamed: 0,PRICE IN K,BEDS,BATHS,SQUARE FEET,LOT SIZE,YEAR BUILT,HOA/MONTH,LOT,HOA,CITY_ALAMEDA,...,ZIP_95135,ZIP_95136,ZIP_95138,ZIP_95139,ZIP_95148,ZIP_95391,COUNTY_ALAMEDA,COUNTY_SAN FRANCISCO,COUNTY_SAN MATEO,COUNTY_SANTA CLARA
0,100.5,2.0,1.0,1030.0,2500.0,1916.0,0.0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,101.0,3.0,2.0,1229.0,5200.0,1951.0,0.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,102.0,2.0,1.0,884.0,6750.0,1913.0,0.0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,102.5,2.0,1.0,657.0,4000.0,1908.0,0.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,102.5,3.0,1.0,928.0,4500.0,1906.0,0.0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [68]:
#split dataset into feature variables and results
y = df_sfh_dummies['PRICE IN K']
X = df_sfh_dummies.drop(['PRICE IN K'], axis=1)

# Regression

In [69]:
# scaling the data
X = scale(X)

In [70]:
# simple linear regression 
# y = y.values.reshape(-1, 1)  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print('R^2: {}'.format(reg.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7163665964866825
Root Mean Squared Error: 8.953456488587712e+16


In [71]:
# linear regression with cross validation
cv_results = cross_val_score(reg, X, y, cv=10)
print(cv_results)
print(np.mean(cv_results))
y_pred = cross_val_predict(reg, X, y, cv=10)
accuracy = r2_score(y, y_pred) 
print('R^2: {}'.format(accuracy))
rmse = np.sqrt(mean_squared_error(y, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

[ -4.13331438e+29  -1.45061502e+31  -1.78932325e+31  -6.87251679e+30
  -5.24013241e+30  -1.28830729e+30  -5.79102512e+29  -2.04921438e+28
  -3.82316217e+28  -1.14529047e+26]
-4.68516114165e+30
R^2: -4.1719446972561395e+27
Root Mean Squared Error: 8.623502590998024e+16


In [72]:
#ridge regression
ridge=Ridge(alpha=0.001, normalize=True)
ridge.fit(X_train, y_train)
ridge_pred=ridge.predict(X_test)
print('R^2: {}'.format(ridge.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7233389899680862
Root Mean Squared Error: 21020918.569756716


In [73]:
#lasso regression
lasso=Lasso(alpha=0.1, normalize=True)
lasso.fit(X_train, y_train)
lasso_pred=lasso.predict(X_test)
print('R^2: {}'.format(lasso.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7115839864290876
Root Mean Squared Error: 599.6690191853619


In [74]:
alphas = 10**np.linspace(10,-2,100)*0.5

In [75]:
# Hyperparameter tuning using GridSearchCV
model = Lasso()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)











Root Mean Squared Error: 589.7023065939352
0.712915711724
1.32804389147


In [76]:
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)

Root Mean Squared Error: 588.6495538497597
0.712719047789
266.83496156


In [77]:
df_sfh_county = df_sfh.drop(['CITY', 'ZIP'], axis=1)
df_sfh_city = df_sfh.drop(['COUNTY', 'ZIP'], axis=1)
df_sfh_zip = df_sfh.drop(['COUNTY', 'CITY'], axis=1)

In [78]:
df_sfh_county = pd.get_dummies(df_sfh_county)
df_sfh_city = pd.get_dummies(df_sfh_city)
df_sfh_zip = pd.get_dummies(df_sfh_zip)

In [79]:
# Regression without city and zip

In [80]:
y = df_sfh_county['PRICE IN K']
X = df_sfh_county.drop(['PRICE IN K'], axis=1)
X = scale(X)

In [81]:
# simple linear regression 
y = y.values.reshape(-1, 1)  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print('R^2: {}'.format(reg.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.572385169357329
Root Mean Squared Error: 779.0346921325095


In [82]:
#ridge regression
ridge=Ridge(alpha=0.001, normalize=True)
ridge.fit(X_train, y_train)
ridge_pred=ridge.predict(X_test)
print('R^2: {}'.format(ridge.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.5723857828413944
Root Mean Squared Error: 778.8754757186115


In [83]:
#lasso regression
lasso=Lasso(alpha=0.1, normalize=True)
lasso.fit(X_train, y_train)
lasso_pred=lasso.predict(X_test)
print('R^2: {}'.format(lasso.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.570743253145787
Root Mean Squared Error: 778.9343183173768


In [84]:
alphas = np.array([1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20])

In [85]:
# Hyperparameter tuning using GridSearchCV
model = Lasso()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)



Root Mean Squared Error: 778.5826629933058
0.565361213555
5.0


In [86]:
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 1.3993687026400476e-20 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 1.4309670642352838e-20 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 1.4152316154821863e-20 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 1.4070199184781522e-20 / 1.1102230246251565e-16


Root Mean Squared Error: 778.8843238370673
0.565362169191
20.0


In [87]:
#Regression without county and zip

In [88]:
df_sfh_city_dummies = pd.get_dummies(df_sfh_city)
#split dataset into feature variables and results
y = df_sfh_city_dummies['PRICE IN K']
X = df_sfh_city_dummies.drop(['PRICE IN K'], axis=1)

# scaling the data
X = scale(X)

In [89]:
# simple linear regression 
# y = y.values.reshape(-1, 1)  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print('R^2: {}'.format(reg.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7059788126892457
Root Mean Squared Error: 602.4723215717611


In [90]:
#ridge regression
ridge=Ridge(alpha=0.001, normalize=True)
ridge.fit(X_train, y_train)
ridge_pred=ridge.predict(X_test)
print('R^2: {}'.format(ridge.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7059775470039905
Root Mean Squared Error: 602.2947237172398


In [91]:
#lasso regression
lasso=Lasso(alpha=0.01, normalize=True)
lasso.fit(X_train, y_train)
lasso_pred=lasso.predict(X_test)
print('R^2: {}'.format(lasso.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7059021021854477
Root Mean Squared Error: 602.1959521792613


In [92]:
# Hyperparameter tuning using GridSearchCV
model = Lasso()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)





Root Mean Squared Error: 602.252946381517
0.702507559002
1.0


In [93]:
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 8.547020734046404e-21 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 8.727541487154169e-21 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 8.558134907373485e-21 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 8.629975376899197e-21 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 8.546625140920945e-21 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 8.59968157949727e-21 / 1.1102230246251565e-16


Root Mean Squared Error: 602.3044524678971
0.702517294072
20.0


In [94]:
#Regression without county and city

In [95]:
#split dataset into feature variables and results
y = df_sfh_zip['PRICE IN K']
X = df_sfh_zip.drop(['PRICE IN K'], axis=1)

# scaling the data
X = scale(X)

In [96]:
# simple linear regression 
# y = y.values.reshape(-1, 1)  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print('R^2: {}'.format(reg.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.6933731247203296
Root Mean Squared Error: 1.469838707692063e+17


In [97]:
# linear regression with cross validation
cv_results = cross_val_score(reg, X, y, cv=10)
print(cv_results)
print(np.mean(cv_results))
y_pred = cross_val_predict(reg, X, y, cv=10)
accuracy = r2_score(y, y_pred) 
print('R^2: {}'.format(accuracy))
rmse = np.sqrt(mean_squared_error(y, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

[ -3.74923205e+29  -1.01251354e+31  -1.33612161e+31  -1.43063117e+31
  -6.24473835e+30  -7.21138130e+29  -3.72849684e+29  -1.15509617e+28
  -1.17827506e+28  -2.54521437e+26]
-4.55299007912e+30
R^2: -3.781323616139723e+27
Root Mean Squared Error: 8.20987132348122e+16


In [98]:
#ridge regression
ridge=Ridge(alpha=0.001, normalize=True)
ridge.fit(X_train, y_train)
ridge_pred=ridge.predict(X_test)
print('R^2: {}'.format(ridge.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7086539026074947
Root Mean Squared Error: 17849026.564562257


In [99]:
#lasso regression
lasso=Lasso(alpha=0.01, normalize=True)
lasso.fit(X_train, y_train)
lasso_pred=lasso.predict(X_test)
print('R^2: {}'.format(lasso.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7081958763028959
Root Mean Squared Error: 612.7824776902137


In [100]:
# Hyperparameter tuning using GridSearchCV
model = Lasso()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)





Root Mean Squared Error: 612.9650464150894
0.696348072134
1.0


In [101]:
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 4.821793869774099e-21 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 4.839558056927951e-21 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 4.857291395386022e-21 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 4.832229234257438e-21 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 4.872957103900874e-21 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 4.857823434294405e-21 / 1.1102230246251565e-16


Root Mean Squared Error: 615.1927202079407
0.695037334266
20.0
