In [325]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import Imputer, scale
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score

# Preprocessing the Data

In [326]:
filename = 'Preprocessing Data/cleaned_data.csv'
df = pd.read_csv(filename)

In [327]:
#seperate property type
df_sfh = df.loc[df['PROPERTY TYPE'] == 'SFH']
df_th = df.loc[df['PROPERTY TYPE'] == 'TH']
df_condo = df.loc[df['PROPERTY TYPE'] == 'Condo']

df_sfh = df_sfh.drop(['PROPERTY TYPE'], axis=1)
df_th = df_th.drop(['PROPERTY TYPE'], axis=1)
df_condo = df_condo.drop(['PROPERTY TYPE'], axis=1)

In [328]:
#delete some columns are not available for the specific property type
# df_sfh = df_sfh.drop(['HOA/MONTH'], axis=1)
df_th = df_th.drop(['LOT SIZE'], axis=1)
df_condo = df_condo.drop(['LOT SIZE'], axis=1)

In [329]:
df_sfh.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26792 entries, 5517 to 32308
Data columns (total 13 columns):
Unnamed: 0     26792 non-null int64
CITY           26792 non-null object
ZIP            26792 non-null object
PRICE IN K     26792 non-null float64
BEDS           26792 non-null float64
BATHS          26792 non-null float64
SQUARE FEET    26792 non-null float64
LOT SIZE       26792 non-null float64
YEAR BUILT     26792 non-null float64
HOA/MONTH      26792 non-null float64
COUNTY         26792 non-null object
LOT            26792 non-null int64
HOA            26792 non-null int64
dtypes: float64(7), int64(3), object(3)
memory usage: 2.9+ MB


In [330]:
#encoding dummy variables
df_sfh = pd.get_dummies(df_sfh)
df_th = pd.get_dummies(df_th)
df_condo = pd.get_dummies(df_condo)

In [331]:
df_sfh.head()

Unnamed: 0.1,Unnamed: 0,PRICE IN K,BEDS,BATHS,SQUARE FEET,LOT SIZE,YEAR BUILT,HOA/MONTH,LOT,HOA,...,ZIP_95135,ZIP_95136,ZIP_95138,ZIP_95139,ZIP_95148,ZIP_95391,COUNTY_ALAMEDA,COUNTY_SAN FRANCISCO,COUNTY_SAN MATEO,COUNTY_SANTA CLARA
5517,0,100.5,2.0,1.0,1030.0,2500.0,1916.0,0.0,1,0,...,0,0,0,0,0,0,0,0,1,0
5518,1,101.0,3.0,2.0,1229.0,5200.0,1951.0,0.0,1,0,...,0,0,0,0,0,0,1,0,0,0
5519,2,102.0,2.0,1.0,884.0,6750.0,1913.0,0.0,1,0,...,0,0,0,0,0,0,0,0,0,1
5520,3,102.5,2.0,1.0,657.0,4000.0,1908.0,0.0,1,0,...,0,0,0,0,0,0,1,0,0,0
5521,4,102.5,3.0,1.0,928.0,4500.0,1906.0,0.0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [332]:
#split dataset into feature variables and results
y = df_sfh['PRICE IN K']
X = df_sfh.drop(['PRICE IN K'], axis=1)

# Regression

In [333]:
# scaling the data
X = scale(X)

In [334]:
# simple linear regression 
# y = y.values.reshape(-1, 1)  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print('R^2: {}'.format(reg.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7307249908804581
Root Mean Squared Error: 7.124299294862939e+16


In [335]:
# linear regression with cross validation
cv_results = cross_val_score(reg, X, y, cv=10)
print(cv_results)
print(np.mean(cv_results))
y_pred = cross_val_predict(reg, X, y, cv=10)
accuracy = r2_score(y, y_pred) 
print('R^2: {}'.format(accuracy))
rmse = np.sqrt(mean_squared_error(y, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

[ -1.82989413e+29  -6.40433249e+30  -6.25153579e+30  -2.70576704e+30
  -3.46238651e+30  -3.77602169e+29  -7.31147906e+29  -6.41763876e+28
  -6.21221678e+27  -8.67340611e+23]
-2.01861507774e+30
R^2: -1.9244506300119318e+27
Root Mean Squared Error: 5.8569000653268136e+16


In [336]:
#ridge regression
ridge=Ridge(alpha=0.001, normalize=True)
ridge.fit(X_train, y_train)
ridge_pred=ridge.predict(X_test)
print('R^2: {}'.format(ridge.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7363947393961436
Root Mean Squared Error: 22750434.78905653


In [337]:
#lasso regression
lasso=Lasso(alpha=0.1, normalize=True)
lasso.fit(X_train, y_train)
lasso_pred=lasso.predict(X_test)
print('R^2: {}'.format(lasso.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7266607922906969
Root Mean Squared Error: 565.3892453531059


In [338]:
alphas = np.array([1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20])

In [339]:
# Hyperparameter tuning using GridSearchCV
model = Lasso()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=5)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)



Root Mean Squared Error: 560.0389679176112
0.726138728694
1.0


In [340]:
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=5)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 1.0373469594212766e-17 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 6.491282730842158e-17 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 1.9079433586840787e-17 / 1.1102230246251565e-16


Root Mean Squared Error: 560.781989710632
0.725523178187
20.0
