In [239]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import Imputer, scale
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score

# Preprocessing the Data

In [217]:
filename = '../Data/cleaned_data.csv'
df = pd.read_csv(filename)
df = df.drop('Unnamed: 0', axis=1)

In [218]:
df.head()

Unnamed: 0,SOLD DATE,PROPERTY TYPE,ADDRESS,CITY,ZIP,PRICE IN K,BEDS,BATHS,SQUARE FEET,LOT SIZE,YEAR BUILT,$/SQUARE FEET,HOA/MONTH,LATITUDE,LONGITUDE,COUNTY
0,,SFH,1123 Brunswick St,DALY CITY,94014,100.5,2.0,1.0,1030.0,2500.0,1916.0,97.572816,,37.705789,-122.45648,SAN MATEO
1,,SFH,17382 Via La Jolla,SAN LORENZO,94580,101.0,3.0,2.0,1229.0,5200.0,1951.0,82.180635,,37.66581,-122.135079,ALAMEDA
2,,SFH,888 Palm St,SAN JOSE,95110,102.0,2.0,1.0,884.0,6750.0,1913.0,115.384615,,37.319633,-121.886415,SANTA CLARA
3,,Condo,1750 Halford Ave Apt 105,SANTA CLARA,95051,102.0,1.0,1.0,678.0,,1970.0,150.442478,,37.355715,-121.998715,SANTA CLARA
4,,SFH,5502 Vallejo St,OAKLAND,94608,102.5,2.0,1.0,657.0,4000.0,1908.0,156.012177,,37.838201,-122.284831,ALAMEDA


In [219]:
df_ml = df.drop(['SOLD DATE', 'ADDRESS', 'LATITUDE', 'LONGITUDE', 'COUNTY', '$/SQUARE FEET'], axis=1)

In [220]:
#seperate property type
df_sfh = df_ml.loc[df_ml['PROPERTY TYPE'] == 'SFH']
df_th = df_ml.loc[df_ml['PROPERTY TYPE'] == 'TH']
df_condo = df_ml.loc[df_ml['PROPERTY TYPE'] == 'Condo']

df_sfh = df_sfh.drop(['PROPERTY TYPE'], axis=1)
df_th = df_th.drop(['PROPERTY TYPE'], axis=1)
df_condo = df_condo.drop(['PROPERTY TYPE'], axis=1)

In [221]:
#delete some columns are not available for the specific property type
df_sfh = df_sfh.drop(['HOA/MONTH'], axis=1)
df_th = df_th.drop(['LOT SIZE'], axis=1)
df_condo = df_condo.drop(['LOT SIZE'], axis=1)

In [222]:
df_sfh.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27089 entries, 0 to 37758
Data columns (total 8 columns):
CITY           27089 non-null object
ZIP            27088 non-null object
PRICE IN K     27089 non-null float64
BEDS           27007 non-null float64
BATHS          27064 non-null float64
SQUARE FEET    27089 non-null float64
LOT SIZE       26994 non-null float64
YEAR BUILT     27042 non-null float64
dtypes: float64(6), object(2)
memory usage: 1.9+ MB


In [223]:
#encoding dummy variables
df_sfh = pd.get_dummies(df_sfh)
df_th = pd.get_dummies(df_th)
df_condo = pd.get_dummies(df_condo)

In [224]:
df_sfh.head()

Unnamed: 0,PRICE IN K,BEDS,BATHS,SQUARE FEET,LOT SIZE,YEAR BUILT,CITY_ALAMEDA,CITY_ALBANY,CITY_ATHERTON,CITY_BELMONT,...,ZIP_95131,ZIP_95132,ZIP_95133,ZIP_95135,ZIP_95136,ZIP_95138,ZIP_95139,ZIP_95140,ZIP_95148,ZIP_95391
0,100.5,2.0,1.0,1030.0,2500.0,1916.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,101.0,3.0,2.0,1229.0,5200.0,1951.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,102.0,2.0,1.0,884.0,6750.0,1913.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,102.5,2.0,1.0,657.0,4000.0,1908.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,102.5,3.0,1.0,928.0,4500.0,1906.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [225]:
#split dataset into feature variables and results
y = df_sfh['PRICE IN K']
X = df_sfh.drop(['PRICE IN K'], axis=1)

#input missing data
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)
X = imp.transform(X)

In [226]:
# scaling the data
X_scaled = scale(X)

# Regression

In [227]:
# simple linear regression 
y = y.values.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print('R^2: {}'.format(reg.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7331920523462573
Root Mean Squared Error: 669.6803149472685


In [228]:
# linear regression with cross validation
cv_results = cross_val_score(reg, X, y, cv=5)
print(cv_results)
print(np.mean(cv_results))
y_pred = cross_val_predict(reg, X, y, cv=5)
accuracy = r2_score(y, y_pred) 
print('R^2: {}'.format(accuracy))
rmse = np.sqrt(mean_squared_error(y, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

[ -9.24715784 -31.48822244 -15.38919091  -7.30777586  -0.30205532]
-12.746880473
R^2: 0.2592439939679615
Root Mean Squared Error: 1148.5610550174185


In [248]:
#ridge regression
ridge=Ridge(alpha=0.001, normalize=True)
ridge.fit(X_train, y_train)
ridge_pred=ridge.predict(X_test)
print('R^2: {}'.format(ridge.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7331683843683163
Root Mean Squared Error: 670.2322970353064


In [230]:
#lasso regression
lasso=Lasso(alpha=0.1, normalize=True)
lasso.fit(X_train, y_train)
lasso_pred=lasso.predict(X_test)
print('R^2: {}'.format(lasso.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7216621974532262
Root Mean Squared Error: 676.029867500091


In [245]:
alphas = np.array([0.0001, 0.001, 0.01, 0.1, 1, 10])

In [249]:
model = Lasso()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=5)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)



Root Mean Squared Error: 667.7439195145789
0.725191595182
0.1


In [247]:
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=5)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 4.411719304643084e-22 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 2.140646067611507e-22 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 1.005633900911235e-21 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 3.9404408581589316e-22 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 4.315884842052138e-22 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 4.38564912485508e-21 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result 

Root Mean Squared Error: 668.3599355143967
0.725071868198
1.0


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 2.0072513852220054e-18 / 1.1102230246251565e-16
