In [150]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import Imputer, scale
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score

In [151]:
filename = 'Preprocessing Data/cleaned_data.csv'
df = pd.read_csv(filename)

In [152]:
df_county = df.drop(['CITY', 'ZIP'], axis=1)
df_city = df.drop(['COUNTY', 'ZIP'], axis=1)
df_zip = df.drop(['COUNTY', 'CITY'], axis=1)

In [153]:
#encoding dummy variables
df_county = pd.get_dummies(df_county)
df_city = pd.get_dummies(df_city)
df_zip = pd.get_dummies(df_zip)

# Regression without city and zip

In [154]:
#split dataset into feature variables and results
y = df_county['PRICE IN K']
X = df_county.drop(['PRICE IN K'], axis=1)

In [155]:
# scaling the data
X = scale(X)

In [165]:
# simple linear regression 
# y = y.values.reshape(-1, 1)  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print('R^2: {}'.format(reg.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.6557767921447132
Root Mean Squared Error: 692.4245632570857


In [172]:
# linear regression with cross validation
cv_results = cross_val_score(reg, X, y, cv=10)
print(cv_results)
print(np.mean(cv_results))
y_pred = cross_val_predict(reg, X, y, cv=10)
accuracy = r2_score(y, y_pred) 
print('R^2: {}'.format(accuracy))
rmse = np.sqrt(mean_squared_error(y, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

[ -2.51233043e+24   2.86670364e-01  -3.02415761e+01  -7.91914694e+01
  -7.44202407e+01  -6.61955788e+01  -5.28855230e+01  -3.31843688e+01
  -7.36402812e+00  -5.19379920e-01]
-2.51233042515e+23
R^2: -2.7019657101971852e+22
Root Mean Squared Error: 205484181816460.22


In [173]:
#ridge regression
ridge=Ridge(alpha=0.001, normalize=True)
ridge.fit(X_train, y_train)
ridge_pred=ridge.predict(X_test)
print('R^2: {}'.format(ridge.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.6557987049677543
Root Mean Squared Error: 692.5222777150144


In [174]:
#lasso regression
lasso=Lasso(alpha=0.1, normalize=True)
lasso.fit(X_train, y_train)
lasso_pred=lasso.predict(X_test)
print('R^2: {}'.format(lasso.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.6519285030152393
Root Mean Squared Error: 697.5893229171953


In [175]:
alphas = np.array([1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20])

In [176]:
# Hyperparameter tuning using GridSearchCV
model = Lasso()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)





Root Mean Squared Error: 692.5872500830679
0.660649744192
1.0


In [177]:
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 7.309805191690452e-21 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 7.312806581732561e-21 / 1.1102230246251565e-16
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 7.302288238420856e-21 / 1.1102230246251565e-16


Root Mean Squared Error: 692.5132386280366
0.660604194051
20.0
