In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import Imputer, scale
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score

In [2]:
filename = 'Preprocessing Data/cleaned_data.csv'
df = pd.read_csv(filename)

In [3]:
df_county = df.drop(['CITY', 'ZIP'], axis=1)
df_city = df.drop(['COUNTY', 'ZIP'], axis=1)
df_zip = df.drop(['COUNTY', 'CITY'], axis=1)

In [4]:
#encoding dummy variables
df_county = pd.get_dummies(df_county)
df_city = pd.get_dummies(df_city)
df_zip = pd.get_dummies(df_zip)

# Regression without city and zip

In [5]:
#split dataset into feature variables and results
y = df_county['PRICE IN K']
X = df_county.drop(['PRICE IN K'], axis=1)

In [6]:
# scaling the data
X = scale(X)

In [7]:
# simple linear regression 
# y = y.values.reshape(-1, 1)  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print('R^2: {}'.format(reg.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.6557767974001504
Root Mean Squared Error: 692.4245659061581




In [8]:
# linear regression with cross validation
cv_results = cross_val_score(reg, X, y, cv=10)
print(cv_results)
print(np.mean(cv_results))
y_pred = cross_val_predict(reg, X, y, cv=10)
accuracy = r2_score(y, y_pred) 
print('R^2: {}'.format(accuracy))
rmse = np.sqrt(mean_squared_error(y, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

[ -2.51233043e+24   2.86670364e-01  -3.02415761e+01  -7.91914694e+01
  -7.44202407e+01  -6.61955788e+01  -5.28853970e+01  -3.31843688e+01
  -7.36402812e+00  -5.19379920e-01]
-2.51233042515e+23
R^2: -2.7019657101971852e+22
Root Mean Squared Error: 205484181816460.22


In [9]:
#ridge regression
ridge=Ridge(alpha=0.001, normalize=True)
ridge.fit(X_train, y_train)
ridge_pred=ridge.predict(X_test)
print('R^2: {}'.format(ridge.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.6557987049677543
Root Mean Squared Error: 692.5222777150144


In [10]:
#lasso regression
lasso=Lasso(alpha=0.1, normalize=True)
lasso.fit(X_train, y_train)
lasso_pred=lasso.predict(X_test)
print('R^2: {}'.format(lasso.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.6519285030152393
Root Mean Squared Error: 697.5893229171953


In [11]:
alphas = 10**np.linspace(10,-2,100)*0.5

In [12]:
# Hyperparameter tuning using GridSearchCV
model = Lasso()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)











Root Mean Squared Error: 692.7800762831993
0.660688837214
2.32079441681




In [13]:
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)

Root Mean Squared Error: 693.0450611266896
0.660772080641
266.83496156


# Regression without county and zip

In [14]:
#split dataset into feature variables and results
y = df_city['PRICE IN K']
X = df_city.drop(['PRICE IN K'], axis=1)

# scaling the data
X = scale(X)

In [15]:
# simple linear regression 
# y = y.values.reshape(-1, 1)  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print('R^2: {}'.format(reg.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7376249543426322
Root Mean Squared Error: 618.3329302300189


In [16]:
# linear regression with cross validation
cv_results = cross_val_score(reg, X, y, cv=10)
print(cv_results)
print(np.mean(cv_results))
y_pred = cross_val_predict(reg, X, y, cv=10)
accuracy = r2_score(y, y_pred) 
print('R^2: {}'.format(accuracy))
rmse = np.sqrt(mean_squared_error(y, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

[ -6.93041388e+23  -1.14237718e-01  -2.80304148e+22  -5.34264583e+01
  -5.49641321e+01  -4.49808461e+01  -3.09358158e+01  -2.04070339e+01
  -5.86688326e+00  -4.39431521e-01]
-7.21071802552e+22
R^2: -7.465049274600872e+21
Root Mean Squared Error: 108007724934960.52


In [17]:
#ridge regression
ridge=Ridge(alpha=0.001, normalize=True)
ridge.fit(X_train, y_train)
ridge_pred=ridge.predict(X_test)
print('R^2: {}'.format(ridge.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7376836588705602
Root Mean Squared Error: 618.2172404086409


In [18]:
#lasso regression
lasso=Lasso(alpha=0.01, normalize=True)
lasso.fit(X_train, y_train)
lasso_pred=lasso.predict(X_test)
print('R^2: {}'.format(lasso.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7375427336625024
Root Mean Squared Error: 618.1076660127587


In [20]:
alphas = 10**np.linspace(10,-2,100)*0.5

In [21]:
# Hyperparameter tuning using GridSearchCV
model = Lasso()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)









Root Mean Squared Error: 618.1050832183635
0.74033704442
1.32804389147


In [22]:
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)

Root Mean Squared Error: 618.2074055779584
0.740402207117
266.83496156


# Regression without county and city

In [23]:
#split dataset into feature variables and results
y = df_zip['PRICE IN K']
X = df_zip.drop(['PRICE IN K'], axis=1)

# scaling the data
X = scale(X)

In [24]:
# simple linear regression 
# y = y.values.reshape(-1, 1)  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print('R^2: {}'.format(reg.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7306171458331266
Root Mean Squared Error: 7.784398890751954e+16


In [25]:
# linear regression with cross validation
cv_results = cross_val_score(reg, X, y, cv=10)
print(cv_results)
print(np.mean(cv_results))
y_pred = cross_val_predict(reg, X, y, cv=10)
accuracy = r2_score(y, y_pred) 
print('R^2: {}'.format(accuracy))
rmse = np.sqrt(mean_squared_error(y, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))

[ -2.37589912e+28  -8.20990304e+27  -6.47473897e+29  -1.61485354e+30
  -7.76131866e+30  -1.49610339e+29  -2.10928672e+29  -5.55092963e+28
  -1.06617521e+24  -8.99650454e+24]
-1.04716733555e+30
R^2: -1.5767827507424002e+27
Root Mean Squared Error: 4.963915832321749e+16


In [26]:
#ridge regression
ridge=Ridge(alpha=0.001, normalize=True)
ridge.fit(X_train, y_train)
ridge_pred=ridge.predict(X_test)
print('R^2: {}'.format(ridge.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7376177736887062
Root Mean Squared Error: 68375.99269360772


In [27]:
#lasso regression
lasso=Lasso(alpha=0.01, normalize=True)
lasso.fit(X_train, y_train)
lasso_pred=lasso.predict(X_test)
print('R^2: {}'.format(lasso.score(X_train, y_train)))
rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
print('Root Mean Squared Error: {}'.format(rmse))

R^2: 0.7369600857331012
Root Mean Squared Error: 621.1896557165699


In [None]:
alphas = 10**np.linspace(10,-2,100)*0.5

In [None]:
# Hyperparameter tuning using GridSearchCV
model = Lasso()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)









In [None]:
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha = alphas), cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: {}'.format(rmse))
print(grid.best_score_)
print(grid.best_estimator_.alpha)