In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Load the combined dataset. Rows 0-6999 are used for training; rows from
# 7000 onward form the set that predictions are produced for.
df9 = pd.read_csv("df_9.csv")

# Capture the IDs of the prediction rows before the ID column is dropped;
# they label the rows of the exported prediction files.
test_ids = df9["ID"].iloc[7000:].values

# Columns excluded from the feature matrix.
# NOTE(review): 'price' is presumably dropped because the target is its log
# ('log_price') and keeping it would leak the answer - confirm.
unused_columns = [
    "Unnamed: 0",
    "ID",
    "price",
    "host_has_profile_pic_t",
    "host_identity_verified_t",
]
df9 = df9.drop(columns=unused_columns)

# Separate the target from the predictors, then slice both at row 7000.
features = df9.drop(columns=["log_price"])
y_train = df9["log_price"].iloc[:7000].values
X_train = features.iloc[:7000].values
X_test = features.iloc[7000:].values

#### Linear Regression

In [3]:
# Baseline model: ordinary least squares on standardized features.
pipe_lr = make_pipeline(StandardScaler(), LinearRegression())

# 10-fold CV with shuffling and a fixed seed so the score is reproducible.
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# cross_val_score clones and refits the pipeline for every fold, so the
# pipeline does not need to be fitted beforehand. The scorer returns negated
# MSE (higher is better), hence the sign flip.
cv_scores = -cross_val_score(pipe_lr, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on all training rows; this is the model used for prediction.
pipe_lr.fit(X_train, y_train)

# Predictions are on the log scale of the target; exponentiate to get prices.
y_pred = pipe_lr.predict(X_test)
y_pred_dollar = np.exp(y_pred)

CV Root Mean Squared Error: 0.45628252287408333


#### Ridge Regression

In [4]:
# Ridge regression (default regularization strength) on standardized features.
pipe_ridge = make_pipeline(StandardScaler(), Ridge())

# 10-fold CV with shuffling and a fixed seed so the score is reproducible.
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# cross_val_score clones and refits the pipeline for every fold, so the
# pipeline does not need to be fitted beforehand. The scorer returns negated
# MSE (higher is better), hence the sign flip.
cv_scores = -cross_val_score(pipe_ridge, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on all training rows; this is the model used for prediction.
pipe_ridge.fit(X_train, y_train)

# Predictions are on the log scale of the target; exponentiate to get prices.
y_pred = pipe_ridge.predict(X_test)
y_pred_dollar = np.exp(y_pred)

CV Root Mean Squared Error: 0.45627980401135004


In [8]:
# Candidate regularization strengths: 10 evenly spaced values from 0.1 to 1.
# The 'ridge__alpha' key addresses the `alpha` parameter of the pipeline step
# named 'ridge' (make_pipeline names steps after the lowercased class name).
ridge_alphas = list(np.linspace(0.1, 1, num=10))
param_grid = {'ridge__alpha': ridge_alphas}

# Exhaustive grid search over the candidates, scored by negated MSE and run
# on all available cores.
# NOTE(review): cv=10 is a plain fold count, not the shuffled `kf` splitter
# used for the CV scores in the other cells, so fold assignments differ.
grid_search = GridSearchCV(
    estimator=pipe_ridge,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

# Run the grid search and report the winning pipeline and its parameters.
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print(best_model)
print(best_params)

Pipeline(steps=[('standardscaler', StandardScaler()), ('ridge', Ridge())])
{'ridge__alpha': 1.0}


#### LASSO Regression

In [9]:
# Lasso regression with its default regularization strength on standardized
# features.
pipe_lasso = make_pipeline(StandardScaler(), Lasso())

# 10-fold CV with shuffling and a fixed seed so the score is reproducible.
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# cross_val_score clones and refits the pipeline for every fold, so the
# pipeline does not need to be fitted beforehand. The scorer returns negated
# MSE (higher is better), hence the sign flip.
cv_scores = -cross_val_score(pipe_lasso, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on all training rows; this is the model used for prediction.
pipe_lasso.fit(X_train, y_train)

# Predictions are on the log scale of the target; exponentiate to get prices.
y_pred = pipe_lasso.predict(X_test)
y_pred_dollar = np.exp(y_pred)

CV Root Mean Squared Error: 0.6913022197825175


In [17]:
# Candidate regularization strengths: 10 evenly spaced values from 0.1 to 1.
# The 'lasso__alpha' key addresses the `alpha` parameter of the pipeline step
# named 'lasso' (make_pipeline names steps after the lowercased class name).
# NOTE(review): the recorded run selected alpha=0.1, the lower edge of this
# grid, so smaller values may score better - consider extending the range.
lasso_alphas = list(np.linspace(0.1, 1, num=10))
param_grid = {'lasso__alpha': lasso_alphas}

# Exhaustive grid search over the candidates, scored by negated MSE and run
# on all available cores.
grid_search = GridSearchCV(
    estimator=pipe_lasso,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

# Run the grid search and report the winning pipeline and its parameters.
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print(best_model)
print(best_params)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('lasso', Lasso(alpha=0.1))])
{'lasso__alpha': 0.1}


In [18]:
# Lasso regression with alpha=0.1 on standardized features.
pipe_lasso = make_pipeline(StandardScaler(), Lasso(alpha=0.1))

# 10-fold CV with shuffling and a fixed seed so the score is reproducible.
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# cross_val_score clones and refits the pipeline for every fold, so the
# pipeline does not need to be fitted beforehand. The scorer returns negated
# MSE (higher is better), hence the sign flip.
cv_scores = -cross_val_score(pipe_lasso, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on all training rows; this is the model used for prediction.
pipe_lasso.fit(X_train, y_train)

# Predictions are on the log scale of the target; exponentiate to get prices.
y_pred = pipe_lasso.predict(X_test)
y_pred_dollar = np.exp(y_pred)

CV Root Mean Squared Error: 0.5189657511054931


In [19]:
# Export the most recent dollar-scale predictions, one row per test ID,
# as a CSV with a header row and no index column.
out = pd.DataFrame({"ID": test_ids, "price": y_pred_dollar})
out.to_csv("LASSOPredictions.csv", header=True, index=False)

In [15]:
# Lasso with alpha chosen by LassoCV's own internal 10-fold CV, on
# standardized features.
pipe_lassocv = make_pipeline(StandardScaler(), LassoCV(cv=10))

# Outer 10-fold CV with shuffling and a fixed seed so the score is reproducible.
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# cross_val_score clones and refits the pipeline for every outer fold (each
# refit runs LassoCV's inner CV), so the pipeline does not need to be fitted
# beforehand. The scorer returns negated MSE, hence the sign flip.
cv_scores = -cross_val_score(pipe_lassocv, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on all training rows; this is the model used for prediction.
pipe_lassocv.fit(X_train, y_train)

# Predictions are on the log scale of the target; exponentiate to get prices.
y_pred = pipe_lassocv.predict(X_test)
y_pred_dollar = np.exp(y_pred)

CV Root Mean Squared Error: 0.4561696322568404


In [16]:
# Export the most recent dollar-scale predictions, one row per test ID,
# as a CSV with a header row and no index column.
# NOTE(review): this writes the same filename as the earlier export cell, so
# running both leaves only the later predictions on disk - confirm intended.
out = pd.DataFrame({"ID": test_ids, "price": y_pred_dollar})
out.to_csv("LASSOPredictions.csv", header=True, index=False)

In [12]:
# Fit a default Lasso directly on the training features (no scaling step)
# to inspect which coefficients it keeps.
lasso = Lasso()
lasso.fit(X_train, y_train)

# fit() returns the estimator itself, so `model` is the same fitted object.
model = lasso

# Display the fitted coefficient vector.
model.coef_

array([ 1.43542109e-06, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  4.52038621e-05,
        0.00000000e+00,  2.69686948e-03,  0.00000000e+00,  6.91403122e-04,
       -4.84606422e-04, -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  4.16069457e-03,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -0.00000000e+00, -0.00000000e+00])

In [8]:
# Candidate regularization strengths: 10 evenly spaced values from 0.1 to 1.
# The grid key must name a step of the estimator being searched: pipe_ridge
# has the steps 'standardscaler' and 'ridge', so the key is 'ridge__alpha'.
# (A 'lasso__alpha' key would make fit() fail with an invalid-parameter error.)
param_grid = {'ridge__alpha': list(np.linspace(0.1, 1, num=10))}

# Exhaustive grid search over the candidates, scored by negated MSE and run
# on all available cores.
grid_search = GridSearchCV(estimator=pipe_ridge,
                           param_grid=param_grid,
                           cv=10,
                           n_jobs=-1,
                           scoring='neg_mean_squared_error')

# Run the grid search and report the winning pipeline and its parameters.
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print(best_model)
print(best_params)

Pipeline(steps=[('standardscaler', StandardScaler()), ('ridge', Ridge())])
{'ridge__alpha': 1.0}


The selected alpha of 1.0 is the same value Ridge uses by default, so the grid search does not change the Ridge model already fitted above.

#### ElasticNet Regression