In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import RANSACRegressor

In [2]:
# Load the prepared table and split it by row position: the first N_TRAIN rows
# are the training set, every later row is predicted and exported.
N_TRAIN = 7000

df9 = pd.read_csv("df_9.csv")
test_ids = df9["ID"].iloc[N_TRAIN:].values  # save IDs for later output

# Drop the saved index column, the ID, the raw price and two host_* columns.
# Rebinding (instead of inplace=True) keeps the cell free of in-place mutation.
df9 = df9.drop(columns=["Unnamed: 0", "ID", "price", "host_has_profile_pic_t", "host_identity_verified_t"])

# Separate the target from the features once, then slice both by position.
features = df9.drop(columns=["log_price"])
y_train = df9["log_price"].iloc[:N_TRAIN].values
X_train = features.iloc[:N_TRAIN].values
X_test = features.iloc[N_TRAIN:].values

#### Linear Regression

In [None]:
# Baseline model: standardize the features, then ordinary least squares.
pipe_lr = make_pipeline(StandardScaler(), LinearRegression())

# 10-fold shuffled cross-validation with a fixed seed. cross_val_score fits a
# fresh clone of the pipeline on every fold, so no fit is needed beforehand
# (the redundant pre-CV fit was removed; results are unchanged).
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = -cross_val_score(pipe_lr, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# RMSE reported as the square root of the mean per-fold MSE.
mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on the full training set, then predict the held-out rows.
pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)
# Target is log_price; exp() maps back to price (assumes natural log - confirm).
y_pred_dollar = np.exp(y_pred)

In [None]:
# Standardize, project with PCA (n_components=0.9, full SVD solver), then
# ordinary least squares.
pipe_pca_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=0.9, svd_solver='full'),
    LinearRegression(),
)

# 10-fold shuffled cross-validation with a fixed seed. cross_val_score fits a
# fresh clone of the pipeline on every fold, so no fit is needed beforehand
# (the redundant pre-CV fit was removed; results are unchanged).
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = -cross_val_score(pipe_pca_lr, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# RMSE reported as the square root of the mean per-fold MSE.
mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on the full training set, then predict the held-out rows.
pipe_pca_lr.fit(X_train, y_train)
y_pred = pipe_pca_lr.predict(X_test)
# Target is log_price; exp() maps back to price (assumes natural log - confirm).
y_pred_dollar = np.exp(y_pred)

#### Ridge Regression

In [None]:
# Standardize the features, then ridge regression with its default settings.
pipe_ridge = make_pipeline(StandardScaler(), Ridge())

# 10-fold shuffled cross-validation with a fixed seed. cross_val_score fits a
# fresh clone of the pipeline on every fold, so no fit is needed beforehand
# (the redundant pre-CV fit was removed; results are unchanged).
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = -cross_val_score(pipe_ridge, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# RMSE reported as the square root of the mean per-fold MSE.
mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on the full training set, then predict the held-out rows.
pipe_ridge.fit(X_train, y_train)
y_pred = pipe_ridge.predict(X_test)
# Target is log_price; exp() maps back to price (assumes natural log - confirm).
y_pred_dollar = np.exp(y_pred)

In [None]:
# Candidate ridge penalties: ten evenly spaced values from 0.1 to 1.
param_grid = {'ridge__alpha': list(np.linspace(0.1, 1, num=10))}

# Exhaustive grid search over the candidates: 10-fold CV, scored by negated
# MSE, using all available cores.
grid_search = GridSearchCV(
    pipe_ridge,
    param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
)

# Run the grid search and report the best pipeline and its parameters.
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print(best_model)
print(best_params)

In [None]:
# Standardize, project with PCA (n_components=0.9, full SVD solver), then
# ridge regression with its default settings.
pipe_pca_ridge = make_pipeline(
    StandardScaler(),
    PCA(n_components=0.9, svd_solver='full'),
    Ridge(),
)

# 10-fold shuffled cross-validation with a fixed seed. cross_val_score fits a
# fresh clone of the pipeline on every fold, so no fit is needed beforehand
# (the redundant pre-CV fit was removed; results are unchanged).
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = -cross_val_score(pipe_pca_ridge, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# RMSE reported as the square root of the mean per-fold MSE.
mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on the full training set, then predict the held-out rows.
pipe_pca_ridge.fit(X_train, y_train)
y_pred = pipe_pca_ridge.predict(X_test)
# Target is log_price; exp() maps back to price (assumes natural log - confirm).
y_pred_dollar = np.exp(y_pred)

#### LASSO Regression

In [None]:
# Standardize the features, then Lasso with its default settings.
pipe_lasso = make_pipeline(StandardScaler(), Lasso())

# 10-fold shuffled cross-validation with a fixed seed. cross_val_score fits a
# fresh clone of the pipeline on every fold, so no fit is needed beforehand
# (the redundant pre-CV fit was removed; results are unchanged).
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = -cross_val_score(pipe_lasso, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# RMSE reported as the square root of the mean per-fold MSE.
mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on the full training set, then predict the held-out rows.
pipe_lasso.fit(X_train, y_train)
y_pred = pipe_lasso.predict(X_test)
# Target is log_price; exp() maps back to price (assumes natural log - confirm).
y_pred_dollar = np.exp(y_pred)

In [None]:
# Candidate Lasso penalties: ten evenly spaced values from 0.1 to 1.
param_grid = {'lasso__alpha': list(np.linspace(0.1, 1, num=10))}

# Exhaustive grid search over the candidates: 10-fold CV, scored by negated
# MSE, using all available cores.
grid_search = GridSearchCV(
    pipe_lasso,
    param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
)

# Run the grid search and report the best pipeline and its parameters.
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print(best_model)
print(best_params)

In [None]:
# Standardize the features, then Lasso with alpha fixed at 0.1.
pipe_lasso = make_pipeline(StandardScaler(), Lasso(alpha=0.1))

# 10-fold shuffled cross-validation with a fixed seed. cross_val_score fits a
# fresh clone of the pipeline on every fold, so no fit is needed beforehand
# (the redundant pre-CV fit was removed; results are unchanged).
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = -cross_val_score(pipe_lasso, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# RMSE reported as the square root of the mean per-fold MSE.
mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on the full training set, then predict the held-out rows.
pipe_lasso.fit(X_train, y_train)
y_pred = pipe_lasso.predict(X_test)
# Target is log_price; exp() maps back to price (assumes natural log - confirm).
y_pred_dollar = np.exp(y_pred)

In [None]:
# Export the current dollar-scale predictions alongside the saved test IDs.
out = pd.DataFrame(data={"ID": test_ids, "price": y_pred_dollar})
out.to_csv("LASSOPredictions.csv", header=True, index=False)

In [None]:
# Standardize, project with PCA (n_components=0.9, full SVD solver), then
# Lasso with alpha fixed at 0.1.
pipe_pca_lasso = make_pipeline(
    StandardScaler(),
    PCA(n_components=0.9, svd_solver='full'),
    Lasso(alpha=0.1),
)

# 10-fold shuffled cross-validation with a fixed seed. cross_val_score fits a
# fresh clone of the pipeline on every fold, so no fit is needed beforehand
# (the redundant pre-CV fit was removed; results are unchanged).
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = -cross_val_score(pipe_pca_lasso, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# RMSE reported as the square root of the mean per-fold MSE.
mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on the full training set, then predict the held-out rows.
pipe_pca_lasso.fit(X_train, y_train)
y_pred = pipe_pca_lasso.predict(X_test)
# Target is log_price; exp() maps back to price (assumes natural log - confirm).
y_pred_dollar = np.exp(y_pred)

In [None]:
# Standardize the features, then LassoCV, which chooses its own alpha with an
# internal 10-fold CV.
pipe_lassocv = make_pipeline(StandardScaler(), LassoCV(cv=10))

# Outer 10-fold shuffled cross-validation with a fixed seed. cross_val_score
# fits a fresh clone of the pipeline on every fold, so no fit is needed
# beforehand (the redundant pre-CV fit, itself a full internal CV run, was
# removed; results are unchanged).
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = -cross_val_score(pipe_lassocv, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# RMSE reported as the square root of the mean per-fold MSE.
mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on the full training set, then predict the held-out rows.
pipe_lassocv.fit(X_train, y_train)
y_pred = pipe_lassocv.predict(X_test)
# Target is log_price; exp() maps back to price (assumes natural log - confirm).
y_pred_dollar = np.exp(y_pred)

In [None]:
# Export the current dollar-scale predictions alongside the saved test IDs.
# NOTE(review): this writes to the same "LASSOPredictions.csv" path as the
# earlier Lasso export cell and therefore overwrites it - confirm intended.
out = pd.DataFrame(data={"ID": test_ids, "price": y_pred_dollar})
out.to_csv("LASSOPredictions.csv", header=True, index=False)

In [None]:
# Fit a default Lasso directly on the unscaled training features and display
# the fitted coefficient vector.
lasso = Lasso()
lasso.fit(X_train, y_train)
model = lasso  # fit() returns the estimator itself

model.coef_

In [None]:
# Candidate Lasso penalties: ten evenly spaced values from 0.1 to 1.
param_grid = {'lasso__alpha': list(np.linspace(0.1, 1, num=10))}

# The grid key must name a step of the searched pipeline. The original paired
# 'lasso__alpha' with pipe_ridge, which has no 'lasso' step, so the search
# failed with an invalid-parameter error. Search the Lasso pipeline instead.
# NOTE(review): if a ridge search was intended, keep pipe_ridge and rename the
# key to 'ridge__alpha' - confirm.
grid_search = GridSearchCV(estimator=pipe_lasso,
                           param_grid=param_grid,
                           cv=10,
                           n_jobs=-1,
                           scoring='neg_mean_squared_error')

# Run the grid search and report the best pipeline and its parameters.
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print(best_model)
print(best_params)

An alpha of 1.0 was already used in the default version.

#### ElasticNet Regression

In [None]:
# Standardize the features, then ElasticNet with alpha=1.0 and l1_ratio=0.5.
pipe_elasticnet = make_pipeline(StandardScaler(), ElasticNet(alpha=1.0, l1_ratio=0.5))

# 10-fold shuffled cross-validation with a fixed seed. cross_val_score fits a
# fresh clone of the pipeline on every fold, so no fit is needed beforehand
# (the redundant pre-CV fit was removed; results are unchanged).
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = -cross_val_score(pipe_elasticnet, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# RMSE reported as the square root of the mean per-fold MSE.
mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on the full training set, then predict the held-out rows.
pipe_elasticnet.fit(X_train, y_train)
y_pred = pipe_elasticnet.predict(X_test)
# Target is log_price; exp() maps back to price (assumes natural log - confirm).
y_pred_dollar = np.exp(y_pred)

In [None]:
# Hyper-parameter grid for the ElasticNet step of the pipeline.
grid = dict()
# alpha=0.0 was removed from the candidates: with no penalty the model is
# plain least squares, which scikit-learn advises fitting with
# LinearRegression rather than the coordinate-descent solver.
grid['elasticnet__alpha'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0]
# NOTE(review): the ElasticNet docs flag l1_ratio <= 0.01 as unreliable; the
# range is kept as-is here - confirm whether the lowest values should go.
grid['elasticnet__l1_ratio'] = np.arange(0, 1, 0.01)

# Exhaustive grid search: 10-fold CV, scored by negated MSE, all cores.
grid_search = GridSearchCV(estimator=pipe_elasticnet,
                           param_grid=grid,
                           cv=10,
                           n_jobs=-1,
                           scoring='neg_mean_squared_error')

# Run the grid search and report the best pipeline and its parameters.
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print(best_model)
print(best_params)

In [None]:
# Standardize the features, then ElasticNet with alpha=0.001 and l1_ratio=0.69.
pipe_elasticnet = make_pipeline(StandardScaler(), ElasticNet(alpha=0.001, l1_ratio=0.69))

# 10-fold shuffled cross-validation with a fixed seed. cross_val_score fits a
# fresh clone of the pipeline on every fold, so no fit is needed beforehand
# (the redundant pre-CV fit was removed; results are unchanged).
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = -cross_val_score(pipe_elasticnet, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# RMSE reported as the square root of the mean per-fold MSE.
mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on the full training set, then predict the held-out rows.
pipe_elasticnet.fit(X_train, y_train)
y_pred = pipe_elasticnet.predict(X_test)
# Target is log_price; exp() maps back to price (assumes natural log - confirm).
y_pred_dollar = np.exp(y_pred)

In [None]:
# Export the current dollar-scale predictions alongside the saved test IDs.
out = pd.DataFrame(data={"ID": test_ids, "price": y_pred_dollar})
out.to_csv("ElasticNetPredictions.csv", header=True, index=False)

#### RANSAC

In [3]:
# Standardize the features, then RANSAC around LinearRegression with
# residual_threshold=5.0 (the one setting varied across the RANSAC cells).
pipe_ransac = make_pipeline(
    StandardScaler(),
    RANSACRegressor(
        LinearRegression(),
        max_trials=100,
        min_samples=50,
        residual_threshold=5.0,
        random_state=42,
    ),
)

# 10-fold shuffled cross-validation with a fixed seed. cross_val_score fits a
# fresh clone of the pipeline on every fold, so no fit is needed beforehand
# (the redundant pre-CV fit was removed; with the fixed random_state the
# reported score and predictions are unchanged).
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = -cross_val_score(pipe_ransac, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# RMSE reported as the square root of the mean per-fold MSE.
mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on the full training set, then predict the held-out rows.
pipe_ransac.fit(X_train, y_train)
y_pred = pipe_ransac.predict(X_test)
# Target is log_price; exp() maps back to price (assumes natural log - confirm).
y_pred_dollar = np.exp(y_pred)

CV Root Mean Squared Error: 0.45998035190695036


In [4]:
# Standardize the features, then RANSAC around LinearRegression with
# residual_threshold=4.0 (the one setting varied across the RANSAC cells).
pipe_ransac = make_pipeline(
    StandardScaler(),
    RANSACRegressor(
        LinearRegression(),
        max_trials=100,
        min_samples=50,
        residual_threshold=4.0,
        random_state=42,
    ),
)

# 10-fold shuffled cross-validation with a fixed seed. cross_val_score fits a
# fresh clone of the pipeline on every fold, so no fit is needed beforehand
# (the redundant pre-CV fit was removed; with the fixed random_state the
# reported score and predictions are unchanged).
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = -cross_val_score(pipe_ransac, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# RMSE reported as the square root of the mean per-fold MSE.
mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on the full training set, then predict the held-out rows.
pipe_ransac.fit(X_train, y_train)
y_pred = pipe_ransac.predict(X_test)
# Target is log_price; exp() maps back to price (assumes natural log - confirm).
y_pred_dollar = np.exp(y_pred)

CV Root Mean Squared Error: 0.46028860227413426


In [5]:
# Standardize the features, then RANSAC around LinearRegression with
# residual_threshold=3.0 (the one setting varied across the RANSAC cells).
pipe_ransac = make_pipeline(
    StandardScaler(),
    RANSACRegressor(
        LinearRegression(),
        max_trials=100,
        min_samples=50,
        residual_threshold=3.0,
        random_state=42,
    ),
)

# 10-fold shuffled cross-validation with a fixed seed. cross_val_score fits a
# fresh clone of the pipeline on every fold, so no fit is needed beforehand
# (the redundant pre-CV fit was removed; with the fixed random_state the
# reported score and predictions are unchanged).
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = -cross_val_score(pipe_ransac, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# RMSE reported as the square root of the mean per-fold MSE.
mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on the full training set, then predict the held-out rows.
pipe_ransac.fit(X_train, y_train)
y_pred = pipe_ransac.predict(X_test)
# Target is log_price; exp() maps back to price (assumes natural log - confirm).
y_pred_dollar = np.exp(y_pred)

CV Root Mean Squared Error: 0.46008750008114485


In [6]:
# Standardize the features, then RANSAC around LinearRegression with
# residual_threshold=2.0 (the one setting varied across the RANSAC cells).
pipe_ransac = make_pipeline(
    StandardScaler(),
    RANSACRegressor(
        LinearRegression(),
        max_trials=100,
        min_samples=50,
        residual_threshold=2.0,
        random_state=42,
    ),
)

# 10-fold shuffled cross-validation with a fixed seed. cross_val_score fits a
# fresh clone of the pipeline on every fold, so no fit is needed beforehand
# (the redundant pre-CV fit was removed; with the fixed random_state the
# reported score and predictions are unchanged).
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = -cross_val_score(pipe_ransac, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# RMSE reported as the square root of the mean per-fold MSE.
mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on the full training set, then predict the held-out rows.
pipe_ransac.fit(X_train, y_train)
y_pred = pipe_ransac.predict(X_test)
# Target is log_price; exp() maps back to price (assumes natural log - confirm).
y_pred_dollar = np.exp(y_pred)

CV Root Mean Squared Error: 0.46301135888196304


In [7]:
# Standardize the features, then RANSAC around LinearRegression with
# residual_threshold=6.0 (the one setting varied across the RANSAC cells).
pipe_ransac = make_pipeline(
    StandardScaler(),
    RANSACRegressor(
        LinearRegression(),
        max_trials=100,
        min_samples=50,
        residual_threshold=6.0,
        random_state=42,
    ),
)

# 10-fold shuffled cross-validation with a fixed seed. cross_val_score fits a
# fresh clone of the pipeline on every fold, so no fit is needed beforehand
# (the redundant pre-CV fit was removed; with the fixed random_state the
# reported score and predictions are unchanged).
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
cv_scores = -cross_val_score(pipe_ransac, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# RMSE reported as the square root of the mean per-fold MSE.
mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

# Single fit on the full training set, then predict the held-out rows.
pipe_ransac.fit(X_train, y_train)
y_pred = pipe_ransac.predict(X_test)
# Target is log_price; exp() maps back to price (assumes natural log - confirm).
y_pred_dollar = np.exp(y_pred)

CV Root Mean Squared Error: 0.4568717355428342


In [8]:
# Export the current dollar-scale predictions alongside the saved test IDs.
out = pd.DataFrame(data={"ID": test_ids, "price": y_pred_dollar})
out.to_csv("RANSACPredictions.csv", header=True, index=False)