In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import  GradientBoostingRegressor,RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
df = pd.read_csv('data/merged_df.csv')
df = df.drop(['Unnamed: 0'],axis=1)

In [3]:
x = df.drop(['_SALE_PRICE_'],axis=1)
y=df['_SALE_PRICE_'].astype('int64')

In [4]:
RFR=RandomForestRegressor(n_estimators=100, max_depth=5,random_state=0)
rmse = np.sqrt(-cross_val_score(RFR, x, y, scoring="neg_mean_squared_error", cv = 3,n_jobs=1))
print("RMSE for random forest: " + str(rmse.mean()))

RMSE for random forest: 0.645430454884


In [5]:
LGB = lgb.LGBMRegressor(objective= 'regression',min_data_in_leaf=40,n_estimator=1000,
                        num_leaves= 5,metric='mse',learning_rate=0.01,
                        bagging_fraction=0.8,feature_fraction=0.8)
rmse= np.sqrt(-cross_val_score(LGB, x, y, scoring="neg_mean_squared_error", cv = 3))
print("RMSE for lgbm: " + str(rmse.mean()))

RMSE for lgbm: 0.713403879191


In [6]:
GBoost = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
rmse = np.sqrt(-cross_val_score(GBoost, x, y, scoring="neg_mean_squared_error", cv = 3))
print("RMSE for GradientBoostingRegressor: " + str(rmse.mean()))

RMSE for GradientBoostingRegressor: 0.718573909773


In [7]:
#Memory consumption problem -> Need to much ram because of the number of features
#KRR = KernelRidge(alpha=1.0,kernel="polynomial")
#score = cross_val_score(KRR, x, y, scoring="neg_mean_squared_error", cv = 3)
#rmse= np.sqrt(-score)
#print("RMSE for KRR: " + str(rmse.mean()))

In [11]:
from sklearn.svm import SVR
svr = SVR(kernel='rbf',max_iter=1500,gamma=0.1)
score = cross_val_score(svr, x, y, scoring="neg_mean_squared_error", cv = 3)
rmse= np.sqrt(-score)
print("RMSE for SVR: " + str(rmse.mean()))

RMSE for SVR: 0.749965690337


In [12]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

In [13]:
#DISCARDING THE LEAST SUCCESSFUL MODEL
averaged_models = AveragingModels(models = (LGB, GBoost, RFR))

rmse = np.sqrt(-cross_val_score(averaged_models, x, y, scoring="neg_mean_squared_error", cv = 3))
print("RMSE for averaged models: " + str(rmse.mean()))

RMSE for averaged models: 0.675592377337


## Performance Recap
Overall, random forest provides the best performance, while both boosted trees algorithms are performing poorly. The SVR might need more iterations and a better parameter tuning to get the best out of it.
Even though I used cross validation, I think a higher number of folds would have provided more accurate assessment of each model. However, due to computation time, I restrained the number to 3.
I also tried to average models to see if it could actually improve the results of the random forest, but the boosted trees are performing too poorly to help.

I would have like to also try a kernel ridge regression, but the memory consumption is way too high for my computer because of the high number of features.

Having tested on processing for the dataset, some little tweaks have a big impact on the performance: for instance, keeping the rows that have a year_built = 0 provides great improvements to the results for the boosted trees. However, I tried to have a coherent approach to the dataset, and not tailor it to get the best results to avoid skewing the results.