In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin, clone
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Read the merged dataset and discard the 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved — TODO confirm).
df = pd.read_csv('data/merged_df.csv').drop(columns=['Unnamed: 0'])

In [3]:
# Replace every object/categorical column with integer codes.
# The encoder is created here; the original cell referenced `le` without ever
# defining it, which raises NameError on a fresh kernel.
le = LabelEncoder()
for c in df.select_dtypes(include=['object', 'category']).columns:
    # fit_transform re-fits the encoder for each column, so codes are
    # assigned independently per column.
    df[c] = le.fit_transform(df[c])

# Feature matrix: every column except the target.
x = df.drop(['_SALE_PRICE_'], axis=1)
# Target vector, cast to 64-bit integers.
y = df['_SALE_PRICE_'].astype('int64')


In [17]:
# Random forest baseline, scored by cross-validated RMSE over 3 folds.
RFR = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=0,
)
# cross_val_score reports negated MSE per fold; flip the sign before the root.
rfr_neg_mse = cross_val_score(
    RFR, x, y,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=1,
)
rmse = np.sqrt(-rfr_neg_mse)
print("RMSE for random forest: " + str(rmse.mean()))

KeyboardInterrupt: 

In [None]:
# Kernel ridge regression with a polynomial kernel, scored by cross-validated
# RMSE over 3 folds.
KRR = KernelRidge(kernel="polynomial", alpha=1.0)
# Scores come back as negated MSE, one value per fold.
krr_neg_mse = cross_val_score(
    KRR, x, y,
    scoring="neg_mean_squared_error",
    cv=3,
)
rmse = np.sqrt(-krr_neg_mse)
print("RMSE for KRR: " + str(rmse.mean()))

In [None]:
# LightGBM hyper-parameters, forwarded to the scikit-learn wrapper as keyword
# arguments. 'metric' is given as a plain string rather than a one-element set.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'num_leaves': 10,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
# LightGBM warning noted by the author: "Accuracy may be bad since you didn't
# set num_leaves and 2^max_depth > num_leaves". num_leaves is set above.
#
# The dict must be unpacked with **: passed positionally, the whole dict would
# be bound to the constructor's first parameter instead of being applied as
# individual settings.
LGB = lgb.LGBMRegressor(**params)
# cross_val_score reports negated MSE per fold; negate before taking the root.
rmse = np.sqrt(-cross_val_score(LGB, x, y, scoring="neg_mean_squared_error", cv=3))
print("RMSE for lgbm: " + str(rmse.mean()))

In [19]:
# Gradient boosting with the Huber loss, chosen for robustness to outliers.
GBoost = GradientBoostingRegressor(
    loss='huber',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    random_state=5,
)
# Per-fold scores are negated MSE; flip the sign and take the root for RMSE.
gboost_neg_mse = cross_val_score(
    GBoost, x, y,
    scoring="neg_mean_squared_error",
    cv=3,
)
rmse = np.sqrt(-gboost_neg_mse)
print("RMSE for GradientBoostingRegressor: " + str(rmse.mean()))

RMSE for GradientBoostingRegressor: 0.646569376592


In [20]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor whose prediction is the mean of its base models' predictions.

    Parameters
    ----------
    models : iterable of estimators
        Base regressors. ``fit`` trains clones of them and stores the clones
        in ``models_``; the objects passed in are not fitted by this class.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every base model, fit each clone on ``(X, y)`` and return ``self``."""
        # Work on clones so the estimators handed to the constructor stay unfitted.
        self.models_ = list(map(clone, self.models))
        for fitted_model in self.models_:
            fitted_model.fit(X, y)
        return self

    def predict(self, X):
        """Return the average, across the fitted clones, of their predictions for ``X``."""
        # Place each model's predictions side by side, then average along axis 1.
        stacked = np.column_stack([est.predict(X) for est in self.models_])
        return stacked.mean(axis=1)

In [21]:
# Average the four base regressors and score the blend with the same
# 3-fold cross-validated RMSE used for the individual models.
averaged_models = AveragingModels(models=(LGB, GBoost, RFR, KRR))

# Per-fold scores are negated MSE; flip the sign and take the root for RMSE.
blend_neg_mse = cross_val_score(
    averaged_models, x, y,
    scoring="neg_mean_squared_error",
    cv=3,
)
rmse = np.sqrt(-blend_neg_mse)
print("RMSE for averaged models: " + str(rmse.mean()))

KeyboardInterrupt: 