In [None]:
# Simple lgbm without GSV
# V3 with GSV
# V4 with the final values of hyperparams
# V5 taking the mean of 5 predictions
# V6 with ridge regression
# V8 with make_score
# V9 with helper for GridSearch
# V10 with Ridge
# V11 without ridge
# V12 simple GS

In [None]:
%env JOBLIB_TEMP_FOLDER=/tmp # prevent memory issues
%matplotlib inline
%reload_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from fastai.structured import *
from fastai.column_data import *
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures

# Read the three input CSVs in order: labelled training rows, the rows to
# predict, and the submission template.
data, X_test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/training_data.csv',
        '../input/test_data.csv',
        '../input/sample_submission.csv',
    )
)

In [None]:
# Remove the first column of the test frame (chosen by position) in place.
# NOTE(review): this mutates X_test, so re-running the cell drops one more column.
X_test.drop(labels=X_test.columns[[0]], axis='columns', inplace=True)

In [None]:
# Removing highly correlated data
#data = data.drop(data.columns[[5,6,21,25,30]],axis=1)
#X_test = X_test.drop(X_test.columns[[6,7,22,26,31]],axis=1)

# Features are every column except the first and the last; the target is the
# last column.
X, y = data.iloc[:, 1:-1], data.iloc[:, -1]

#X_test = test
#y_test = np.zeros(test.shape[1])

In [None]:
# Hold out 20% of the rows for validation. The fixed seed keeps this split, and
# therefore every model fitted on it, reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Wrap both splits as LightGBM datasets; the validation set is created with
# the training set as its reference.
lgbm_train = lgb.Dataset(data=X_train, label=y_train)
lgbm_valid = lgb.Dataset(data=X_valid, label=y_valid, reference=lgbm_train)

In [None]:
# LightGBM training parameters. Entries marked "GS" were updated after the
# grid search.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='mape',
    learning_rate=0.06,    # GS
    feature_fraction=0.1,  # GS
    bagging_fraction=0.5,  # GS
    bagging_freq=5,        # GS
    max_depth=2,           # GS
    verbose=0,
    num_leaves=4,          # GS
)

In [None]:
# Fit the baseline booster for up to 2000 rounds, evaluating on the validation
# set; early stopping is configured with a patience of 50 rounds.
gbm = lgb.train(
    params=params,
    train_set=lgbm_train,
    valid_sets=lgbm_valid,
    num_boost_round=2000,
    early_stopping_rounds=50,
)
print('The best Iteration is', gbm.best_iteration)

In [None]:
# Grid-search (GS) switch: the LightGBM grid-search cell that follows only runs
# when this is True. NOTE(review): a later cell reassigns GS to True.
GS = False # switch GS on and off

In [None]:
def meap_(y_true,y_pred):
    """Return the mean absolute percentage error of a prediction, in percent.

    Both inputs are converted to float NumPy arrays before any arithmetic, so
    plain Python sequences are accepted and pandas objects are compared by
    position instead of being aligned on their index labels.

    Args:
        y_true: array-like of observed target values.
        y_pred: array-like of predicted values, same length as ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / |y_true|`` multiplied by 100.
        A zero in ``y_true`` makes that ratio a division by zero, so the
        result is then inf or nan rather than a finite percentage.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100

# Expose the percentage-error function as a scikit-learn scorer. It is a loss,
# hence greater_is_better=False.
mape = make_scorer(meap_, greater_is_better=False)

if GS:
    # Candidate values tried for every LightGBM hyperparameter being tuned.
    grid_params = dict(
        max_depth=[1, 2, 100],
        num_leaves=[2, 3, 4, 5, 10, 50],
        learning_rate=[0.055, 0.06, 0.061],
        feature_fraction=[0.1, 0.5, 0.9],
        bagging_fraction=[0.1, 0.5, 0.8],
        bagging_freq=[3, 5, 7],
    )

    # Only settings that are NOT part of grid_params are fixed here. The
    # commented keyword arguments are the ones currently being searched;
    # uncomment one once its best value is known and removed from the grid.
    mdl = lgb.LGBMRegressor(
        boosting_type=params['boosting_type'],
        objective=params['objective'],
        metric=params['metric'],
        #num_leaves = params['num_leaves'],
        #learning_rate = params['learning_rate'],
        #feature_fraction = params['feature_fraction'],
        #bagging_fraction = params['bagging_fraction'],
        #bagging_freq = params['bagging_freq']
    )

    # 5-fold cross-validated search scored with the MAPE scorer above.
    grid = GridSearchCV(mdl, grid_params, scoring=mape, cv=5)
    grid.fit(X_train, y_train)

    print(grid.best_params_)

In [None]:
# Train one booster per random train/validation split and buffer each booster's
# test-set prediction; the buffer is averaged in a later cell.
pred_buff = []
n_itr = 5
for i in range(n_itr):
    # The iteration number doubles as the split seed, so each booster is
    # trained and validated on a different 80/20 partition.
    # NOTE: this rebinds the notebook-level X_train/X_valid/y_train/y_valid.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=i)
    lgbm_train = lgb.Dataset(X_train, y_train)  # Preparing data for lgb
    lgbm_valid = lgb.Dataset(X_valid, y_valid, reference=lgbm_train)  # Preparing data for lgb

    gbm_itr = lgb.train(
        params,
        lgbm_train,
        valid_sets=lgbm_valid,
        num_boost_round=150,
        early_stopping_rounds=5,  # stop if the results do not improve in 5 rounds
        verbose_eval=False,
    )

    # Predict with the booster fitted in THIS iteration, at its own best
    # iteration. Using the baseline `gbm` here would append the same
    # prediction n_itr times and make the averaging pointless.
    y_pred = gbm_itr.predict(X_test, num_iteration=gbm_itr.best_iteration)
    pred_buff.append(y_pred)

In [None]:
class EstimatorSelectionHelper:
    """Run one GridSearchCV per estimator and tabulate the per-split test scores."""

    def __init__(self, models, params):
        """Store the estimators and their parameter grids.

        Args:
            models: dict mapping a name to an estimator instance.
            params: dict mapping the same names to the parameter grid that is
                passed to GridSearchCV for that estimator.

        Raises:
            ValueError: if a key of ``models`` has no entry in ``params``.
        """
        missing_params = list(set(models.keys()) - set(params.keys()))
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        # name -> fitted GridSearchCV; populated by fit()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Fit a GridSearchCV for every registered estimator.

        ``cv``, ``n_jobs``, ``verbose``, ``scoring`` and ``refit`` are passed
        straight through to GridSearchCV. Each fitted search object is stored
        in ``self.grid_searches`` under the estimator's name.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Summarise the per-split test scores of every fitted grid search.

        Args:
            sort_by: name of the column to sort on, in descending order.

        Returns:
            A DataFrame with one row per (estimator, parameter combination).
            The columns ``estimator``, ``min_score``, ``mean_score``,
            ``max_score`` and ``std_score`` come first, followed by one column
            per searched parameter. An empty frame with just those five
            columns is returned when there is nothing to summarise.
        """
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']

        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
            print(k)
            results = self.grid_searches[k].cv_results_
            params = results['params']

            # Discover the splits from the result keys instead of range(cv):
            # `cv` is only an int when the caller passed one, whereas the
            # "split<i>_test_score" entries are present however the folds
            # were specified.
            scores = []
            split = 0
            while "split{}_test_score".format(split) in results:
                r = np.asarray(results["split{}_test_score".format(split)])
                scores.append(r.reshape(len(params),1))
                split += 1
            if not scores:
                continue

            # One row per parameter combination, one column per split.
            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))

        if not rows:
            # pd.concat refuses an empty list; report "no results" instead.
            return pd.DataFrame(columns=columns)

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error

In [None]:
GS = True
if GS:
    # One default-constructed instance per candidate ensemble regressor.
    # SVR is currently left out of the comparison.
    models1 = dict(
        ExtraTreesRegressor=ExtraTreesRegressor(),
        RandomForestRegressor=RandomForestRegressor(),
        AdaBoostRegressor=AdaBoostRegressor(),
        GradientBoostingRegressor=GradientBoostingRegressor(),
        # SVR=SVR(),
    )

    # Parameter grid for each estimator, keyed by the same names as models1.
    params1 = dict(
        ExtraTreesRegressor={'n_estimators': [16, 32]},
        RandomForestRegressor={'n_estimators': [16, 32]},
        AdaBoostRegressor={'n_estimators': [16, 32]},
        GradientBoostingRegressor={'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0]},
        # SVR=[
        #     {'kernel': ['linear'], 'C': [1, 10]},
        #     {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
        # ],
    )

    helper1 = EstimatorSelectionHelper(models1, params1)
    helper1.fit(X_train, y_train, n_jobs=2, scoring='neg_mean_absolute_error')

In [None]:
if GS:
    from sklearn.linear_model import Ridge

    # Fine sweep of the regularisation strength: 7547000 up to (not including)
    # 7548000 in steps of 10. A coarser grid tried earlier was [7000000, 8000000].
    par = dict(alpha=list(np.arange(7547000, 7548000, 10)))

    mdl = Ridge()
    # 2-fold cross-validated search scored with the MAPE scorer.
    gs = GridSearchCV(mdl, par, scoring=mape, cv=2)
    gs.fit(X_train, y_train)
    print(gs.best_params_)

In [None]:
# Switch for adding a Ridge model's prediction to the averaged ensemble.
# NOTE(review): this rebinds the name `Ridge`, which until this cell referred
# to the class imported from sklearn.linear_model.
Ridge = False

In [None]:
if Ridge is True:
    # `Ridge` is the boolean switch at this point, not the estimator class, so
    # calling Ridge(alpha=...) would raise "'bool' object is not callable".
    # Import the class under a separate name for use inside this block.
    from sklearn.linear_model import Ridge as RidgeRegressor

    # alpha is the regularisation strength explored by the Ridge grid search.
    ridge = RidgeRegressor(alpha=7547000)
    ridge.fit(X_train, y_train)

    # Add the Ridge prediction to the buffer and average every buffered
    # prediction element-wise.
    pred_buff.append(ridge.predict(X_test))
    pred = np.mean(pred_buff, axis=0)

In [None]:
# Element-wise average over all buffered predictions (one entry per model).
pred = np.asarray(pred_buff).mean(axis=0)

In [None]:
# Put the averaged prediction into the template's 'shares' column and write
# the submission file without the index column.
sub = sub.assign(shares=pred)
sub.to_csv('submission.csv', index=False)