In [1]:
import pandas as pd
import numpy as np
import time_series_module as tsm
import time_series_cross_valid as tscv
import time_series_versioning as tsver
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
from importlib import reload
from statsmodels.graphics import tsaplots
from statsmodels.api import tsa

In [2]:
# Load the training data and parse the timestamp column.
df = pd.read_csv('train.csv')
df['date_time'] = pd.to_datetime(df['date_time'])
# NOTE(review): the recorded output of this cell is
# "TypeError: 'bool' object is not subscriptable". The LightGBM section calls the
# same helper with list-valued flags (hour = [True, False], ...), so this call
# presumably targets an older signature of tsm.get_date_time_features -- confirm
# against time_series_module before re-running.
date_time_features = tsm.get_date_time_features(df, 'date_time', 
                                                one_hot_encoding = {'hour' : True, 'day' : False, 'month' : False, 'season' : False}, 
                                                hour = True, day = True, month = True, season = False, year = False)

#date_time_features = df[date_time_features_names].copy()
# Keep a copy of the timestamps, then remove the raw column from the frame.
date_time_column = df['date_time'].copy()
df.drop(columns = ['date_time'], inplace = True)

TypeError: 'bool' object is not subscriptable

In [None]:
targets = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

In [None]:
df

In [None]:
# Column names of the five sensor readings: sensor_1 ... sensor_5.
sensors = [f'sensor_{number}' for number in range(1, 6)]

In [None]:
# Dictionary handed to tsm.feature_extractor as features_dict.
f_dict = {}
# NOTE(review): the STL argument is a positional list
# [timestamps, columns, True, False, 'additive']; the meaning of the two booleans
# is defined in time_series_module -- confirm there.
df = tsm.feature_extractor(features_dict = f_dict, data_frame = df, inplace = True, 
                           STL = [date_time_column, targets + sensors,
                                  True, False, 'additive'])
# The raw sensor columns are removed after the extractor has run.
df.drop(columns = sensors, inplace = True)
#df = tsm.feature_extractor(features_dict = f_dict, data_frame = df, inplace = True, lagged = [list(df.columns), 24, False])

In [None]:
# Drop every row that contains a missing value (in place).
# NOTE(review): date_time_column was copied before this step and keeps all rows,
# so it is no longer row-aligned with df if anything is dropped here. The
# commented lines below look like the re-alignment step -- confirm whether it is
# still required.
df.dropna(inplace = True)
#date_time_features = date_time_features.loc[list(df.index),:]
#date_time_column = date_time_column[df.index]
#df.index, date_time_column.index, date_time_features.index = [list(range(len(df)))]*3

In [None]:
def get_feature_list(data, date_time_features = (), target_columns = None):
    """Return the columns of ``data`` that can be used as model features.

    A column is kept unless it is a target, is contained in
    ``date_time_features``, or is the raw ``'date_time'`` column. The original
    column order is preserved.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column names are filtered.
    date_time_features : container, optional
        Names to exclude; membership is tested with ``in``. Defaults to an
        empty tuple so that one-argument calls (``get_feature_list(frame)``)
        work instead of raising ``TypeError``.
    target_columns : iterable of str, optional
        Target names to exclude. When omitted, the notebook-level ``targets``
        list is used.

    Returns
    -------
    list
        The remaining column names.
    """
    # Resolve the targets lazily so the notebook-level list is only needed
    # when the caller does not pass its own.
    excluded = set(targets if target_columns is None else target_columns)
    excluded.add('date_time')
    return [f for f in data.columns if f not in excluded and f not in date_time_features]

In [None]:
features = get_feature_list(df, date_time_features)
features

In [None]:
data_versions = tsver.DataVersions()

data_versions.push(tsver.nested_data(df, features, targets, targets, [date_time_column, date_time_features]), 
                   key = 'original_data')
data_versions.save_with_pickle('original_data')

In [None]:
rcParams['figure.figsize'] = 20, 8

In [None]:
def mape(y, y_hat):
    """Element-wise absolute percentage error between ``y`` and ``y_hat``.

    Parameters
    ----------
    y : array-like
        Actual values.
    y_hat : array-like
        Predicted values, broadcastable against ``y``.

    Returns
    -------
    Same shape as the broadcast inputs: ``100 * |y - y_hat| / |y|``. No
    averaging is done here; callers aggregate the values themselves.

    Notes
    -----
    The denominator is ``|y|`` so the result is non-negative even when the
    actual value is negative. Entries where ``y`` is zero are not handled
    specially and yield ``inf``/``nan``.
    """
    return 100*np.abs(y - y_hat) / np.abs(y)

### Ridge regression

In [None]:
from sklearn.linear_model import Ridge

In [None]:
class RidgeModel:
    """Thin adapter that gives a ridge estimator the fit/predict/importances shape used in this notebook."""

    def __init__(self, ridge):
        # The wrapped estimator; fit() trains this very object.
        self.ridge = ridge

    def fit(self, X,Y):
        """Fit the wrapped estimator, remember its ``coef_`` as ``importances`` and return the fit result."""
        estimator = self.ridge.fit(X, Y)
        self.importances = estimator.coef_
        return estimator

    def predict(self, X):
        """Delegate prediction to the wrapped estimator."""
        return self.ridge.predict(X)

In [None]:
# Configuration of the ridge experiment; these objects are passed to tscv.run_cv below.
ridge_horizons = 1
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this constructor call raises TypeError on newer releases --
# confirm the scikit-learn version this notebook is pinned to.
ridge_obj = Ridge(alpha = .001, normalize = True)
ridge_model = RidgeModel(ridge_obj)
# Passed to run_cv as CrossValid_params.
ridge_CrossValid_params = {'train_size' : 2000, 'test_size' : 1, 'min_period' : 0, 'step' : 500}
# Passed to run_cv as ForecastModel_params.
ridge_ForecastModel_params = {'features' : features, 'date_time' : date_time_features,
                              'prior_lag' : 24, 'post_lag' : 0, 'new_index' : True}

In [None]:
ridge_quality = tscv.run_cv(data = df, targets = targets, horizons = ridge_horizons, CrossValid_params = ridge_CrossValid_params, 
            ForecastModel_params = ridge_ForecastModel_params, model = ridge_model, metrics = mape)

In [None]:
tscv.plot_cv_results(quality = ridge_quality, horizons = ridge_horizons, plot_loss = True)

In [None]:
# Print the mean train and test loss of the ridge model for every target.
ridge_losses = tscv.get_losses(ridge_quality, ridge_horizons)
for tar in targets:
    tar_losses = ridge_losses[tar]
    print(tar + ' train_loss: ', tar_losses['train_loss'].mean())
    print(tar + ' test_loss: ', tar_losses['test_loss'].mean())
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

In [None]:
ridge_resid = tscv.get_residuals(ridge_quality, ridge_horizons)

In [None]:
# Histogram of the benzene training residuals. The former trailing `.plot()` drew
# nothing on the returned Axes and would fail if `.hist()` returned an array of Axes.
ridge_resid['target_benzene']['train_resid'].hist();

In [None]:
# Histogram of the benzene training losses. The former trailing `.plot()` drew
# nothing on the returned Axes and would fail if `.hist()` returned an array of Axes.
ridge_losses['target_benzene']['train_loss'].hist();

In [None]:
ridge_resid[targets[2]]['train_resid'].plot.hist()

In [None]:
ridge_columns = ridge_quality['target_benzene']['forecast_model'][0].features + ridge_quality['target_benzene']['forecast_model'][0].targets
ridge_feature_corr = ridge_quality['target_benzene']['forecast_model'][0].data[ridge_columns].corr()

In [None]:
#ridge_feature_corr.loc[ridge_feature_corr.loc['lag_sensor_2_1'] >= 0.8, ridge_feature_corr.loc['lag_sensor_2_1'] >= 0.8]

In [None]:
# Collect the ridge coefficients per target (entry 0 of each target's importances).
ridge_importances_dict = tscv.get_importances(ridge_quality, ridge_horizons)
ridge_imp_features_0 = ridge_importances_dict['target_carbon_monoxide'][0]
ridge_imp_features_1 = ridge_importances_dict['target_benzene'][0]
# For nitrogen oxides only the names of coefficients with value >= 1 or <= -1 are kept.
nox_importances = ridge_importances_dict['target_nitrogen_oxides'][0]
nox_is_large = (nox_importances >= 1) | (nox_importances <= -1)
ridge_imp_features_2 = list(nox_importances[nox_is_large].index)

In [None]:
ridge_imp_features_1.apply(np.abs).sort_values(ascending = False).head(30)

In [None]:
# Walk the benzene coefficients from largest to smallest magnitude. The absolute
# value is taken BEFORE sorting; sorting the signed values first (as before) does
# not order the names by magnitude.
# NOTE(review): the printed value comes from the carbon-monoxide coefficients
# (ridge_imp_features_0) while the order comes from benzene -- confirm that this
# cross-reference is intended.
ridge_abs_features_0 = ridge_imp_features_0.apply(np.abs)
for ind in ridge_imp_features_1.apply(np.abs).sort_values(ascending = False).index:
    if 'sensor_2' in ind:
        print(ind,ridge_abs_features_0[ind])

In [None]:
sns.pairplot(pd.concat([df[targets[0]].shift(1).dropna(), df[targets[1]]], axis = 1))

### Forward Selection

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [None]:
ridge_temp = ridge_quality['target_carbon_monoxide']['forecast_model'][0]

In [None]:
X_temp = ridge_temp.data[ridge_temp.features + ridge_temp.date_time].copy()
Y_temp = ridge_temp.data[ridge_temp.targets].copy()

In [None]:
sfs_cv = tscv.CrossValid(2000, 1)

In [None]:
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import make_scorer
# SequentialFeatureSelector keeps the candidate with the HIGHEST score, and
# make_scorer defaults to greater_is_better=True. With a plain MSE scorer the
# selector therefore preferred the features with the largest error. Declaring
# the metric as a loss makes the scorer return -MSE, so maximising it minimises
# the error.
MSE_scorer = make_scorer(MSE, greater_is_better = False)
# Forward selection of 20 features, validated on the splits produced by sfs_cv.
sfs = SequentialFeatureSelector(ridge_obj, cv = list(sfs_cv.split(len(Y_temp), step = 500)), 
                                k_features = 20, scoring = MSE_scorer, forward = True)
sfs = sfs.fit(X_temp, Y_temp)

In [None]:
X_temp.shape

In [None]:
sfs.k_feature_names_

In [None]:
ridge_temp.data[list(sfs.k_feature_names_) + ridge_temp.targets].corr()

In [None]:
sfs.k_feature_names_

In [None]:
#('lag_deg_C_1','lag_deg_C_2','lag_deg_C_3','lag_deg_C_4','lag_deg_C_5','lag_relative_humidity_1','lag_relative_humidity_2','lag_relative_humidity_3','lag_relative_humidity_4','lag_relative_humidity_5','lag_sensor_1_5','lag_sensor_2_5','lag_sensor_3_4','lag_sensor_4_4','lag_seasonal_target_carbon_monoxide_3','lag_resid_target_carbon_monoxide_1','lag_resid_target_carbon_monoxide_2','lag_resid_target_carbon_monoxide_3','lag_resid_target_benzene_2','lag_resid_target_benzene_3','lag_resid_target_nitrogen_oxides_1','lag_resid_target_nitrogen_oxides_2','lag_resid_target_nitrogen_oxides_3','lag_resid_target_nitrogen_oxides_5','h_0','h_6','h_8','h_9','h_10','h_12','h_13','h_14','h_15','h_16','h_17','h_18','h_20','h_21','h_23','day')

### Decision Tree

In [None]:
# Reload the training data for the decision-tree experiment and repeat the
# preparation steps used for `df` above, this time into df1 / date_time_features1.
df1 = pd.read_csv('train.csv')
df1['date_time'] = pd.to_datetime(df1['date_time'])
# NOTE(review): this uses the same boolean-flag call form as the first data cell,
# whose recorded output is "TypeError: 'bool' object is not subscriptable" --
# confirm against the current tsm.get_date_time_features signature.
date_time_features1 = tsm.get_date_time_features(df1, 'date_time', 
                                                one_hot_encoding = {'hour' : True, 'day' : True, 'month' : True, 'season' : True}, 
                                                hour = True, day = True, month = True, season = False, year = False)

#date_time_features = df[date_time_features_names].copy()
# Keep the timestamps, then remove the raw column from the frame.
date_time_column1 = df1['date_time'].copy()
df1.drop(columns = ['date_time'], inplace = True)
f1_dict = {}
df1 = tsm.feature_extractor(features_dict = f1_dict, data_frame = df1, inplace = True, 
                            STL = [date_time_column1, targets + sensors, True, False, 'additive'])
# The raw sensor columns are removed after the extractor has run.
df1.drop(columns = sensors, inplace = True)

In [None]:
# get_feature_list needs the date-time feature names so it can exclude them from
# the model features; the one-argument call raised TypeError.
features1 = get_feature_list(df1, date_time_features1)
features1

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
class DTRegModel:
    """Thin adapter that gives a tree regressor the fit/predict/importances shape used in this notebook."""

    def __init__(self, model):
        # The wrapped estimator; fit() trains this very object.
        self.model = model

    def fit(self, X, Y):
        """Fit the wrapped estimator, remember its ``feature_importances_`` and return the fit result."""
        tree = self.model.fit(X, Y)
        self.importances = tree.feature_importances_
        return tree

    def predict(self, X):
        """Delegate prediction to the wrapped estimator."""
        return self.model.predict(X)
    

In [None]:
dtreg_model = DTRegModel(DecisionTreeRegressor(max_depth = 11))
dtreg_horizons = 1
dtreg_CrossValid_params = {'train_size' : 2000, 'test_size' : 1, 'min_period' : 0, 'step' : 500}
dtreg_ForecastModel_params = {'features' : features1, 'date_time' : date_time_features1,
                              'prior_lag' : 24, 'post_lag' : 0, 'new_index' : True}

In [None]:
dtreg_quality = tscv.run_cv(data = df1, targets = targets, horizons = dtreg_horizons, CrossValid_params = dtreg_CrossValid_params, 
            ForecastModel_params = dtreg_ForecastModel_params, model = dtreg_model, metrics = mape)

In [None]:
tscv.plot_cv_results(quality = dtreg_quality, horizons = dtreg_horizons, plot_loss = True)

In [None]:
dtreg_importances_dict = tscv.get_importances(dtreg_quality, dtreg_horizons, False)

# scikit-learn tree importances lie in [0, 1] and sum to 1, so the |importance| >= 1
# cut-off previously used for benzene and nitrogen oxides (copied from the ridge
# section) could keep at most one feature. All three targets now share the
# threshold that was already used for carbon monoxide.
# NOTE(review): assumes tscv.get_importances returns the importances unscaled -- confirm.
dtreg_importance_threshold = .001
dtreg_imp_features_0, dtreg_imp_features_1, dtreg_imp_features_2 = [
    list(dtreg_importances_dict[name][0][dtreg_importances_dict[name][0].abs() >= dtreg_importance_threshold].index)
    for name in ('target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides')
]

In [None]:
# Print the mean train and test loss of the decision tree for every target.
dtreg_losses = tscv.get_losses(dtreg_quality, dtreg_horizons)
for tar in targets:
    tar_losses = dtreg_losses[tar]
    print(tar + ' train_loss: ', tar_losses['train_loss'].mean())
    print(tar + ' test_loss: ', tar_losses['test_loss'].mean())
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

In [None]:
# Sweep the maximum tree depth and record, per target, the mean train and test
# loss obtained at each depth.
dtlosses = {}
for tar in targets:
    dtlosses[tar] = {'train' : [], 'test' : []}
tree_height_list = list(range(5,11))
# One name for the horizon count so run_cv and get_losses cannot drift apart.
dtreg_sweep_horizons = 2
for tree_height in tree_height_list:
    dtreg_model = DTRegModel(DecisionTreeRegressor(max_depth = tree_height))
    # Run on df1: dtreg_ForecastModel_params refers to features1 and
    # date_time_features1, which were derived from df1 (the sweep previously
    # passed the ridge frame `df`).
    dtreg_quality = tscv.run_cv(data = df1, targets = targets, horizons = dtreg_sweep_horizons, CrossValid_params = dtreg_CrossValid_params, 
            ForecastModel_params = dtreg_ForecastModel_params, model = dtreg_model, metrics = mape)
    dtreg_losses = tscv.get_losses(dtreg_quality, dtreg_sweep_horizons)
    for tar in targets:
        dtlosses[tar]['train'].append(dtreg_losses[tar]['train_loss'].mean())
        dtlosses[tar]['test'].append(dtreg_losses[tar]['test_loss'].mean())

In [None]:
for tar in targets:
    plt.plot(tree_height_list, dtlosses[tar]['train'], label = 'train_' + tar)
    plt.plot(tree_height_list, dtlosses[tar]['test'], label = 'test_' + tar)
    plt.legend()
    plt.show()

In [None]:
dtreg_importances_dict['target_benzene'][0].sort_values(ascending = False)

### Random Forest

In [None]:
# Reload the training data for the random-forest experiment and parse the timestamps.
rfdata = pd.read_csv('train.csv')
rfdata['date_time'] = pd.to_datetime(rfdata['date_time'])
# NOTE(review): this uses the same boolean-flag call form as the first data cell,
# whose recorded output is "TypeError: 'bool' object is not subscriptable" --
# confirm against the current tsm.get_date_time_features signature.
rf_date_time_features = tsm.get_date_time_features(rfdata, 'date_time', 
                                                one_hot_encoding = {'hour' : True, 'day' : True, 'month' : True, 'season' : True}, 
                                                hour = True, day = True, month = True, season = False, year = False)

# Keep the timestamps, then remove the raw column from the frame.
rf_date_time_column = rfdata['date_time'].copy()
rfdata.drop(columns = ['date_time'], inplace = True)

In [None]:
rf_features_dict = {}
rfdata = tsm.feature_extractor(features_dict = rf_features_dict, data_frame = rfdata, inplace = True, 
                            STL = [rf_date_time_column, targets + sensors, True, False, 'additive'])
rfdata.drop(columns = sensors, inplace = True)

In [None]:
# get_feature_list needs the date-time feature names so it can exclude them from
# the model features; the one-argument call raised TypeError.
rf_features = get_feature_list(rfdata, rf_date_time_features)
rf_features

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
class RFReg_model:
    """Adapter around a forest regressor that trains on a flat target and predicts a column vector.

    ``fit`` flattens ``Y`` to shape ``(n_samples,)`` before training and
    ``predict`` returns an array of shape ``(n_samples, 1)``.
    """
    def __init__(self, rfreg):
        # The wrapped estimator; fit() trains this very object.
        self.rfreg = rfreg
    def fit(self, X, Y):
        """Fit the wrapped estimator on a 1-D copy of ``Y``.

        Stores the estimator's ``feature_importances_`` in ``self.importances``
        and returns whatever the estimator's ``fit`` returns. Raises
        ``ValueError`` if ``Y`` holds more than one value per sample.
        """
        # Convert to ndarray first: np.reshape applied directly to a pandas
        # object may hand the result back to pandas instead of returning a flat
        # array, so the estimator could still receive a column-shaped target.
        Y = np.asarray(Y).reshape(len(Y))
        fitted_model = self.rfreg.fit(X, Y)
        self.importances = fitted_model.feature_importances_
        return fitted_model
    def predict(self, X):
        """Return the predictions for ``X`` as an ``(n_samples, 1)`` array."""
        # len(X) instead of X.shape[0] so plain sequences are accepted as well.
        return np.asarray(self.rfreg.predict(X)).reshape(len(X), 1)

In [None]:
# Configuration of the random-forest experiment; these objects are passed to tscv.run_cv below.
# NOTE(review): with bootstrap = False every tree is trained on the full sample;
# unless max_features is reduced the 35 trees are presumably near-identical --
# confirm this is intended. No random_state is set either.
rfreg_model = RFReg_model(RandomForestRegressor(n_estimators = 35, max_depth = 5, bootstrap = False))
rfreg_horizons = 1
rfreg_CrossValid_params = {'train_size' : 2000, 'test_size' : 1, 'min_period' : 0, 'step' : 500}
# NOTE(review): unlike the ridge, decision-tree and XGBoost configurations, the
# target columns are appended to 'features' here -- confirm this is intended and
# does not leak the value being predicted.
rfreg_ForecastModel_params = {'features' : rf_features + targets, 'date_time' : rf_date_time_features,
                              'prior_lag' : 24, 'post_lag' : 0, 'new_index' : True}

In [None]:
rfreg_quality = tscv.run_cv(data = rfdata, targets = targets, horizons = rfreg_horizons, CrossValid_params = rfreg_CrossValid_params, 
            ForecastModel_params = rfreg_ForecastModel_params, model = rfreg_model, metrics = mape)

In [None]:
tscv.plot_cv_results(quality = rfreg_quality, horizons = rfreg_horizons, plot_loss = True)

In [None]:
# Print the mean train and test loss of the random forest for every target.
rfreg_losses = tscv.get_losses(rfreg_quality, rfreg_horizons)
for tar in targets:
    tar_losses = rfreg_losses[tar]
    print(tar + ' train_loss: ', tar_losses['train_loss'].mean())
    print(tar + ' test_loss: ', tar_losses['test_loss'].mean())
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

In [None]:
#rfreg_quality['target_benzene']['']

### XGBoost

In [None]:
reload(tsm)

In [None]:
# Reload the training data for the XGBoost experiment and parse the timestamps.
xgb_data = pd.read_csv('train.csv')
xgb_data['date_time'] = pd.to_datetime(xgb_data['date_time'])
# NOTE(review): this uses the same boolean-flag call form as the first data cell,
# whose recorded output is "TypeError: 'bool' object is not subscriptable" --
# confirm against the current tsm.get_date_time_features signature.
xgb_date_time_features = tsm.get_date_time_features(xgb_data, 'date_time', 
                                                one_hot_encoding = {'hour' : False, 'day' : False, 'month' : False, 'season' : False},
                                                hour = True, day = True, month = True, season = True, year = False)

# Keep the timestamps, then remove the raw column from the frame.
xgb_date_time_column = xgb_data['date_time'].copy()
xgb_data.drop(columns = ['date_time'], inplace = True)

In [None]:
xgb_features_dict = {}
#xgbdata = tsm.feature_extractor(features_dict = xgb_features_dict, data_frame = xgb_data, inplace = True, 
#                            STL = [xgb_date_time_column, targets, True, False, 'additive'])
#xgbdata = tsm.feature_extractor(features_dict = xgb_features_dict, data_frame = xgb_data, inplace = True, 
#                            STL = [xgb_date_time_column, targets, True, False, 'additive'])
#xgbdata.drop(columns = sensors, inplace = True)

In [None]:
xgb_data.drop(columns = ['sensor_3', 'sensor_4'], inplace = True)

In [None]:
xgb_features = get_feature_list(xgb_data, xgb_date_time_features)
xgb_features

In [None]:
from xgboost import XGBRegressor

In [None]:
class XGBRegModel:
    """Thin adapter around a boosted regressor with optional capture of feature importances."""

    def __init__(self, model, make_importances):
        # Wrapped estimator and the switch that controls whether fit() reads
        # its feature_importances_.
        self.model = model
        self.make_importances = make_importances

    def fit(self, X, Y):
        """Fit the wrapped estimator and return the fit result.

        ``self.importances`` is set to the estimator's ``feature_importances_``
        when ``make_importances`` is truthy, otherwise to ``None``.
        """
        booster = self.model.fit(X, Y)
        if self.make_importances:
            self.importances = booster.feature_importances_
        else:
            self.importances = None
        return booster

    def predict(self, X):
        """Delegate prediction to the wrapped estimator."""
        return self.model.predict(X)

In [None]:
reload(tscv)

In [None]:
xgb_model = XGBRegModel(XGBRegressor(n_estimators = 10), make_importances = False)#, reg_alpha = 0.01, reg_lambda = 0.01))
xgb_horizons = 1
xgb_CrossValid_params = {'train_size' : 3500, 'test_size' : 1, 'min_period' : 0, 'step' : 500}
xgb_ForecastModel_params = {'features' : xgb_features, 'date_time' : xgb_date_time_features,
                              'prior_lag' : 24, 'post_lag' : 0, 'new_index' : True}

In [None]:
xgb_quality = tscv.run_cv(data = xgb_data, targets = targets, horizons = xgb_horizons, CrossValid_params = xgb_CrossValid_params, 
                          ForecastModel_params = xgb_ForecastModel_params, model = xgb_model, metrics = mape)

In [None]:
# Print the mean train and test loss of the XGBoost model for every target.
xgb_losses = tscv.get_losses(xgb_quality, xgb_horizons)
for tar in targets:
    tar_losses = xgb_losses[tar]
    print(tar + ' train_loss: ', tar_losses['train_loss'].mean())
    print(tar + ' test_loss: ', tar_losses['test_loss'].mean())
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

In [None]:
rcParams['figure.figsize'] = 30, 10

In [None]:
tscv.plot_cv_results(quality = xgb_quality, horizons = xgb_horizons, plot_loss = True)

In [None]:
xgb_losses, xgb_resids = tscv.get_losses(xgb_quality, xgb_horizons), tscv.get_residuals(xgb_quality, xgb_horizons)

In [None]:
xgb_quality[targets[0]]['forecast_model'][0] 

In [None]:
xgb_importances_dict = tscv.get_importances(xgb_quality, xgb_horizons, False)

### LightGBM

In [None]:
from lightgbm import LGBMRegressor

In [None]:
# Reload the training data for the LightGBM experiment and parse the timestamps.
lgb_data = pd.read_csv('train.csv')
lgb_data['date_time'] = pd.to_datetime(lgb_data['date_time'])
# Derive the calendar features from lgb_data itself. The call previously received
# `df`, the ridge frame whose 'date_time' column has already been dropped. The
# result is bound to lgb_date_time_features, the name the following cells read.
lgb_date_time_features = tsm.get_date_time_features(lgb_data, 'date_time', hour = [True, False], day = [True, False],
                                                month = [True, False], season = [True, False], year = [False])
# Keep the earlier name available in case anything else still refers to it.
lgb_dt_features = lgb_date_time_features

# Keep the timestamps, then remove the raw column from the frame.
lgb_date_time_column = lgb_data['date_time'].copy()
lgb_data.drop(columns = ['date_time'], inplace = True)

In [None]:
lgb_features = get_feature_list(lgb_data, lgb_date_time_features)
lgb_features

In [None]:
class LGBRegModel:
    """Thin adapter around a LightGBM-style regressor with optional capture of feature importances."""

    def __init__(self, model, make_importances):
        # Wrapped estimator and the switch that controls whether fit() reads
        # its feature_importances_.
        self.model = model
        self.make_importances = make_importances

    def fit(self, X, Y):
        """Fit the wrapped estimator and return the fit result.

        ``self.importances`` is set to the estimator's ``feature_importances_``
        when ``make_importances`` is truthy, otherwise to ``None``.
        """
        booster = self.model.fit(X, Y)
        if self.make_importances:
            self.importances = booster.feature_importances_
        else:
            self.importances = None
        return booster

    def predict(self, X):
        """Delegate prediction to the wrapped estimator."""
        return self.model.predict(X)

In [None]:
lgb_model = LGBRegModel(LGBMRegressor(n_estimators = 350, max_depth = 1, boosting_type = 'dart'), make_importances = False)#, reg_alpha = 0.01, reg_lambda = 0.01))
lgb_horizons = 1
lgb_CrossValid_params = {'train_size' : 3500, 'test_size' : 1, 'min_period' : 0, 'step' : 500}
lgb_ForecastModel_params = {'features' : lgb_features, 'date_time' : lgb_date_time_features,
                              'prior_lag' : 24, 'post_lag' : 0, 'new_index' : True}

In [None]:
lgb_quality = tscv.run_cv(data = lgb_data, targets = targets, horizons = lgb_horizons, CrossValid_params = lgb_CrossValid_params, 
                          ForecastModel_params = lgb_ForecastModel_params, model = lgb_model, metrics = mape)

In [None]:
# Print the mean train and test loss of the LightGBM model for every target.
lgb_losses = tscv.get_losses(lgb_quality, lgb_horizons)
for tar in targets:
    tar_losses = lgb_losses[tar]
    print(tar + ' train_loss: ', tar_losses['train_loss'].mean())
    print(tar + ' test_loss: ', tar_losses['test_loss'].mean())
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')