In [1]:
import pandas as pd
import numpy as np
import time_series_module as tsm
import time_series_cross_valid as tscv
import time_series_versioning as tsver
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
from importlib import reload
from statsmodels.graphics import tsaplots
from statsmodels.api import tsa

### Features & targets

In [2]:
# Load the training data and convert the 'date_time' column to pandas datetimes.
df = pd.read_csv('train.csv').assign(
    date_time=lambda frame: pd.to_datetime(frame['date_time'])
)

In [3]:
# Derive calendar features from 'date_time'. The call returns the list of generated
# feature names (displayed in the next cell) and, judging by the df preview further
# down, also adds those columns to df -- TODO confirm against tsm.get_date_time_features.
# NOTE(review): the meaning of the [True, False] / [False] flag lists is defined in tsm; verify there.
dt_features = tsm.get_date_time_features(df, 'date_time', hour = [True, False], day = [True, False],
                                                month = [True, False], season = [True, False], year = [False])
# Drop the raw timestamp column in place. Because df is mutated, re-running this cell
# without reloading df fails: 'date_time' no longer exists after the first run.
df.drop(columns = ['date_time'], inplace = True)

In [4]:
# Show the date-time feature names returned by tsm.get_date_time_features.
dt_features

['hour', 'day', 'month', 'season']

In [5]:
# Preview the first three rows of df after the date-time step above.
df.head(3)

Unnamed: 0,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides,hour,day,month,season
0,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,12.0,167.7,18,10,3,2
1,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,9.9,98.9,19,10,3,2
2,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,9.2,127.1,20,10,3,2


In [6]:
# Names of the three pollutant columns to forecast.
targets = [
    'target_carbon_monoxide',
    'target_benzene',
    'target_nitrogen_oxides',
]

In [7]:
def get_features_list(data, date_time_columns, target_columns=None):
    """Return the columns of ``data`` that are neither targets nor date-time features.

    Parameters
    ----------
    data : object exposing ``.columns`` (e.g. a pandas DataFrame)
        Frame whose column labels are filtered.
    date_time_columns : iterable
        Column labels of the date-time features to exclude.
    target_columns : iterable, optional
        Column labels of the targets to exclude. When omitted, the notebook-level
        ``targets`` list is used, so existing two-argument calls behave as before.

    Returns
    -------
    list
        Remaining column labels, in the order they appear in ``data.columns``.
    """
    if target_columns is None:
        # Fall back to the notebook-level list to stay backward-compatible.
        target_columns = targets
    # Build the exclusion set once instead of concatenating two lists for every column;
    # this also accepts any iterable (tuple, Index, ...) rather than only lists.
    excluded = set(target_columns) | set(date_time_columns)
    return [feature for feature in data.columns if feature not in excluded]

In [8]:
# Raw input columns: everything in df except the targets and the date-time features.
features = get_features_list(data=df, date_time_columns=dt_features)

### LightGBM

In [9]:
import lightgbm as lgbm

In [10]:
# Build the forecasting dataset for the first target (carbon monoxide). The model is
# given the date-time feature names, the full frame, the input features plus the target
# itself, the target name, and the lags 1..9.
# NOTE(review): argument roles are inferred from this call and from the preview below
# (lag_<column>_<k> columns); confirm against tsm.ForecastModel.
lgbm_forecast_model = tsm.ForecastModel(dt_features, df, features + [targets[0]], targets[0], prior_lag = list(range(1,10)))
# Presumably builds the lagged frame exposed as .data; the meaning of the True flag
# is not visible here -- TODO confirm.
lgbm_forecast_model.forecast_prep(True)
lgbm_data = lgbm_forecast_model.data
lgbm_data.head(3)

Unnamed: 0,lag_deg_C_1,lag_deg_C_2,lag_deg_C_3,lag_deg_C_4,lag_deg_C_5,lag_deg_C_6,lag_deg_C_7,lag_deg_C_8,lag_deg_C_9,lag_relative_humidity_1,...,lag_target_carbon_monoxide_5,lag_target_carbon_monoxide_6,lag_target_carbon_monoxide_7,lag_target_carbon_monoxide_8,lag_target_carbon_monoxide_9,lag_target_carbon_monoxide_0,hour,day,month,season
0,10.1,10.3,10.7,11.2,11.9,11.0,12.6,13.2,13.1,62.7,...,1.5,2.2,2.2,2.1,2.5,0.6,3,11,3,2
1,10.5,10.1,10.3,10.7,11.2,11.9,11.0,12.6,13.2,59.6,...,1.2,1.5,2.2,2.2,2.1,0.7,4,11,3,2
2,9.4,10.5,10.1,10.3,10.7,11.2,11.9,11.0,12.6,59.9,...,1.2,1.2,1.5,2.2,2.2,0.7,5,11,3,2


In [11]:
def mape(y, y_hat):
    """Mean absolute percentage error, in percent.

    Sums ``|(y - y_hat) / y|`` over all elements, divides by ``len(y)`` and scales
    by 100. ``y`` and ``y_hat`` must support element-wise arithmetic.
    """
    relative_errors = np.abs((y - y_hat) / y)
    return 100 * np.sum(relative_errors) / len(y)

In [12]:
class LGBM_Model:
    """Thin wrapper around an estimator that records feature importances on fit.

    The wrapped ``model`` must provide ``fit(X, y)`` returning the fitted estimator,
    which in turn exposes ``feature_importances_`` and ``predict(X)``.
    """

    def __init__(self, model):
        # Unfitted estimator; fitting happens in ``fit``.
        self.model = model

    def fit(self, X, y):
        """Fit the wrapped estimator, store it and its importances, return ``self``."""
        fitted = self.model.fit(X, y)
        self.fitted_model = fitted
        self.importances = fitted.feature_importances_
        return self

    def predict(self, X):
        """Predict with the estimator stored by the last ``fit`` call."""
        return self.fitted_model.predict(X)

In [13]:
# Wrap a shallow (max_depth = 3) LightGBM regressor in LGBM_Model, which stores the
# feature importances after each fit.
lgbm_reg = LGBM_Model(lgbm.LGBMRegressor(max_depth = 3))
# NOTE(review): the roles of 2048 and 1 are defined in tscv.CrossValid -- presumably
# window sizes; confirm there.
cv_obj = tscv.CrossValid(2048, 1)
# NOTE(review): the splits are sized from len(df), while the cross-validation below
# runs on lgbm_data; verify the two lengths are compatible (lagging may drop rows).
cv_split = cv_obj.split(len(df), step = 250)

In [14]:
# Run the time-series cross-validation on the lagged frame, scoring with mape.
# The returned results object is indexed below by 'train_loss', 'test_loss' and 'importances'.
# NOTE(review): the meaning of the trailing True flag is defined in tscv.new_cv -- confirm.
results = tscv.new_cv(lgbm_data, lgbm_forecast_model.features + lgbm_forecast_model.date_time, 
                      lgbm_forecast_model.targets, cv_split, lgbm_reg, mape, True)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [15]:
# Average the recorded train and test losses (MAPE, in percent).
print('train_loss',np.mean(results['train_loss']))
print('test_loss',np.mean(results['test_loss']))

train_loss 17.401399294398516
test_loss 13.878100839842475


In [16]:
# Individual training losses (presumably one per CV split -- confirm in tscv.new_cv).
results['train_loss']

[18.016489641259152,
 18.24155593076042,
 18.653227817117735,
 20.076222214888087,
 19.13448315020091,
 19.55711449869083,
 18.045763949184067,
 17.6071664984707,
 16.198362148361078,
 16.156471763220704,
 16.17785836436842,
 16.5264855217635,
 15.916255942759738,
 15.756647342684065,
 15.544858290453591,
 16.2298971032653,
 16.882024858670665,
 17.612038190135216,
 17.553753780169558,
 17.655443973587737,
 17.88726420235733]

In [17]:
# Individual test losses (presumably one per CV split -- confirm in tscv.new_cv).
results['test_loss']

[30.47116890675929,
 18.03385250747214,
 16.84930997406323,
 27.154844276739194,
 2.526919788548644,
 3.9907745419245586,
 23.035408214994703,
 14.400369847461262,
 9.678982194196074,
 17.20391559253836,
 6.057294183780046,
 14.91636244362817,
 17.879334509217607,
 14.59158713649009,
 3.225696165661307,
 4.644181675787534,
 0.056897107041541516,
 7.489192352885765,
 18.79646332078455,
 26.288363859938123,
 14.149199036779825]

In [18]:
# Keep the forecast model's parameters as returned by get_params().
# NOTE(review): what get_params() contains is defined in tsm.ForecastModel -- confirm.
params = lgbm_forecast_model.get_params()

In [19]:
from functools import reduce

In [20]:
def squeeze(arr):
    """Average the items of a non-empty sequence.

    The items are summed left to right with ``+`` and the total is divided by
    ``len(arr)``, so it works for numbers as well as for objects with element-wise
    arithmetic.
    """
    def _add(left, right):
        return left + right

    total = reduce(_add, arr)
    return total / len(arr)

In [21]:
# Average the recorded importances with squeeze() and rank them, largest first.
lgbm_importances = squeeze(results['importances']).sort_values(ascending = False)

In [29]:
# Show the 60 highest-ranked entries of the averaged importances.
lgbm_importances.head(60)

hour                            75.761905
lag_target_carbon_monoxide_1    50.190476
lag_sensor_2_1                  28.714286
lag_sensor_1_1                  25.000000
lag_sensor_5_1                  15.666667
lag_sensor_3_1                  15.095238
lag_deg_C_9                     13.571429
lag_sensor_5_9                  13.476190
lag_sensor_4_1                  12.809524
lag_sensor_2_3                  11.333333
lag_target_carbon_monoxide_2    10.904762
lag_sensor_1_2                  10.095238
lag_sensor_2_2                   9.857143
lag_sensor_1_3                   9.285714
lag_sensor_2_4                   8.714286
lag_relative_humidity_9          8.714286
lag_target_carbon_monoxide_3     8.142857
lag_relative_humidity_1          7.714286
lag_sensor_2_9                   7.571429
lag_sensor_4_2                   7.190476
lag_sensor_3_2                   7.142857
day                              6.904762
lag_sensor_5_8                   6.904762
lag_target_carbon_monoxide_4     6

In [23]:
# Default matplotlib figure size (width, height) in inches for the plots that follow.
rcParams['figure.figsize'] = (30, 8)