# Time Series Forecasting with Machine Learning Models
This notebook contains initial modeling with Lasso Regression, Ridge Regression, ElasticNet, Random Forests and LightGBM for time series. There is also code for running a (randomized) grid search with cross-validation for RandomForest and LightGBM.

In [None]:
import datetime
import time
from datetime import timedelta

import lightgbm as lgb
import matplotlib.pyplot as plt # plotting package 
import numpy as np
import pandas as pd #for data analysis/manipulation
import pyodbc # connect to the database
import pytz
#from azureml import Workspace # connect to the Azure environment 
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import ParameterSampler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

In [None]:
def time_fmt(time_entry, original_tz = 'US/Pacific', new_tz = 'US/Eastern'):
    '''Convert a timestamp from one timezone to another.

    The wall-clock fields of ``time_entry`` are interpreted as local time in
    ``original_tz``; any tzinfo already attached is discarded, exactly as the
    previous ``replace(tzinfo=...)`` implementation did. The same instant is
    then returned expressed in ``new_tz``.

    Parameters
    ----------
    time_entry : datetime-like
        Object supporting ``replace(tzinfo=...)`` and ``astimezone``.
    original_tz : str
        pytz zone name the wall-clock time is given in.
    new_tz : str
        pytz zone name to convert to.

    Returns
    -------
    The timezone-aware timestamp in ``new_tz``.
    '''
    source_zone = pytz.timezone(original_tz)
    # pytz zones must be attached with localize(). Passing a pytz zone to
    # replace(tzinfo=...) silently picks the zone's first historical offset
    # (local mean time), which is off by several minutes and ignores DST.
    naive_time = time_entry.replace(tzinfo=None)
    input_time = source_zone.localize(naive_time)
    conv_time = input_time.astimezone(pytz.timezone(new_tz))
    return conv_time


def mape_calc(actual, predicted):
    '''Return the mean absolute percentage error (in percent).

    Both inputs are converted to NumPy arrays; the absolute error is taken
    relative to ``actual`` element-wise, scaled by 100 and averaged.
    '''
    observed = np.array(actual)
    forecast = np.array(predicted)
    pct_errors = np.abs((observed - forecast)/observed)*100
    return np.mean(pct_errors)

def accuracy_metrics(actual, predicted, print_values = True):
    '''Compute MSE, RMSE, MAE and MAPE for a set of predictions.

    Relies on ``mean_squared_error`` / ``mean_absolute_error`` from
    ``sklearn.metrics`` and on ``mape_calc`` defined in this file.

    Parameters
    ----------
    actual : array-like
        Observed values.
    predicted : array-like
        Predicted values, aligned with ``actual``.
    print_values : bool
        When truthy, print each metric on its own line.

    Returns
    -------
    list
        ``[mse, rmse, mae, mape]`` in that order.
    '''
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, predicted)
    mape = mape_calc(actual, predicted)

    metrics = [mse, rmse, mae, mape]

    if print_values:
        print('Accuracy Metrics')
        for label, value in zip(('MSE', 'RMSE', 'MAE', 'MAPE'), metrics):
            print('{}: {}'.format(label, value))

    return metrics

def inverse_difference(data, yhat, interval = 1):
    '''Undo a differencing step.

    Adds the observation ``interval`` positions from the end of ``data``
    back onto the differenced value ``yhat`` and returns the sum.
    '''
    reference = data[-interval]
    return yhat + reference

def predictions_plot(test, predictions, days= 14, col = 'target col'):
    '''Plot predictions against actuals for the first ``days`` rows of ``test``.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame with a ``'Date'`` column of timestamps and the column ``col``.
    predictions : array-like
        Predicted values; the length must match the number of rows selected
        by ``test.head(days)``.
    days : int
        Number of leading rows of ``test`` to plot.
    col : str
        Name of the column holding the actual values.

    Returns
    -------
    pandas.DataFrame
        The plotted subset with its index reset and the added
        ``'Predictions'`` and ``'Day'`` (weekday name) columns.
    '''
    # Copy the slice so adding columns never writes into the caller's frame
    # and does not trigger pandas' SettingWithCopyWarning.
    test_sub = test.head(days).copy()
    test_sub['Predictions'] = predictions
    test_sub = test_sub.reset_index()
    # Timestamp.weekday_name was removed in pandas 1.0; day_name() replaces it.
    test_sub['Day'] = test_sub['Date'].apply(lambda x: x.day_name())

    plt.plot(test_sub['Date'], test_sub['Predictions'], color = 'red', label = 'Predictions')
    # Label the actuals too, otherwise the legend only lists the predictions.
    plt.plot(test_sub['Date'], test[col][:days].values, label = 'Actual')
    plt.title('Forecast Two Weeks Out')
    plt.xticks(rotation = 45)
    plt.legend(loc = 'best')
    plt.show()

    return test_sub


# Light Gradient Boosting Machine (LightGBM)

In [None]:
# Chronological split: rows dated before 2019-07-01 are used for training and
# cross-validation, everything from that date onward is the holdout set.
train = df.loc[df['Date'] < '2019-07-01']
df_holdout = df.loc[df['Date'] >= '2019-07-01']

# Model inputs are every column except the date and the target itself.
target = train['target col']
features = [name for name in train.columns if name not in ('Date', 'target col')]

# Calendar-derived columns, when present, are treated as categorical features.
cat_features = [
    name for name in train.columns
    if name in ('Weekday Num', 'Month', 'Day', 'Year', 'WeekofMonth', 'WeekofYear', 'Quarter')
]


In [None]:
# A notebook cell only echoes its last bare expression, so the holdout size
# was silently discarded before; print both sizes explicitly instead.
print('Holdout rows: {}'.format(len(df_holdout)))
print('Train rows: {}'.format(len(train)))

In [None]:
# LightGBM hyperparameters shared by every cross-validation fold.
params = {
    'metric': 'rmse',
    'objective': 'regression',
    'boost': 'gbdt',
    'learning_rate': 0.005,
    'lambda_l1': 0.5,
    'lambda_l2': 1,
    'num_leaves': 10,
    'max_depth': -1,
    'min_data_in_leaf': 5,
    'bagging_fraction': .9,
    'feature_fraction': .7,
}

# Seven forward-chaining folds; one RMSE per fold is collected in `rmse`.
tscv = TimeSeriesSplit(n_splits=7)
rmse = []

# Out-of-fold predictions; entries stay NaN for rows never used for validation.
oof_preds = np.full(len(train), np.nan)

for train_index, test_index in tscv.split(train):

    train_data = lgb.Dataset(
        train.iloc[train_index][features].values,
        label=target.iloc[train_index].values,
        feature_name=features,
        categorical_feature=cat_features,
        free_raw_data=False,
    )
    val_data = lgb.Dataset(
        train.iloc[test_index][features].values,
        label=target.iloc[test_index].values,
        feature_name=features,
        categorical_feature=cat_features,
        free_raw_data=False,
    )

    # Up to 50000 boosting rounds, early-stopped on the validation fold.
    mdl = lgb.train(
        params,
        train_data,
        50000,
        valid_sets=[val_data],
        verbose_eval=500,
        early_stopping_rounds=500,
    )
    oof_preds[test_index] = mdl.predict(
        train.iloc[test_index][features],
        num_iteration=mdl.best_iteration,
    )
    print(mdl.best_iteration)

    # Score this fold against the observed target values.
    actual = train.iloc[test_index]['target col']
    rmse.append(np.sqrt(mean_squared_error(actual, oof_preds[test_index])))

print('RMSE: {}'.format(np.mean(rmse)))
    
    

In [None]:
# Echo the `rmse` list (one entry is appended per cross-validation fold).
rmse

# Random Forest

In [None]:
# Chronological split: rows before 2019-07-01 feed cross-validation, later
# rows are held out.
train = df.loc[df['Date'] < '2019-07-01']
df_holdout = df.loc[df['Date'] >= '2019-07-01']

target = train['target col']
features = [name for name in train.columns if name not in ('Date', 'target col')]
cat_features = [
    name for name in train.columns
    if name in ('Weekday Num', 'Month', 'Day', 'Year', 'WeekofMonth', 'WeekofYear', 'Quarter')
]

# Eleven evenly spaced cut-off timestamps; each one starts a validation fold.
date_list = pd.date_range(start='2019-01-01', end='2019-06-18', periods=11)
predictions = []
rmse = []

for date in date_list:
    # Fit on everything strictly before the cut-off, score on all later rows.
    train_cv = train.loc[train['Date'] < date]
    test_cv = train.loc[train['Date'] >= date]

    x_train, y_train = train_cv[features].values, train_cv['target col'].values
    x_test, y_test = test_cv[features].values, test_cv['target col'].values

    # Settings tried but left disabled by the author: min_samples_leaf = 5,
    # min_weight_fraction_leaf = 0.02, max_leaf_nodes, min_impurity_split.
    regressor = RandomForestRegressor(
        n_estimators=100,
        random_state=0,
        min_samples_split=2,
        max_features=5,
        min_impurity_decrease=2,
    )
    regressor.fit(x_train, y_train)
    y_pred = regressor.predict(x_test)
    predictions.append(y_pred)

    # Record and print this fold's RMSE.
    actual = test_cv['target col']
    rmse.append(np.sqrt(mean_squared_error(actual, y_pred)))
    print(rmse[-1])

print('RMSE: {}'.format(np.mean(rmse)))

# Ridge and Lasso Regression 


In [None]:
# Chronological split: rows before 2019-07-01 feed cross-validation, later
# rows are held out.
train = df.loc[df['Date'] < '2019-07-01']
df_holdout = df.loc[df['Date'] >= '2019-07-01']

target = train['target col']
features = [name for name in train.columns if name not in ('Date', 'target col')]
cat_features = [
    name for name in train.columns
    if name in ('Weekday Num', 'Month', 'Day', 'Year', 'WeekofMonth', 'WeekofYear', 'Quarter')
]

# Eleven evenly spaced cut-off timestamps; each one starts a validation fold.
date_list = pd.date_range(start='2019-01-01', end='2019-06-18', periods=11)
predictions = []
rmse = []

for date in date_list:
    # Fit on everything strictly before the cut-off, score on all later rows.
    train_cv = train.loc[train['Date'] < date]
    test_cv = train.loc[train['Date'] >= date]

    x_train, y_train = train_cv[features], train_cv['target col']
    x_test, y_test = test_cv[features], test_cv['target col']

    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2 - confirm the installed version before re-running.
    ridgereg = Ridge(alpha=0.1, normalize=True).fit(x_train, y_train)
    y_pred = ridgereg.predict(x_test)
    predictions.append(y_pred)

    # Record and print this fold's RMSE.
    rmse.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    print(rmse[-1])

print('RMSE: {}'.format(np.mean(rmse)))

In [None]:
# Chronological split: rows before 2019-07-01 feed cross-validation, later
# rows are held out.
train = df.loc[df['Date'] < '2019-07-01']
df_holdout = df.loc[df['Date'] >= '2019-07-01']

target = train['target col']
features = [name for name in train.columns if name not in ('Date', 'target col')]
cat_features = [
    name for name in train.columns
    if name in ('Weekday Num', 'Month', 'Day', 'Year', 'WeekofMonth', 'WeekofYear', 'Quarter')
]

# Eleven evenly spaced cut-off timestamps; each one starts a validation fold.
date_list = pd.date_range(start='2019-01-01', end='2019-06-18', periods=11)
predictions = []
rmse = []

for date in date_list:
    # Fit on everything strictly before the cut-off, score on all later rows.
    train_cv = train.loc[train['Date'] < date]
    test_cv = train.loc[train['Date'] >= date]

    x_train, y_train = train_cv[features], train_cv['target col']
    x_test, y_test = test_cv[features], test_cv['target col']

    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2 - confirm the installed version before re-running.
    lassoreg = Lasso(alpha=0.25, normalize=True).fit(x_train, y_train)
    y_pred = lassoreg.predict(x_test)
    predictions.append(y_pred)

    # Record and print this fold's RMSE.
    rmse.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    print(rmse[-1])

print('RMSE: {}'.format(np.mean(rmse)))

# Elastic Net 

In [None]:
# Chronological split: rows before 2019-07-01 feed cross-validation, later
# rows are held out.
train = df.loc[df['Date'] < '2019-07-01']
df_holdout = df.loc[df['Date'] >= '2019-07-01']

target = train['target col']
features = [name for name in train.columns if name not in ('Date', 'target col')]
cat_features = [
    name for name in train.columns
    if name in ('Weekday Num', 'Month', 'Day', 'Year', 'WeekofMonth', 'WeekofYear', 'Quarter')
]

# Eleven evenly spaced cut-off timestamps; each one starts a validation fold.
date_list = pd.date_range(start='2019-01-01', end='2019-06-18', periods=11)
predictions = []
rmse = []

for date in date_list:
    # Fit on everything strictly before the cut-off, score on all later rows.
    train_cv = train.loc[train['Date'] < date]
    test_cv = train.loc[train['Date'] >= date]

    x_train, y_train = train_cv[features], train_cv['target col']
    x_test, y_test = test_cv[features], test_cv['target col']

    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2 - confirm the installed version before re-running.
    enet = ElasticNet(alpha=0.01, l1_ratio=0.01, normalize=True).fit(x_train, y_train)
    y_pred = enet.predict(x_test)
    predictions.append(y_pred)

    # Record and print this fold's RMSE.
    rmse.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    print(rmse[-1])

print('RMSE: {}'.format(np.mean(rmse)))

# Grid Search RandomForest 

In [None]:
# Randomized hyperparameter search for RandomForestRegressor: 20 sampled
# parameter sets, each scored by mean RMSE over the date-based CV folds.
start = datetime.datetime.today()
print(start)

# Candidate values for each hyperparameter.
n_estimators = [int(x) for x in np.linspace(start=200, stop = 2000, num =10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10,110, num =11)]
max_depth.append(None)
min_samples_split = [2,3,4,5,10]
min_samples_leaf = [1,2,4,5,10]
min_weight_fraction_leaf = [0.01,0.02,0.05,0.1,0.2,0.3, 0.4,0.5] 

random_grid = {'n_estimators': n_estimators, 
               'max_features': max_features, 
               'max_depth': max_depth,
               'min_samples_split': min_samples_split, 
               'min_samples_leaf': min_samples_leaf, 
               'min_weight_fraction_leaf': min_weight_fraction_leaf
}

df_holdout = df[df['Date'] >= '2019-06-01']
train = df[df['Date'] < '2019-06-01']

features = [col for col in train.columns if col not in ['Date', 'target col']]
target = train['target col']
cat_features = [col for col in train.columns if col in ['Weekday Num', 'Month', 'Day', 'Year', 'WeekofMonth', 'WeekofYear', 'Quarter']]

# list of the dates to split on  
date_list = pd.date_range(start = '2019-01-01', end = '2019-05-18', periods = 11)
# mean CV RMSE of each run
rmse_agg = []
# this is the list of parameter combinations tried 
best_estimators = []

# Draw all candidates in one call. With list-valued grids ParameterSampler
# samples without replacement, so the 20 runs are guaranteed to be distinct;
# calling it with n_iter=1 inside the loop could repeat a combination.
sampled_params = list(ParameterSampler(random_grid, n_iter = 20))

for i, params in enumerate(sampled_params):
    rmse_dt = []
    # The sampled keys are exactly the constructor's keyword arguments.
    rf_random = RandomForestRegressor(**params)

    for date in date_list:
        # Fit on rows before the cut-off, score on all later training rows.
        train_cv = train[train['Date'] < date]
        test_cv = train[train['Date'] >= date]

        x_train = train_cv[features].values
        y_train = train_cv['target col'].values
        x_test = test_cv[features].values
        y_test = test_cv['target col'].values

        rf_random.fit(x_train, y_train)
        rf_pred = rf_random.predict(x_test)

        rmse_dt.append(np.sqrt(mean_squared_error(y_test, rf_pred)))
    print('Run {} \n Params: {} RMSE: {}'.format(i, params, np.mean(rmse_dt)))
    rmse_agg.append(np.mean(rmse_dt))
    best_estimators.append(params)

# Stop the clock once, after the last run.
end = datetime.datetime.today()

print('Time to complete: {}'.format(end-start)) 

# Return the results 
results_rf = pd.DataFrame()
results_rf['estimators'] = best_estimators
results_rf['rmse'] = rmse_agg
results_rf

### Predict on the holdout set 

In [None]:
# Show the parameter set with the lowest mean CV RMSE. The search is random,
# so a hard-coded row position would point at a different run after a re-run.
results_rf.loc[results_rf['rmse'].idxmin(), 'estimators']

In [None]:
# predict on the holdout set 
df_holdout = df[df['Date'] >= '2019-06-03']
train = df[df['Date'] < '2019-06-03']

train_cv = train
    
x_train = train_cv[features].values
y_train = train_cv['target col'].values


features = [col for col in train.columns if col not in ['Date', 'target col']]
x_val = df_holdout[features].values
y_val = df_holdout[['target col']].values

rf = RandomForestRegressor(n_estimators= 1600,
                        min_weight_fraction_leaf= 0.02,
                         min_samples_split= 10,
                         min_samples_leaf= 4,
                         max_features= 'auto',
                         max_depth=  110)

rf.fit(x_train, y_train)

pred = rf.predict(x_val)


#rmse.append(np.sqrt(mean_squared_error(y_val, pred)))
print(np.sqrt(mean_squared_error(y_val, pred)))

df_holdout['Predictions'] = pred 


df_pred = predictions_plot(df_holdout, pred[:19], days= 19, col = 'target col')

# LGB Grid Search 

In [None]:
# Randomized hyperparameter search for LightGBM: 20 sampled parameter sets,
# each scored by mean RMSE over the date-based CV folds.
start = datetime.datetime.today()
print(start)

# Candidate values for each hyperparameter.
boost = ['gbdt','dart', 'goss'] # 'rf',
learning_rate = [x for x in np.linspace(0.001 ,0.1, num = 9) ]
lambda_l1 = [x for x in np.linspace(0.00 ,1, num = 11) ]
lambda_l2 = [x for x in np.linspace(0.00 ,1, num = 11) ]
num_leaves = [int(x) for x in np.linspace(10, 100, num = 10)]
max_depth = [-1,10,20]
min_data_in_leaf = [5,10,15,20,25]
bagging_fraction = [x for x in np.linspace(0.01 ,1, num = 11)]
feature_fraction = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

param_grid = {'boosting': boost, 
         'learning_rate': learning_rate,
         'lambda_l1': lambda_l1,
         'lambda_l2': lambda_l2,
         'num_leaves': num_leaves,
         'max_depth': max_depth,
         'min_data_in_leaf': min_data_in_leaf,
         'bagging_fraction': bagging_fraction,
         'feature_fraction': feature_fraction
         }

df_holdout = df[df['Date'] >= '2019-07-01']
train = df[df['Date'] < '2019-07-01']

features = [col for col in train.columns if col not in ['Date', 'target col']]
target = train['target col']
cat_features = [col for col in train.columns if col in ['Weekday Num', 'Month', 'Day', 'Year', 'WeekofMonth', 'WeekofYear', 'Quarter']]

# list of the dates to split on  
date_list = pd.date_range(start = '2019-01-01', end = '2019-06-18', periods = 11)
# mean CV RMSE of each run
rmse_agg = []
# this is the list of parameter combinations tried 
best_estimators = []

# Draw all candidates in one call. With list-valued grids ParameterSampler
# samples without replacement, so the 20 runs are guaranteed to be distinct;
# calling it with n_iter=1 inside the loop could repeat a combination.
sampled_params = list(ParameterSampler(param_grid, n_iter = 20))

for i, sampled in enumerate(sampled_params):
    rmse_dt = []
    # Sampled values plus the fixed metric/objective settings.
    params = dict(sampled, metric = 'rmse', objective = 'regression')

    for date in date_list:
        # Fit on rows before the cut-off, validate on all later training rows.
        train_cv = train[train['Date'] < date]
        test_cv = train[train['Date'] >= date]
        train_data = lgb.Dataset(train_cv[features].values, label = train_cv['target col'].values, feature_name = features, categorical_feature = cat_features, free_raw_data = False)
        val_data = lgb.Dataset(test_cv[features].values, label = test_cv['target col'].values, feature_name =features, categorical_feature=cat_features, free_raw_data = False)

        # NOTE(review): early stopping is driven by the same fold that is
        # scored below, so the reported RMSE is optimistic.
        mdl = lgb.train(params, train_data, 50000, valid_sets=[val_data], verbose_eval=500, early_stopping_rounds=500)
        predictions = mdl.predict(test_cv[features], num_iteration = mdl.best_iteration)   
        actual = test_cv['target col']

        rmse_dt.append(np.sqrt(mean_squared_error(actual, predictions)))
    print('Run {} \n Params: {} RMSE: {}'.format(i, params, np.mean(rmse_dt)))
    rmse_agg.append(np.mean(rmse_dt))
    best_estimators.append(params)

# Stop the clock once, after the last run.
end = datetime.datetime.today()

print('Time to complete: {}'.format(end-start)) 

# Return the results 
results_lgb = pd.DataFrame()
results_lgb['estimators'] = best_estimators
results_lgb['rmse'] = rmse_agg
results_lgb

In [None]:
# Show the parameter set with the lowest mean CV RMSE. The search is random,
# so a hard-coded row position would point at a different run after a re-run.
results_lgb.loc[results_lgb['rmse'].idxmin(), 'estimators']

In [None]:
# predict on the holdout set 
df_holdout = df[df['Date'] >= '2019-07-01']
train = df[df['Date'] < '2019-06-01']

train_cv = train
test_cv = df[(df['Date'] >= '2019-06-01') & (df['Date'] < '2019-07-01')]
    
x_train = train_cv[features].values
y_train = train_cv['target col'].values
x_test = test_cv[features].values
y_test = test_cv['target col'].values

features = [col for col in train.columns if col not in ['Date', 'target col']]
x_val = df_holdout[features].values
y_val = df_holdout[['target col']].values


train_data = lgb.Dataset(x_train, label = y_train, feature_name = features, categorical_feature = cat_features, free_raw_data = False)
val_data = lgb.Dataset(x_test, label = y_test, feature_name =features, categorical_feature=cat_features, free_raw_data = False)


params = {'num_leaves': 40,
 'min_data_in_leaf': 10,
 'max_depth': -1,
 'learning_rate': 0.001,
 'lambda_l2': 0.6000000000000001,
 'lambda_l1': 0.0,
 'feature_fraction': 0.7,
 'boosting': 'gbdt',
 'bagging_fraction': 0.109,
 'metric': 'rmse',
 'objective': 'regression'}

mdl = lgb.train(params, train_data, 50000, valid_sets=[val_data], verbose_eval=500, early_stopping_rounds=500)
predictions = mdl.predict(test_cv[features], num_iteration = mdl.best_iteration)  


pred = mdl.predict(x_val, num_iteration = mdl.best_iteration)
#rmse.append(np.sqrt(mean_squared_error(y_val, pred)))
print(np.sqrt(mean_squared_error(y_val, pred)))

df_holdout['Predictions'] = pred 
print(df_holdout[['Date', 'target col', 'Predictions']])

df_pred = predictions_plot(df_holdout, pred, days= 19, col = 'target col')

# ML Results for Best models
Forecasting with 2 week intervals 

## Random Forest 

In [None]:
# Walk-forward evaluation of the tuned random forest: refit at each cut-off
# date and forecast the window up to the next cut-off.
df_holdout = df[df['Date'] >= '2019-08-03']
train = df[df['Date'] < '2019-08-03']

features = [col for col in train.columns if col not in ['Date', 'target col']]
target = train['target col']

cat_features = [col for col in train.columns if col in ['Weekday Num', 'Month', 'Day', 'Year', 'WeekofMonth', 'WeekofYear', 'Quarter']]

# list of the dates to split on  
date_list = pd.date_range(start = '2019-03-01', end = '2019-07-18', periods = 10)
rmse = []
predictions = []
# one frame per fold holding Date, actual value and prediction, row-aligned
fold_frames = []

for j, date in enumerate(date_list):
    # Each fold scores the half-open window [date, date_next). The previous
    # inclusive upper bound put a row dated exactly on a cut-off into two
    # consecutive folds, and trimming the flattened list with [:-1] dropped
    # the last prediction rather than the duplicate, shifting every later
    # prediction against its actual value.
    if j + 1 < len(date_list):
        date_next = date_list[j + 1]
    else:
        # Last fold: run up to the start of the holdout period.
        date_next = pd.Timestamp('2019-08-03')

    train_cv = train[train['Date'] < date]
    test_cv = train[(train['Date'] >= date) & (train['Date'] < date_next)]
    if test_cv.empty:
        # No observations fall in this window; nothing to score.
        continue

    x_train = train_cv[features].values
    y_train = train_cv['target col'].values
    x_test = test_cv[features].values
    y_test = test_cv['target col'].values

    regressor = RandomForestRegressor(n_estimators= 800,
                         min_weight_fraction_leaf= 0.01,
                         min_samples_split= 3,
                         min_samples_leaf= 5,
                         max_features= 'auto',
                         max_depth= 60 
                                     )
    regressor.fit(x_train, y_train)
    y_pred = regressor.predict(x_test)
    predictions.extend(y_pred)

    # Keep predictions attached to their own rows so alignment never depends
    # on list positions.
    fold_result = test_cv[['Date', 'target col']].copy()
    fold_result['predictions'] = y_pred
    fold_frames.append(fold_result)

    actual = test_cv['target col']
    rmse.append(np.sqrt(mean_squared_error(actual, y_pred)))
    print(rmse[-1])

print('RMSE: {}'.format(np.mean(rmse)))

# All scored rows, in fold order, with their predictions.
actual_df = pd.concat(fold_frames)

accuracy_metrics(actual_df['target col'], actual_df['predictions'], print_values = True)
# Plot every scored row instead of a hard-coded row count.
df_pred = predictions_plot(actual_df, actual_df['predictions'].values, days=len(actual_df) , col = 'target col')

In [None]:
# Display the date, actual value and prediction columns of `df_pred`.
df_pred[['Date', 'target col', 'Predictions']]

### LightGBM

In [None]:
# Walk-forward evaluation of the tuned LightGBM model: refit at each cut-off
# date and score on the remaining training rows.
df_holdout = df[df['Date'] >= '2019-08-03']
train = df[df['Date'] < '2019-08-03']

features = [col for col in train.columns if col not in ['Date', 'target col']]
target = train['target col']

cat_features = [col for col in train.columns if col in ['Weekday Num', 'Month', 'Day', 'Year', 'WeekofMonth', 'WeekofYear', 'Quarter']]

# Define parameters 
params = {'num_leaves': 40,
         'min_data_in_leaf': 10,
         'max_depth': -1,
         'learning_rate': 0.001,
         'lambda_l2': 0.6000000000000001,
         'lambda_l1': 0.0,
         'feature_fraction': 0.7,
         'boosting': 'gbdt',
         'bagging_fraction': 0.109,
         'metric': 'rmse',
         'objective': 'regression'}

# list of the dates to split on .. 
date_list = pd.date_range(start = '2019-03-01', end = '2019-07-18', periods = 10)
rmse = []

# one list of predictions per fold
predictions = []
# one frame per fold holding Date, actual value and prediction, row-aligned
fold_frames = []

for date in date_list:
    # NOTE(review): each validation window runs from the cut-off to the end
    # of `train`, so later rows are scored in several folds - confirm this is
    # intended given the "2 week intervals" heading.
    train_cv = train[train['Date'] < date]
    test_cv = train[train['Date'] >= date]

    train_data = lgb.Dataset(train_cv[features].values, label = train_cv['target col'].values, feature_name = features, categorical_feature = cat_features, free_raw_data = False)
    val_data = lgb.Dataset(test_cv[features].values, label = test_cv['target col'].values, feature_name =features, categorical_feature=cat_features, free_raw_data = False)

    mdl = lgb.train(params, train_data, 50000, valid_sets=[val_data], verbose_eval=500, early_stopping_rounds=500)
    pred = mdl.predict(test_cv[features], num_iteration = mdl.best_iteration)

    actual = test_cv['target col']
    rmse.append(np.sqrt(mean_squared_error(actual, pred)))
    print(rmse[-1])

    pred = pred.tolist()
    predictions.append(pred)

    # DataFrame.append returns a new frame (and was removed in pandas 2.0),
    # so the old `actual_df.append(actual)` never stored anything. Collect
    # the per-fold rows and concatenate once after the loop.
    fold_result = test_cv[['Date', 'target col']].copy()
    fold_result['predictions'] = pred
    fold_frames.append(fold_result)

print('RMSE: {}'.format(np.mean(rmse)))

# Actual values and predictions of every fold, stacked in fold order.
actual_df = pd.concat(fold_frames) if fold_frames else pd.DataFrame()

