In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import statsmodels.api as sm
from statsmodels.formula.api import ols
#from skforecast.ForecasterAutoreg import ForecasterAutoreg
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, RobustScaler
#from catboost import CatBoostRegressor
#import category_encoders as ce
random_state = 123

In [None]:
# Load the training data from a CSV file, parsing 'ofd_date' as datetimes,
# and preview the first rows.
train = pd.read_csv('data/train_after_outlier.csv', parse_dates=['ofd_date'])
train.head()

- Add the day of the week and a boolean flag indicating whether it is a weekend
- Encode the categorical variables
- Get the model columns
- Filter by station, then for each station:
    - Shift the data
    - Split the data
    - Scale the data
    - Create lags


In [None]:
def add_weekend(df):
    """Flag weekend rows in a new ``is_weekend`` column.

    Reads the ``day_of_week`` column and writes 0 where its value is below 5
    and 1 otherwise. The frame is modified in place and also returned.
    """
    is_weekday = df['day_of_week'] < 5
    df['is_weekend'] = np.where(is_weekday, 0, 1)
    return df


def enhance_dates(df):
    """Add day-of-week and weekend features and rename the label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``ofd_date`` column. A ``target`` column, if
        present, is renamed to ``Diff_val``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``day_of_week`` (0 = Monday ... 6 = Sunday) and
        ``is_weekend`` added. The frame passed in is left unchanged.
    """
    # rename() without inplace returns a new frame, so the columns added below
    # never leak back into the caller's data (keeps notebook cells re-runnable).
    df = df.rename(columns={'target': 'Diff_val'})
    # Vectorised accessor instead of a row-wise apply: much faster, and it
    # also works on an empty frame.
    df['day_of_week'] = df['ofd_date'].dt.weekday
    df = add_weekend(df)
    return df

def encode_categorical(df):
    """Carry out one-hot encoding on the categorical variables.

    ``day_of_week`` and ``country_code`` are cast to strings and replaced by
    indicator columns named ``<column>_<value>`` (e.g. ``day_of_week_0``).

    The encoding uses ``pandas.get_dummies`` because the ``category_encoders``
    import is commented out in this notebook, so the former ``ce.OneHotEncoder``
    call raised a NameError on a fresh kernel.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``day_of_week`` and ``country_code`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two columns replaced by integer 0/1 indicator
        columns (appended after the remaining columns). The input frame is
        not modified.
    """
    cols_to_encode = ['day_of_week', 'country_code']
    # astype() with a mapping returns a new frame, leaving the caller's dtypes alone.
    as_text = df.astype({col: 'str' for col in cols_to_encode})
    # dtype=int keeps 0/1 integer indicators rather than booleans.
    encoded_df = pd.get_dummies(as_text, columns=cols_to_encode, dtype=int)
    return encoded_df


def get_model_cols(df):
    """Return the list of model feature columns of ``df``.

    Identifier columns and the label are excluded. The label is excluded
    under both of its names: ``Diff_val`` (before shifting) and ``target``
    (the name ``shift_data`` gives it), so it cannot leak into the features.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list
        Feature column names in the order they appear in ``df``, without
        duplicates. The order is deterministic across runs.
    """
    drop_cols = {'index', 'ofd_date', 'fc_codes', 'Diff_val', 'target'}
    # dict.fromkeys de-duplicates while preserving column order; a plain set
    # difference would give an arbitrary order that can change between runs.
    model_cols = list(dict.fromkeys(col for col in df.columns if col not in drop_cols))
    return model_cols
    
    
def shift_data(df):
    """Pair each row's ``Diff_val`` with the feature values of the row before it.

    Every column is shifted down by one row, rows containing any missing
    value after the shift are dropped, and the shifted ``Diff_val`` is
    replaced by a ``target`` column holding the un-shifted ``Diff_val``
    (matched on the index).
    """
    labels = df['Diff_val'][1:]
    lagged_features = df.shift(1).dropna().drop(columns=['Diff_val'])
    lagged_features['target'] = labels
    return lagged_features
    
def split_data(df, split_date='2021-06-01'):
    """Split a frame chronologically into train and test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``ofd_date`` column.
    split_date : str or datetime-like, default '2021-06-01'
        Rows with ``ofd_date`` strictly before this date go to the train set;
        rows on or after it go to the test set.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        Both indexed by ``ofd_date`` and carrying an extra ``Date`` column
        with the date formatted as ``"%d-%m-%Y"``. The input frame is not
        modified.
    """
    # assign() builds a new frame, so the caller's frame does not gain a
    # 'Date' column as a side effect.
    dated = df.assign(Date=df['ofd_date'].dt.strftime("%d-%m-%Y"))
    train_data = dated[dated['ofd_date'] < split_date].set_index('ofd_date')
    test_data = dated[dated['ofd_date'] >= split_date].set_index('ofd_date')
    return train_data, test_data

def scale_data(train, test, scale_cols=None):
    """Scale numeric columns with a RobustScaler fitted on the train split only.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Train and test splits; both must contain every column in ``scale_cols``.
    scale_cols : list of str, optional
        Columns to scale. Defaults to the volume columns
        ``['OFD', 'Rollover', 'Returns', 'Slam', 'Earlies_Rec', 'R_Sideline', 'Sideline']``.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Scaled copies; the frames passed in are not modified.
    """
    if scale_cols is None:
        scale_cols = ['OFD', 'Rollover', 'Returns', 'Slam', 'Earlies_Rec', 'R_Sideline', 'Sideline']
    scale_cols = list(scale_cols)

    # Work on copies so the caller's frames keep their unscaled values.
    train = train.copy()
    test = test.copy()

    scaler = RobustScaler()
    # Fit on train only so no test-set statistics leak into the scaling.
    train[scale_cols] = scaler.fit_transform(train[scale_cols])
    # An empty test split (no rows on/after the split date) would make
    # transform() fail; there is nothing to scale in that case.
    if len(test) > 0:
        test[scale_cols] = scaler.transform(test[scale_cols])
    return train, test
    

def preprocessing(df):
    """Preprocess the data: add the date features, then one-hot encode the categorical columns."""
    with_date_features = enhance_dates(df)
    return encode_categorical(with_date_features)


def ml_pipeline(df):
    """Run the per-station steps: shift, split, pick the model columns, scale.

    Station selection is not implemented here; the callers in this notebook
    pass a frame already filtered to a single station.

    Returns a ``(train, test, model_columns)`` tuple.
    """
    # TODO: enter code to select the station here; the steps below assume
    # the station has already been selected.
    shifted = shift_data(df)
    train_part, test_part = split_data(shifted)
    # The column list is taken from the train split before scaling.
    feature_cols = get_model_cols(train_part)
    scaled_train, scaled_test = scale_data(train_part, test_part)
    return scaled_train, scaled_test, feature_cols
    
    
    


In [None]:
# Add the date features and one-hot encode the categorical columns of the loaded data.
proc_train= preprocessing(train)

In [None]:
# Restrict the preprocessed data to a single station for a trial run of the pipeline.
selected_station = 'D76'
selected_df = proc_train[proc_train['station_code'] == selected_station]

In [None]:
# Shift, split and scale the selected station's data.
# NOTE(review): this rebinds the name `train`, which until now held the frame
# loaded from the CSV; later cells that use `train` see the station split instead.
train, test, model_cols = ml_pipeline(selected_df)
train.shape, test.shape

In [None]:
# Stack the train and test splits back into one frame (train rows first) and display it.
combined_df = pd.concat([train, test])
combined_df

In [None]:
# Autoregressive forecaster wrapping a random forest, configured with lags = 7.
# NOTE(review): the `ForecasterAutoreg` import is commented out in the imports
# cell, so this raises NameError on a fresh kernel unless that import is restored.
forecaster = ForecasterAutoreg(
                regressor = RandomForestRegressor(random_state=random_state),
                lags = 7)

In [None]:
# Build the forecaster's training data from the target series and the model
# columns as exogenous variables; the result is shown as the cell output.
forecaster.create_train_X_y(y=combined_df['target'], exog=combined_df[model_cols])

In [None]:
# Inspect the column names of the first element returned by create_train_X_y
# (this recomputes the same training data as the previous cell).
forecaster.create_train_X_y(y=combined_df['target'], exog=combined_df[model_cols])[0].columns

In [None]:
def get_time_series_data(data, time_lag):
    """Build the lagged training table for every station in ``data``.

    For each station code the data is preprocessed, run through
    ``ml_pipeline``, and passed to ``ForecasterAutoreg.create_train_X_y``.
    The per-station results are then stacked into one frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data accepted by ``preprocessing``; must contain ``station_code``.
    time_lag : int
        Value passed as ``lags`` to ``ForecasterAutoreg``.

    Returns
    -------
    pandas.DataFrame
        All stations' training rows with a ``target`` column, sorted by and
        indexed on ``Index`` (``"<Date>_<station_code>"``).

    Raises
    ------
    ValueError
        If ``data`` contains no station codes.
    """
    proc_train = preprocessing(data)
    station_codes = proc_train['station_code'].unique().tolist()
    if not station_codes:
        raise ValueError('No station codes found in the input data')

    station_frames = []
    for station_code in station_codes:
        print(f'Station code = {station_code}')
        selected_df = proc_train[proc_train['station_code'] == station_code]
        train, test, model_cols = ml_pipeline(selected_df)
        combined_df = pd.concat([train, test])
        forecaster = ForecasterAutoreg(
                regressor = RandomForestRegressor(random_state=random_state),
                lags = time_lag)
        # Build the training matrices once and reuse both parts, rather than
        # recomputing them for X and again for y.
        train_xy = forecaster.create_train_X_y(y=combined_df['target'], exog=combined_df[model_cols])
        X_data = train_xy[0]
        X_data['target'] = train_xy[1]
        X_data['Index'] = X_data['Date'] + '_' + X_data['station_code']
        station_frames.append(X_data)

    # A single concat avoids the quadratic cost of growing a frame in the loop.
    # reset_index(drop=True) discards the old index whatever its name is;
    # reset_index().drop('index') fails when the index is named (e.g. 'ofd_date').
    transformed_data = pd.concat(station_frames).reset_index(drop=True)
    # NOTE(review): 'Index' is a string starting with the "%d-%m-%Y" date, so
    # this sort is lexicographic (by day of month first), not chronological.
    return transformed_data.sort_values(by='Index').set_index('Index')

In [None]:
# `train` no longer holds the raw CSV at this point: an earlier cell rebinds it
# to the scaled single-station split returned by ml_pipeline, which is indexed
# by ofd_date and so has no 'ofd_date' column for preprocessing to read.
# Reload the raw data so this cell works under Restart & Run All.
raw_train = pd.read_csv('data/train_after_outlier.csv', parse_dates=['ofd_date'])
transformed_data = get_time_series_data(raw_train, 7)

In [None]:
# List the columns of the combined all-station training table.
transformed_data.columns

In [None]:
# Preview the first rows of the combined all-station training table.
transformed_data.head()