In [2]:
# Standard imports
import numpy as np # Version 1.20.1
import pandas as pd # Version 1.2.4
# Pandas settings
# A value of None removes the display limit, so every row and every column
# of a DataFrame is rendered when it is displayed.
pd.set_option('display.max_rows', None)
pd.set_option("display.max_columns", None)

# Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('ticks')
# Figure resolution (DPI): higher values keep plots clear when zoomed in,
# but make them slower to display -- reduce the value below if plots take
# too long to display.
#   200 = lowest recommended value, for A4 size print legibility
#   600 = recommended value for 'photograph-like' legibility
plt.rcParams['figure.dpi'] = 200
# NOTE(review): this is applied after sns.set_style('ticks') above, so for
# any setting that both styles define, the ggplot value is the one left in
# effect -- confirm this ordering is intended.
plt.style.use('ggplot')
%matplotlib inline

# Time Series imports
import datetime as dt # Standard library module (ships with Python, no separate version)

# Pre-Processing imports
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipe
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline as pipe
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# Modelling imports
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, plot_roc_curve, precision_recall_fscore_support,  roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

## Function definition

In [3]:
# Functions go here
# Time lag column creator
def timelag_col_creator(df, columns, timelag_range):
    """Append lagged copies of the given columns to ``df`` in place.

    For each name in ``columns`` and each lag ``k`` from 1 up to and
    including ``timelag_range``, a new column named ``<name>_lag<k>`` is
    written to ``df``. It holds the source column shifted down by ``k``
    rows, so its first ``k`` rows have no earlier value and are missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    columns : iterable of str
        Names of the existing columns to lag.
    timelag_range : int
        Largest lag to create; a value below 1 adds no columns.

    Returns
    -------
    None
        The new columns are added directly to ``df``.
    """
    for source_col in columns:
        for lag in range(1, timelag_range + 1):
            lagged_values = df[source_col].shift(lag)
            df[source_col + f'_lag{lag}'] = lagged_values
            
# Rolling mean column creator
def rollmean_col_creator(df, columns, window_num, prefix='Weekly_'):
    """Add a rolling-mean column to ``df`` for each column in ``columns``.

    For every name in ``columns`` a new column ``<prefix><name>`` is written
    to ``df``, holding the mean of that column over a trailing window of
    ``window_num`` rows. Rows that do not yet have a full window are missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    columns : iterable of str
        Names of the existing columns to average.
    window_num : int
        Number of consecutive rows in each rolling window.
    prefix : str, default 'Weekly_'
        Text prepended to each source column name to build the new column
        name. The default keeps the original naming; pass a different
        prefix when ``window_num`` does not represent one week of rows.

    Returns
    -------
    None
        The new columns are added directly to ``df``.
    """
    for col in columns:
        df[prefix + col] = df[col].rolling(window=window_num).mean()
# TODO: consider removing rollmean_col_creator, as it is not currently used.

### Time-lag Features

In our earlier EDA, we discovered some semblance of seasonality in some of the variables, which indicates that values logged at time (t) are very likely affected by previous values (t-1, t-2, ...). As data in the `train` dataset is recorded at a weekly frequency, we will create time-lag features covering 4 weeks, allowing us to capture monthly trends. Before doing so, we should create weekly rolling averages for some of these features.

In [None]:
# Initial creation of time-lag features based on selected subset of variables, with reference to correlation matrix
# cols_to_roll_mean = ['Tavg','WetBulb','Daylight_Hours','StnPressure','AvgSpeed','Wet_NoWet']
# rollmean_col_creator(weather, cols_to_roll_mean, 7)
# cols_to_time_lag = ['Weekly_Tavg','Weekly_WetBulb','Weekly_Daylight_Hours','Weekly_StnPressure','Weekly_AvgSpeed','Weekly_Wet_NoWet']
# timelag_col_creator(weather, cols_to_time_lag, 4)

In [None]:
# & SMOTE
# Standard Scaler

In [None]:
# Load the three CSV files from the ../data directory (paths are relative to
# the notebook's working directory).
# NOTE(review): no parse_dates is passed for this file, unlike the two reads
# below -- confirm whether it has a 'Date' column that should be parsed too.
spray_cleaned = pd.read_csv('../data/spray_cleaned.csv')
# The 'Date' column of the next two files is parsed into datetimes at load time.
train_cleaned = pd.read_csv('../data/train_cleaned.csv', parse_dates = ['Date'])
# NOTE(review): this file is named 'weather_clean.csv' while the other two
# end in '_cleaned.csv' -- confirm the filename is correct.
weather_cleaned = pd.read_csv('../data/weather_clean.csv', parse_dates = ['Date'])