# Imports

In [None]:
import os
import yaml
import joblib
import pandas as pd
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader
import torch
import time
import shap
sns.set()

os.chdir(r'C:\Users\Mubby\OneDrive - University of Kentucky/DOCS/SPRING 2022/DS 711/Capstone python/scripts')
import preprocess, train, inference, interpret

In [None]:
# Read the run configuration once; the directory settings used by the rest
# of the notebook are taken from it.
with open("../model/params.yaml") as params_file:
    params = yaml.safe_load(params_file)

data_dir, model_dir = params['data_dir'], params['model_dir']

# Functions

In [None]:
def create_features(
        df
):
    """Build the model feature table from the raw records.

    Keeps the measured columns ('Deaths', 'Cases', 'temp', 'humidity',
    'windspeed'), appends four calendar features derived from 'Date', and
    drops every row that has a missing value in any of those columns.

    The input frame is left untouched; a new frame is returned. Columns other
    than 'Date' and the five measured columns are ignored, so they do not
    need to be present.

    Args:
        df: DataFrame with a 'Date' column (anything pd.DatetimeIndex can
            parse) plus the five measured columns listed above.

    Returns:
        DataFrame with columns, in order: 'Deaths', 'Cases', 'temp',
        'humidity', 'windspeed', 'Day_Of_Week', 'Month_Of_Year',
        'Quarter_Of_Year', 'Day_of_Year'.

    Raises:
        KeyError: if 'Date' or one of the measured columns is missing.
    """
    measured_cols = ['Deaths', 'Cases', 'temp', 'humidity', 'windspeed']

    # Parse the dates a single time and reuse the index for every feature.
    dates = pd.DatetimeIndex(df['Date'])

    # Work on a copy of the selection so the caller's frame is never mutated
    # (and no SettingWithCopyWarning is triggered when df is itself a slice).
    features = df[measured_cols].copy()

    # add date-derived features (assigned positionally, row for row)
    features['Day_Of_Week'] = dates.dayofweek
    features['Month_Of_Year'] = dates.month
    features['Quarter_Of_Year'] = dates.quarter
    features['Day_of_Year'] = dates.dayofyear

    # drop rows with missing values
    return features.dropna()

In [None]:
def split_data(
        df,
        train_frac
):
    """Cut ``df`` into a leading train part and a trailing test part.

    The cut position is ``int(len(df) * train_frac)``; rows before it go to
    the train part, rows from it onward go to the test part. Row order is
    preserved and nothing is shuffled.

    Returns:
        Tuple ``(train_df, test_df, train_size)`` where ``train_size`` is the
        cut position (number of rows in the train part).
    """
    cut = int(train_frac * len(df))
    return df[:cut], df[cut:], cut

In [None]:
def rescale_data(
        df
):
    """Fit a MinMaxScaler on ``df`` and return ``df`` rescaled by it.

    With MinMaxScaler's default settings every column is mapped onto the
    range 0..1. The returned frame keeps the index and column labels of the
    input.

    Side effect: the fitted scaler is written to 'scaler.gz' inside the
    module-level ``model_dir`` so it can be reloaded later.
    """
    fitted = MinMaxScaler().fit(df)
    scaled_values = fitted.transform(df)
    df_scaled = pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

    # persist the fitted scaler next to the model artifacts
    joblib.dump(fitted, Path(model_dir) / 'scaler.gz')

    return df_scaled

# Load & Prepare Data

In [None]:
# Load the raw CSV and parse 'Date' into datetimes so that date-derived
# features can be computed from it later on.
file_name = r'C:\Users\Mubby\OneDrive - University of Kentucky/DOCS/SPRING 2022/DS 711/Capstone python/covidweather.csv'
data = pd.read_csv(file_name).assign(Date=lambda frame: pd.to_datetime(frame['Date']))
data

# Visualize each Location

In [None]:
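# Per-location visualization (disabled): for each "County,State" pair, plot 'New Deaths' together with its 10-row rolling mean.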
# vdata = data.copy()
# vdata['C-State'] = vdata[['County', 'State']].apply(lambda x: ','.join(x), axis=1)
# vdata.drop(['County', 'State'], axis=1, inplace=True)


# for i in vdata['C-State'].unique():
#     subset = vdata[(vdata['C-State'] == i)]
#     subsetV = subset.copy()
#     subsetV['Moving_Average'] = subsetV['New Deaths'].rolling(10).mean()
#     fig = px.line(subsetV, x="Date", y=["New Deaths","Moving_Average"], title = i, template = 'plotly_dark')
#     fig.show()

# TRAINING

In [None]:

# keep only the Kentucky rows and discard the 'UID' column
df = data[data['State'] == 'KY'].drop(columns=['UID'])

# create features
df = create_features(df)

# split into train/test datasets: leading 60% of rows -> train, rest -> test
# (original note on the cut point: "may 13th 2021")
train_df, test_df, train_size = split_data(df, 0.6)

# rescale: the scaler is fitted on the training rows only, and rescale_data
# also writes it to model_dir as 'scaler.gz'
train_df = rescale_data(train_df)

# reload that saved scaler and apply it, unchanged, to the test rows
scaler = joblib.load(Path(model_dir, 'scaler.gz'))
test_df = pd.DataFrame(
    scaler.transform(test_df),
    columns=test_df.columns,
    index=test_df.index,
)

#check

#train_df = train_df.rename(columns={"temp": "Close"})
#test_df = test_df.rename(columns={"temp": "Close"})

# write both scaled frames to disk (without the index column)
# NOTE(review): these absolute paths bypass data_dir from params.yaml - confirm
# whether they are meant to point at the same folder.
train_df.to_csv(r'C:\Users\Mubby\OneDrive - University of Kentucky\DOCS\SPRING 2022\DS 711\Capstone python\data\train.csv', index=False)
test_df.to_csv(r'C:\Users\Mubby\OneDrive - University of Kentucky\DOCS\SPRING 2022\DS 711\Capstone python\data\test.csv', index=False)

df

In [None]:
# Settings handed to train.train_model (and, for sequence_length, to
# inference.predict) further down.
# NOTE(review): presumably the look-back window length in rows - confirm in train.py
sequence_length = 30
batch_size = 64
n_epochs = 20
# NOTE(review): presumably an early-stopping patience in epochs - confirm in train.py
n_epochs_stop = 10
# target column; 'Deaths' is one of the columns kept by create_features
label_name = 'Deaths'

In [None]:
# Hand the scaled train/test frames and the settings above to the
# project-local train module.
# NOTE(review): `hist` is plotted in the next cell via hist.plot(...), so it is
# presumably a DataFrame of per-epoch losses - confirm in train.py.
hist = train.train_model(train_df, test_df, label_name, sequence_length, batch_size, n_epochs, n_epochs_stop)

In [None]:
# Plot the history object returned by train.train_model.
hist.plot(figsize=(12,6), cmap='viridis')
plt.show()

# Evaluate Model

In [None]:
# Run the project-local inference module over the scaled test frame, using
# the same target column and window length that were used for training.
# NOTE(review): the "_descaled" names suggest both results are back in the
# original units - confirm in inference.py.
predictions_descaled, labels_descaled = inference.predict(
    df=test_df,
    label_name=label_name,
    sequence_length=sequence_length,
)

# Sanity check: show the shape of the predictions next to the shape of the
# labels (the two are expected to match).
predictions_descaled.shape, labels_descaled.shape

In [None]:
# Report the loss metrics over the full test set (true labels vs. predictions).
print('Error on all test data:')
inference.print_loss_metrics(labels_descaled, predictions_descaled)
# '\n' plus print's own line ending emits two newline characters
print('\n')
# Disabled: the same report restricted to the first 30 samples.
#print('Error on partial test data:')
#inference.print_loss_metrics(labels_descaled[:30], predictions_descaled[:30])

In [None]:
# Collect predictions, true labels and their difference (true - predicted)
# side by side, then draw all three as lines on one chart.
df_pred = pd.DataFrame({
    'Predicted': predictions_descaled,
    'True': labels_descaled,
    'Residual': labels_descaled - predictions_descaled,
})

df_pred.plot(cmap='viridis', figsize=(12, 6))
plt.xlabel('Days')
plt.ylabel('Deaths')
plt.xticks(rotation=45)
#plt.xlim(0,300)
plt.legend()
plt.show()

# Feature Importance

In [None]:
# Settings handed to interpret.get_important_features in the next cell.
# NOTE(review): presumably the number of background samples given to the SHAP
# explainer - confirm in interpret.py
background_data_size = 630
# NOTE(review): presumably the number of test samples that get explained - confirm
test_sample_size = 100
# repeats the value assigned for training earlier in this notebook; keep the two in sync
sequence_length = 30

In [None]:
# Ask the project-local interpret module for feature attributions for the
# target column, using the settings from the previous cell.
# NOTE(review): the result is wrapped in a DataFrame with one column per
# training feature in the next cell, so it is presumably a 2-D array of
# shape (time steps, features) - confirm in interpret.py.
shap_values = interpret.get_important_features(
    label_name,
    background_data_size,
    test_sample_size,
    sequence_length
)

In [None]:
# One column per training feature.
# NOTE(review): each row is presumably one time step of the input window - confirm in interpret.py.
shap_plot = pd.DataFrame(shap_values, columns=train_df.columns.tolist())

# Label the rows with negative offsets ending at -1 (-30 .. -1 for a 30-row
# frame). Deriving the offsets from the frame length keeps this cell working
# when the window length is changed, instead of failing on a length mismatch.
shap_plot['days'] = list(range(-len(shap_plot), 0))
shap_plot.head()

In [None]:
# Stacked area chart of the per-feature SHAP values, with the 'days' column
# on the x axis.
shap_plot.plot.area(x='days',figsize=(10, 6), cmap='Pastel1')
plt.title("Deep SHAP - Feature Importance")
plt.show()

In [None]:
# Bar-type SHAP summary plot, labelled with the training column names.
shap.summary_plot(shap_values, plot_type = 'bar', feature_names = train_df.columns)