## Prepare TGW input data for prediction

This notebook does the following:

1. Import TGW data
2. Process TGW data: rename columns to match the training data, add the time zone, add numerical time features (month, day, hour, etc.), and add derived weather features (e.g., the average temperature over the last 24 hours)
3. Save the processed data to disk as joblib files (one file per location, scenario, and weather year)

### Import Packages

In [1]:
import numpy as np
import pyarrow.parquet as pq
import joblib
import scipy
import pandas as pd
import time
import os
import re
import glob
from datetime import datetime, timedelta
import yaml
import pprint
import pytz
from timezonefinder import TimezoneFinder
import matplotlib.pyplot as plt

from src import figure_ops
from src import input_ops
from src import model_ops
from src import aux_ops

### Load config file 

In [1]:
# Name of the YAML config file (without extension) inside the config/ folder
config_file_name = 'config1'
config_path = f"config/{config_file_name}.yaml"

# Load the configuration through the project helper
config = input_ops.load_config(config_path)

# Echo the settings this notebook relies on so a wrong config is spotted early
print(f"TGW_scenario:{config['TGW_scenario']} \nTGW_years:{config['TGW_weather_years']} \ninput_data_prediction_path: {config['input_data_prediction_path']}")

### Import, process and save TGW data

In [3]:
# Import, process and save TGW weather data
# (run only once per dataset, e.g., run again to add new regions or years).
# For every configured weather year and every location listed below, this cell:
#   1. reads the raw hourly TGW CSV,
#   2. adds the model input features,
#   3. dumps the resulting data frame to
#      <input_data_prediction_path>/<location>/<scenario>/TGW_weather_<year>.joblib
start_time = time.time()

input_data_prediction_path = config['input_data_prediction_path']
TGW_weather_years = config['TGW_weather_years']

# List of TGW locations to create data frames for
TGW_locations = ['Concord', 'SanFrancisco', 'Austin', 'Greensboro']

# Loop through all TGW weather years and locations
for TGW_weather_year in TGW_weather_years:
    # Years before 2020 use the 'historical' scenario (1980-2019);
    # later years (2020-2099) use the future scenario chosen in the config.
    if int(TGW_weather_year) < 2020:
        TGW_scenario = 'historical'
    else:
        TGW_scenario = config['TGW_scenario']

    for TGW_location in TGW_locations:
        # Import raw TGW weather data (full calendar year: months 1 through 12)
        weather_data_path = f"main_folder/TGW/TGW_Distribution/Yearly/{TGW_location}/{TGW_scenario}/tgw_wrf_{TGW_scenario}_hourly_{TGW_weather_year}.csv"
        weather_df = input_ops.import_TGW_weather_data(weather_data_path, TGW_weather_year, TGW_location, start_month = 1, end_month = 12)

        # Add input features
        weather_df = model_ops.add_features_X_columns_D(weather_df)

        # Output directory for this location/scenario; exist_ok=True makes the
        # call a no-op when the directory is already there, so no separate
        # existence check is needed.
        TGW_weather_df_save_path = f"{input_data_prediction_path}/{TGW_location}/{TGW_scenario}/"
        os.makedirs(TGW_weather_df_save_path, exist_ok=True)

        # Save data frame
        print(f'saving joblib for {TGW_location} {TGW_weather_year}  {TGW_scenario}')
        joblib.dump(weather_df, os.path.join(TGW_weather_df_save_path, f"TGW_weather_{TGW_weather_year}.joblib"))

end_time = time.time()
print(f"Runtime for loading data: {(end_time - start_time) / 60:.2f} minutes")

### Load weather data frame

In [5]:
# Reload one saved data frame as a check: a fixed location and the first
# weather year listed in the config.
TGW_location = 'Greensboro'
TGW_weather_year = config['TGW_weather_years'][0]

# Start from the configured scenario; years before 2020 use 'historical' instead
TGW_scenario = config['TGW_scenario']
if int(TGW_weather_year) < 2020:
    TGW_scenario = 'historical'

# Same <path>/<location>/<scenario>/ layout that the saving cell writes to
TGW_weather_df_save_path = f"{config['input_data_prediction_path']}/{TGW_location}/{TGW_scenario}/"
loaded_weather_df = joblib.load(
    os.path.join(TGW_weather_df_save_path, f"TGW_weather_{TGW_weather_year}.joblib")
)