In [34]:
import os
import sys
import datetime

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Add the parent of the current working directory to the Python path so that
# the local `classes` package (which provides WeatherDataTransform) is importable.
# NOTE: despite its name, CURRENT_DIR holds the *parent* of os.getcwd().
CURRENT_DIR = os.path.normpath(os.path.dirname(os.getcwd()))
# Only append once: re-running this cell must not keep growing sys.path.
if CURRENT_DIR not in sys.path:
    sys.path.append(CURRENT_DIR)

from classes.weather_data_transform import WeatherDataTransform


In [35]:
weather_data_dir = "../../data/weather_data"
# Normalise the directory once instead of on every path join.
weather_data_path = os.path.normpath(weather_data_dir)

# Stems (file names without the ".parquet" extension) of the raw input files.
# Files generated by this notebook ("*_transformed.parquet" and
# "weather_data_combined.parquet") are skipped, so re-running this cell never
# feeds previously generated output back into the transformation.
sorted_filenames = sorted(
    os.path.splitext(file)[0]
    for file in os.listdir(weather_data_path)
    if file.endswith(".parquet")
    and "transformed" not in file
    and "combined" not in file
)
# NOTE(review): these paths carry no file extension and are not used in this
# cell; the name is kept in case a later cell relies on it.
sorted_file_dirs = [
    os.path.join(weather_data_path, file) for file in sorted_filenames
]

for file in sorted_filenames:
    # Input file and the location of its transformed counterpart
    file_path = os.path.join(weather_data_path, f"{file}.parquet")
    save_path = os.path.join(weather_data_path, f"{file}_transformed.parquet")
    # Transform data
    df = WeatherDataTransform.transform_individual_locations(
        file_path=file_path
    )
    # Save transformed data next to the raw file
    df.to_parquet(save_path)

In [36]:
# Preview the transformation result for a single raw file (Crosby) as a sanity check.
crosby_file_path = "../../data/weather_data/3316_crosby_weather_data.parquet"
df_transformed = WeatherDataTransform.transform_individual_locations(crosby_file_path)
df_transformed.head()

Unnamed: 0_level_0,day,minutes_after_midnight,temperature,weather,hour_number,hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-01-26 19:00:00,2024-01-26,1140,7.0,0,19,19:00:00
2024-01-26 20:00:00,2024-01-26,1200,6.7,0,20,20:00:00
2024-01-26 21:00:00,2024-01-26,1260,6.9,2,21,21:00:00
2024-01-26 22:00:00,2024-01-26,1320,8.0,4,22,22:00:00
2024-01-26 23:00:00,2024-01-26,1380,7.3,4,23,23:00:00


In [37]:
weather_data_dir = "../../data/weather_data"
# Normalise the directory once instead of on every path join.
weather_data_path = os.path.normpath(weather_data_dir)

# Dictionary keys, listed in the order in which they are paired with the
# alphabetically sorted transformed files below.
# NOTE(review): the keys are bound to files purely by position; presumably the
# file names start with a numeric station id that yields this order - confirm.
# NOTE(review): "crossby_df" is spelled differently from the "crosby" file name;
# the key is left unchanged because generate_all_information_df may depend on it.
station_keys = [
    "crossby_df",
    "bingley_df",
    "rostherne_df",
    "watnall_df",
    "coleshill_df",
    "heathrow_df",
    "thorney_df",
]

# Per-station transformed files only; anything derived from the combined
# output is skipped so it cannot shift the positional pairing.
sorted_transformed_filenames = sorted(
    file
    for file in os.listdir(weather_data_path)
    if file.endswith(".parquet")
    and "transformed" in file
    and "combined" not in file
)
sorted_transformed_file_dirs = [
    os.path.join(weather_data_path, file) for file in sorted_transformed_filenames
]

# Fail loudly instead of silently mislabelling stations (extra file) or
# raising a bare IndexError (missing file).
if len(sorted_transformed_file_dirs) != len(station_keys):
    raise ValueError(
        f"Expected {len(station_keys)} transformed files in {weather_data_path}, "
        f"found {len(sorted_transformed_file_dirs)}: {sorted_transformed_filenames}"
    )

df_dict = {
    key: pd.read_parquet(path)
    for key, path in zip(station_keys, sorted_transformed_file_dirs)
}

df_combined = WeatherDataTransform.generate_all_information_df(
    df_dict
)

save_path = os.path.join(weather_data_path, "weather_data_combined.parquet")
df_combined.to_parquet(save_path)

In [38]:
# Weather codes as they appear in the raw data, with their descriptions.
weather_dict_original = {
    0: "Clear night",
    1: "Sunny day",
    2: "Partly cloudy (night)",
    3: "Partly cloudy (day)",
    4: "Not used",
    5: "Mist",
    6: "Fog",
    7: "Cloudy",
    8: "Overcast",
    9: "Light rain shower (night)",
    10: "Light rain shower (day)",
    11: "Drizzle",
    12: "Light rain",
    13: "Heavy rain shower (night)",
    14: "Heavy rain shower (day)",
    15: "Heavy rain",
    16: "Sleet shower (night)",
    17: "Sleet shower (day)",
    18: "Sleet",
    19: "Hail shower (night)",
    20: "Hail shower (day)",
    21: "Hail",
    22: "Light snow shower (night)",
    23: "Light snow shower (day)",
    24: "Light snow",
    25: "Heavy snow shower (night)",
    26: "Heavy snow shower (day)",
    27: "Heavy snow",
    28: "Thunder shower (night)",
    29: "Thunder shower (day)",
    # Code 30 is plain "Thunder" in the Met Office weather-type table.
    30: "Thunder"
}

# Reduced set of categories: day/night and shower/non-shower variants of the
# same phenomenon are collapsed into a single code.
weather_dict_merged = {
    0: "Clear night",
    1: "Sunny day",
    2: "Partly cloudy",
    3: "Not used",
    4: "Mist",
    5: "Fog",
    6: "Cloudy",
    7: "Overcast",
    8: "Light rain",
    9: "Drizzle",
    10: "Heavy rain",
    11: "Sleet",
    12: "Hail",
    13: "Light snow",
    14: "Heavy snow",
    15: "Thunder"
}

# Original code -> merged code.
weather_dict_mapping = {
    0: 0,
    1: 1,
    2: 2,
    3: 2,
    4: 3,
    5: 4,
    6: 5,
    7: 6,
    8: 7,
    9: 8,
    10: 8,
    11: 9,
    12: 8,
    13: 10,
    14: 10,
    15: 10,
    16: 11,
    17: 11,
    18: 11,
    19: 12,
    20: 12,
    21: 12,
    22: 13,
    23: 13,
    24: 13,
    25: 14,
    26: 14,
    27: 14,
    28: 15,
    29: 15,
    30: 15
}

# Sanity checks: every original code is mapped, and every merged code is used.
assert set(weather_dict_mapping) == set(weather_dict_original), "unmapped weather code"
assert set(weather_dict_mapping.values()) == set(weather_dict_merged), "unused or unknown merged code"

# Things to do

* Decide which columns are the important columns. Think about what information is available for the predictions
* Remove outliers for the chosen columns and for all datasets
* Once cleaned, it's time for feature engineering:
    * Let's use a weighted-average temperature based on the population of each area, e.g. if London has 10% of the population, then it contributes london_temp*0.1
    * Try to use an "average" climate condition: e.g. cloudy, sunny... It could be encoded as a number, but it wouldn't make much sense in real life
