In [1]:
import os
import sys
import datetime

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Add the parent of the current working directory to the Python path so the
# project-local WeatherDataTransform class can be imported below.
# NOTE: despite its name, CURRENT_DIR holds the *parent* of os.getcwd()
# (os.path.dirname strips the last path component), presumably the folder
# that contains the `classes` package - verify against the repository layout.
CURRENT_DIR = os.path.normpath(os.path.dirname(os.getcwd()))
# parent_dir = os.path.normpath(os.path.dirname(CURRENT_DIR))
sys.path.append(CURRENT_DIR)

from classes.weather_data_transform import WeatherDataTransform


In [2]:
weather_data_dir = "../../data/weather_data"
# Normalise the directory once instead of on every path operation
weather_data_path = os.path.normpath(weather_data_dir)

# Base names (extension removed) of the raw parquet files.
# * endswith() only accepts real ".parquet" files (a substring test would also
#   match e.g. "x.parquet.bak").
# * splitext() strips only the final extension, so names that contain extra
#   dots are not truncated.
# * Files whose name contains "transformed" are outputs of a previous run of
#   this cell and are skipped so they are not transformed again.
sorted_filenames = sorted(
    os.path.splitext(file)[0]
    for file in os.listdir(weather_data_path)
    if file.endswith(".parquet") and "transformed" not in file
)
# Full paths of the raw input files, in the same order as sorted_filenames
sorted_file_dirs = [
    os.path.join(weather_data_path, f"{file}.parquet") for file in sorted_filenames
]

for file, file_path in zip(sorted_filenames, sorted_file_dirs):
    # The transformed copy is written next to its source file
    save_path = os.path.join(weather_data_path, f"{file}_transformed.parquet")
    # Transform data
    df = WeatherDataTransform.transform_individual_locations(
        file_path=file_path
    )
    # Save transformed data
    df.to_parquet(save_path)

In [3]:
# Transform a single raw file so the resulting columns can be inspected
crosby_file_path = "../../data/weather_data/3316_crosby_weather_data.parquet"
df_transformed = WeatherDataTransform.transform_individual_locations(crosby_file_path)
df_transformed.head()

Unnamed: 0_level_0,day,minutes_after_midnight,temperature,weather,hour_number,hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-01-26 19:00:00,2024-01-26,1140,7.0,0,19,19:00:00
2024-01-26 20:00:00,2024-01-26,1200,6.7,0,20,20:00:00
2024-01-26 21:00:00,2024-01-26,1260,6.9,2,21,21:00:00
2024-01-26 22:00:00,2024-01-26,1320,8.0,4,22,22:00:00
2024-01-26 23:00:00,2024-01-26,1380,7.3,4,23,23:00:00


In [4]:
# df_transformed.loc[(df_transformed.index > "02-10-2024") & (df_transformed.index < "02-17-2024")]["temperature"].plot()

In [5]:
# Lookup from the original integer weather code to a human-readable description.
# Several conditions have separate "(night)" and "(day)" codes.
# NOTE(review): presumably the weather-type code list of the data provider -
# confirm against the data source documentation.
weather_dict_original = {
    0: "Clear night",
    1: "Sunny day",
    2: "Partly cloudy (night)",
    3: "Partly cloudy (day)",
    4: "Not used",
    5: "Mist",
    6: "Fog",
    7: "Cloudy",
    8: "Overcast",
    9: "Light rain shower (night)",
    10: "Light rain shower (day)",
    11: "Drizzle",
    12: "Light rain",
    13: "Heavy rain shower (night)",
    14: "Heavy rain shower (day)",
    15: "Heavy rain",
    16: "Sleet shower (night)",
    17: "Sleet shower (day)",
    18: "Sleet",
    19: "Hail shower (night)",
    20: "Hail shower (day)",
    21: "Hail",
    22: "Light snow shower (night)",
    23: "Light snow shower (day)",
    24: "Light snow",
    25: "Heavy snow shower (night)",
    26: "Heavy snow shower (day)",
    27: "Heavy snow",
    28: "Thunder shower (night)",
    29: "Thunder shower (day)",
    # NOTE(review): the trailing "?" suggests this label was never confirmed -
    # verify the meaning of code 30 against the code list's source.
    30: "Thunder?"
}

# Reduced set of weather categories (codes 0-15) with their descriptions.
# Compared with weather_dict_original, the "(night)"/"(day)" and shower
# variants of a condition share a single code here; "Clear night" and
# "Sunny day" remain separate categories.
weather_dict_merged = {
    0: "Clear night",
    1: "Sunny day",
    2: "Partly cloudy",
    3: "Not used",
    4: "Mist",
    5: "Fog",
    6: "Cloudy",
    7: "Overcast",
    8: "Light rain",
    9: "Drizzle",
    10: "Heavy rain",
    11: "Sleet",
    12: "Hail",
    13: "Light snow",
    14: "Heavy snow",
    15: "Thunder"
}

# Translation table: original weather code (key, see weather_dict_original)
# -> merged weather code (value, see weather_dict_merged).
weather_dict_mapping = {
    0: 0,  # Clear night
    1: 1,  # Sunny day
    # Partly cloudy (night/day) -> Partly cloudy
    2: 2,
    3: 2,
    4: 3,  # Not used
    5: 4,  # Mist
    6: 5,  # Fog
    7: 6,  # Cloudy
    8: 7,  # Overcast
    # Light rain showers (9, 10) and light rain (12) -> Light rain
    9: 8,
    10: 8,
    11: 9,  # Drizzle keeps its own category
    12: 8,
    # Heavy rain showers and heavy rain -> Heavy rain
    13: 10,
    14: 10,
    15: 10,
    # Sleet showers and sleet -> Sleet
    16: 11,
    17: 11,
    18: 11,
    # Hail showers and hail -> Hail
    19: 12,
    20: 12,
    21: 12,
    # Light snow showers and light snow -> Light snow
    22: 13,
    23: 13,
    24: 13,
    # Heavy snow showers and heavy snow -> Heavy snow
    25: 14,
    26: 14,
    27: 14,
    # Thunder showers and thunder -> Thunder
    28: 15,
    29: 15,
    30: 15
}

In [6]:
weather_data_dir = "../../data/weather_data"
transformed_dir = os.path.normpath(weather_data_dir)

# Names of the transformed parquet files, in alphabetical order
sorted_filenames = sorted(
    name
    for name in os.listdir(transformed_dir)
    if ".parquet" in name and "transformed" in name
)
sorted_file_dirs = [os.path.join(transformed_dir, name) for name in sorted_filenames]

# Dictionary keys, listed in the position their file is expected to take in the
# sorted listing above: the pairing of key and file is purely positional.
location_keys = (
    "crossby_df",
    "bingley_df",
    "rostherne_df",
    "watnall_df",
    "coleshill_df",
    "heathrow_df",
    "thorney_df",
)
# One DataFrame per location, read from the file at the matching position
df_dict = {
    key: pd.read_parquet(sorted_file_dirs[position])
    for position, key in enumerate(location_keys)
}

In [7]:
def calculate_scaled_temperature(
        df_dict: dict[str, pd.DataFrame],
        population_scaling: dict[str, float]
) -> pd.Series:
    """Combine the "temperature" columns of several locations into one weighted sum.

    Each DataFrame's "temperature" column is multiplied by the scaling factor
    stored under the same key in ``population_scaling`` and the products are
    added together. The addition aligns on the index, so an index label that is
    missing from any of the DataFrames yields NaN for that label.

    Args:
        * df_dict (dict[str, pd.DataFrame]): DataFrames keyed by name; each one
          must contain a "temperature" column.
        * population_scaling (dict[str, float]): Scaling factor for every key
          of ``df_dict``.

    Returns:
        * pd.Series: Sum of the scaled temperature columns. An empty float
          Series named "temperature" when ``df_dict`` is empty.

    Raises:
        * KeyError: If a key of ``df_dict`` is missing from
          ``population_scaling`` or a DataFrame has no "temperature" column.
    """
    if not df_dict:
        # Keep the declared return type instead of handing back the scalar 0
        return pd.Series(dtype="float64", name="temperature")

    # sum() starts from the scalar 0, which the first Series absorbs
    return sum(
        df["temperature"] * population_scaling[df_name]
        for df_name, df in df_dict.items()
    )

def get_max_weather(
        weather_vals: pd.Series, population_scaling: dict[str, float]
) -> int:
    """Return the weather value that carries the largest total weight.

    For every distinct value in ``weather_vals`` the weights of the index
    labels reporting that value are added up, and the value with the highest
    total is returned. Index labels without an entry in ``population_scaling``
    contribute no weight. When several values share the highest total, the one
    that appears first in ``weather_vals`` wins, so the result does not depend
    on set iteration order.

    Args:
        * weather_vals (pd.Series): Weather values indexed by location name
          (one row of the per-location weather table).
        * population_scaling (dict[str, float]): Weight of each location name.

    Returns:
        * int: The winning weather value, taken unchanged from ``weather_vals``.

    Raises:
        * ValueError: If ``weather_vals`` is empty.
    """
    def total_weight(weather) -> float:
        # Locations reporting this weather value
        weather_locations = weather_vals.index[weather_vals == weather]
        return sum(
            population_scaling[location]
            for location in weather_locations
            if location in population_scaling
        )

    # dict.fromkeys removes duplicates while keeping first-appearance order;
    # max() returns the first maximal item, which makes ties deterministic.
    unique_weathers = list(dict.fromkeys(weather_vals))
    return max(unique_weathers, key=total_weight)

def generate_all_information_df(df_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Combine the per-location weather data into one population-weighted frame.

    The DataFrames are first restricted to the index labels they all share.
    The result then holds, for each shared label, the population-weighted
    temperature and the weather value with the highest population weight.

    Weights are normalised over the locations actually present in ``df_dict``,
    so they sum to 1 even when only a subset of the known locations is passed.

    Args:
        * df_dict (dict[str, pd.DataFrame]): DataFrames keyed by location name
          (the keys of the population table below); each one must contain
          "temperature" and "weather" columns.

    Returns:
        * pd.DataFrame: Columns "temperature" and "weather", indexed by the
          sorted index labels common to all input DataFrames.

    Raises:
        * KeyError: If a key of ``df_dict`` has no entry in the population table.
        * IndexError: If ``df_dict`` is empty.
    """
    # Create a dict containing the population in millions
    # of the closest big city
    area_population = {
        "crossby_df": 0.9, # Liverpool
        "bingley_df": 0.8, # Leeds
        "rostherne_df": 2.9, # Manchester
        "watnall_df": 0.8, # Nottingham
        "coleshill_df": 4.3, # Birmingham
        "heathrow_df": 9.5, # London
        "thorney_df": 1.5 # Southampton and Portsmouth
    }
    # Calculate the population ratio of each supplied location. Only locations
    # present in df_dict take part, otherwise the weights would not sum to 1.
    total_population = sum(
        population
        for name, population in area_population.items()
        if name in df_dict
    )
    population_scaling = {
        name: population / total_population
        for name, population in area_population.items()
        if name in df_dict
    }

    # Find the common indices
    indices_list = [df.index for df in df_dict.values()]
    common_indices = sorted(set(indices_list[0]).intersection(*indices_list[1:]))
    # Keep the common indices for each of the dataframes
    df_dict = {df_name: df.loc[common_indices] for df_name, df in df_dict.items()}

    # Extract scaled temperature across all the locations
    scaled_temperature = calculate_scaled_temperature(
        df_dict, population_scaling
    )

    # Gather the "weather" column of every location, one column per location
    weather_df = pd.DataFrame(
        {df_name: df["weather"] for df_name, df in df_dict.items()}
    )
    # Create a column containing the weather value with the highest weight
    max_weather = weather_df.apply(
        get_max_weather, axis=1, population_scaling=population_scaling
    )

    # Create a dataframe that combines the scaled temperature
    # and the highest weighted weather value
    df_combined = pd.DataFrame(
        {
            "temperature": scaled_temperature,
            "weather": max_weather
        }
    )

    return df_combined

In [8]:
# Index of every location's DataFrame
indices_list = [frame.index for frame in df_dict.values()]

# Labels present in all of the indexes, in ascending order
first_index, *other_indices = indices_list
common_indices = sorted(set(first_index).intersection(*other_indices))

# Restrict every DataFrame to the shared labels
df_dict = {name: frame.loc[common_indices] for name, frame in df_dict.items()}

In [9]:
# Population figure used as the weight of each location
area_population = dict(
    crossby_df=0.9,
    bingley_df=0.8,
    rostherne_df=2.9,
    watnall_df=0.8,
    coleshill_df=4.3,
    heathrow_df=9.5,
    thorney_df=1.5,
)
total_population = sum(area_population.values())
# Share of the total population attributed to each location
population_scaling = {
    name: population / total_population
    for name, population in area_population.items()
}
population_scaling


{'crossby_df': 0.04347826086956522,
 'bingley_df': 0.03864734299516909,
 'rostherne_df': 0.14009661835748793,
 'watnall_df': 0.03864734299516909,
 'coleshill_df': 0.20772946859903382,
 'heathrow_df': 0.4589371980676329,
 'thorney_df': 0.07246376811594203}

In [10]:
# Running total: starts as the scalar 0 and absorbs each weighted column
scaled_temperature_all = 0
weather_list = []
# Walk through the locations and add their weighted temperature to the total
for df_name in df_dict:
    df = df_dict[df_name]
    # Weight this location's "temperature" column by its population share
    scaled_temperature = df["temperature"] * population_scaling[df_name]
    scaled_temperature_all += scaled_temperature

scaled_temperature_all

date
2024-01-26 19:00:00     5.796135
2024-01-26 20:00:00     5.328502
2024-01-26 21:00:00     4.328019
2024-01-26 22:00:00     4.849758
2024-01-26 23:00:00     3.576329
                         ...    
2024-04-09 17:00:00    10.667150
2024-04-09 18:00:00    10.294686
2024-04-09 19:00:00     9.450725
2024-04-09 20:00:00     9.010628
2024-04-09 21:00:00     7.912077
Name: temperature, Length: 1725, dtype: float64

In [11]:
# def get_max_weather_extra_params(weather_vals: pd.Series, population_scaling: dict):
#     weather_aggregates = []
#     unique_weathers = set(weather_vals)
#     for weather in unique_weathers:
#         weather_locations = weather_vals[weather_vals == weather].index.to_list()
#         aggregate = sum([population_scaling[location] for location in weather_locations if location in population_scaling])
#         weather_aggregates.append(aggregate)
#         # print(f"Sum of weights for weather {weather} is {aggregate}")
#     index_max = weather_aggregates.index(max(weather_aggregates))
#     # print(f"Weather to keep is: {list(unique_weathers)[index_max]}")

#     return list(unique_weathers)[index_max]

# def get_max_weather(weather_vals: pd.Series):
#     weather_aggregates = []
#     unique_weathers = set(weather_vals)
#     for weather in unique_weathers:
#         weather_locations = weather_vals[weather_vals == weather].index.to_list()
#         aggregate = sum([population_scaling[location] for location in weather_locations if location in population_scaling])
#         weather_aggregates.append(aggregate)
#         # print(f"Sum of weights for weather {weather} is {aggregate}")
#     index_max = weather_aggregates.index(max(weather_aggregates))
#     # print(f"Weather to keep is: {list(unique_weathers)[index_max]}")

#     return list(unique_weathers)[index_max]


In [13]:
# Initialize an empty DataFrame to store the "weather" column from each DataFrame
weather_df = pd.DataFrame()

# Iterate over each key-value pair in df_dict
for df_name, df in df_dict.items():
    # Concatenate the "weather" column from the current DataFrame to weather_df
    weather_df[df_name] = df['weather']

weather_df["max_weather"] = weather_df.apply(get_max_weather, axis=1, population_scaling=population_scaling)
weather_df.head()

Unnamed: 0_level_0,crossby_df,bingley_df,rostherne_df,watnall_df,coleshill_df,heathrow_df,thorney_df,max_weather_extra_params,max_weather
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-01-26 19:00:00,0,0,0,0,0,0,0,0,0
2024-01-26 20:00:00,0,2,0,0,6,0,0,0,0
2024-01-26 21:00:00,2,0,0,0,0,0,0,0,0
2024-01-26 22:00:00,4,6,0,0,7,0,0,0,0
2024-01-26 23:00:00,4,0,0,2,7,0,4,0,0


In [None]:
# Rows where the Heathrow value differs from the weighted "max_weather" value
heathrow_differs = weather_df["heathrow_df"] != weather_df["max_weather"]
weather_df[heathrow_differs]

In [None]:
# Final frame: weighted temperature next to the highest-weighted weather value
df_combined = pd.DataFrame(
    dict(
        temperature=scaled_temperature_all,
        weather=weather_df["max_weather"],
    )
)

# Things to do

* Decide which columns are the important columns. Think about what information is available for the predictions
* Remove outliers for the chosen columns and for all datasets
* Once cleaned, it's time for feature engineering:
    * Let's use a population-weighted average temperature, e.g. if London has 10% of the population, it contributes london_temp*0.1 to the average
    * Try to use an "average" climate condition: e.g. cloudy, sunny... It could be encoded as a number, but it wouldn't make much sense in real life
