In [1]:
import pandas as pd

# File path to the weather data
weather_file = 'combined_weather_data.csv'
# Load the weather data
weather_data = pd.read_csv(weather_file, low_memory=False)

# Enable or disable debug mode
debug_mode = False  # Set to False to disable debugging outputs

In [2]:
# Ensure 'Date/Time (LST)' exists and convert to datetime
if 'Date/Time (LST)' in weather_data.columns:
    weather_data['Date'] = pd.to_datetime(weather_data['Date/Time (LST)'], errors='coerce').dt.date
else:
    print("Error: 'Date/Time (LST)' column not found in the dataset.")
    exit()

# Determine start and end dates of the filtered data
start_date_weather = weather_data['Date'].min()
end_date_weather = weather_data['Date'].max()

# Generate a complete date range
all_dates_weather = pd.date_range(start=start_date_weather, end=end_date_weather).date

# Identify missing dates
recorded_dates_weather = set(weather_data['Date'])
all_dates_set_weather = set(all_dates_weather)
missing_dates_weather = sorted(all_dates_set_weather - recorded_dates_weather)

# Count missing and overlapping dates
missing_days_count = len(missing_dates_weather)
overlapping_days_count = len(recorded_dates_weather)

# Print results
print(f"Weather Data Covers from {start_date_weather} to {end_date_weather}")
print(f"Total Days in Range: {len(all_dates_weather)}")
print(f"Missing Dates: {missing_days_count}")
print(f"Overlapping Days: {overlapping_days_count}")
print(f"Missing Dates List (first 10): {missing_dates_weather[:10]}")

Weather Data Covers from 2015-01-01 to 2024-12-31
Total Days in Range: 3653
Missing Dates: 0
Overlapping Days: 3653
Missing Dates List (first 10): []


In [3]:
# Display all columns in the dataset
print("Columns in Weather Data:")
print(weather_data.columns)

# Check for missing values in each column
missing_values = weather_data.isnull().sum()
total_rows = len(weather_data)

# Create a DataFrame for a clear overview
missing_summary = pd.DataFrame({
    'Column': weather_data.columns,
    'Missing Values': missing_values,
    'Percentage Missing': (missing_values / total_rows) * 100
})

# Display missing summary
missing_summary.sort_values(by='Percentage Missing', ascending=False, inplace=True)
print("\nMissing Values Summary:")
print(missing_summary)

Columns in Weather Data:
Index(['Longitude (x)', 'Latitude (y)', 'Station Name', 'Climate ID',
       'Date/Time (LST)', 'Year', 'Month', 'Day', 'Time (LST)', 'Temp (°C)',
       'Temp Flag', 'Dew Point Temp (°C)', 'Dew Point Temp Flag',
       'Rel Hum (%)', 'Rel Hum Flag', 'Precip. Amount (mm)',
       'Precip. Amount Flag', 'Wind Dir (10s deg)', 'Wind Dir Flag',
       'Wind Spd (km/h)', 'Wind Spd Flag', 'Visibility (km)',
       'Visibility Flag', 'Stn Press (kPa)', 'Stn Press Flag', 'Hmdx',
       'Hmdx Flag', 'Wind Chill', 'Wind Chill Flag', 'Weather', 'Date'],
      dtype='object')

Missing Values Summary:
                                  Column  Missing Values  Percentage Missing
Precip. Amount (mm)  Precip. Amount (mm)           87672          100.000000
Wind Dir Flag              Wind Dir Flag           87672          100.000000
Wind Chill Flag          Wind Chill Flag           87672          100.000000
Hmdx Flag                      Hmdx Flag           87672          100.0

In [4]:
# Define columns to drop
columns_to_drop = [
    "Longitude (x)", "Latitude (y)", "Station Name", "Climate ID",
    "Precip. Amount (mm)", "Hmdx Flag", "Wind Dir Flag", "Stn Press Flag",
    "Temp Flag", "Wind Chill Flag", "Wind Spd Flag", "Dew Point Temp Flag",
    "Rel Hum Flag", "Visibility Flag", "Precip. Amount Flag", "Hmdx",
    "Wind Chill", "Weather"
]

# Drop the specified columns
weather_data = weather_data.drop(columns=columns_to_drop, errors="ignore")


In [5]:
# Display all columns in the dataset
print("Columns in Weather Data:")
print(weather_data.columns)

# Check for missing values in each column
missing_values = weather_data.isnull().sum()
total_rows = len(weather_data)

# Create a DataFrame for a clear overview
missing_summary = pd.DataFrame({
    'Column': weather_data.columns,
    'Missing Values': missing_values,
    'Percentage Missing': (missing_values / total_rows) * 100
})

# Display missing summary
missing_summary.sort_values(by='Percentage Missing', ascending=False, inplace=True)
print("\nMissing Values Summary:")
print(missing_summary)

# Optional: Export the result to CSV
if debug_mode:
    weather_data.to_csv("justToCheck1.csv", index=False)

Columns in Weather Data:
Index(['Date/Time (LST)', 'Year', 'Month', 'Day', 'Time (LST)', 'Temp (°C)',
       'Dew Point Temp (°C)', 'Rel Hum (%)', 'Wind Dir (10s deg)',
       'Wind Spd (km/h)', 'Visibility (km)', 'Stn Press (kPa)', 'Date'],
      dtype='object')

Missing Values Summary:
                                  Column  Missing Values  Percentage Missing
Wind Dir (10s deg)    Wind Dir (10s deg)            4886            5.573045
Dew Point Temp (°C)  Dew Point Temp (°C)            1208            1.377863
Wind Spd (km/h)          Wind Spd (km/h)            1207            1.376722
Temp (°C)                      Temp (°C)            1206            1.375582
Rel Hum (%)                  Rel Hum (%)            1206            1.375582
Stn Press (kPa)          Stn Press (kPa)            1205            1.374441
Visibility (km)          Visibility (km)            1204            1.373300
Date/Time (LST)          Date/Time (LST)               0            0.000000
Year              

In [6]:
# Copy the filtered dataset to avoid unintended changes
weather_data_cleaned = weather_data.copy()

# Define filling strategy for each column
filling_strategy = {
    "Temp (°C)": "mean",
    "Dew Point Temp (°C)": "mean",
    "Rel Hum (%)": "mean",
    "Visibility (km)": "mean",
    "Wind Dir (10s deg)": "mode",
    "Wind Spd (km/h)": "mean",
    "Stn Press (kPa)": "mean"
}

# Fill missing values based on strategy
for column, strategy in filling_strategy.items():
    if strategy == "mean":
        weather_data_cleaned[column] = weather_data_cleaned[column].fillna(weather_data_cleaned[column].mean())
    elif strategy == "median":
        weather_data_cleaned[column] = weather_data_cleaned[column].fillna(weather_data_cleaned[column].median())
    elif strategy == "mode":
        mode_value = weather_data_cleaned[column].mode()[0]
        weather_data_cleaned[column] = weather_data_cleaned[column].fillna(mode_value)


# Verify if all missing values are handled
missing_summary_after_fill = weather_data_cleaned.isnull().sum()

print("\nMissing Values Summary After Filling:")
if missing_summary_after_fill[missing_summary_after_fill > 0].empty:
    print("All missing values have been filled successfully.")
else:
    print(missing_summary_after_fill[missing_summary_after_fill > 0])

# Display the first few rows of the cleaned dataset
print("\nPreview of the Cleaned Dataset:")
print(weather_data_cleaned.head())

# Optional: Export the result to CSV
if debug_mode:
    weather_data_cleaned.to_csv("justToCheck2.csv", index=False)


Missing Values Summary After Filling:
All missing values have been filled successfully.

Preview of the Cleaned Dataset:
    Date/Time (LST)  Year  Month  Day Time (LST)  Temp (°C)  \
0  2018-11-01 00:00  2018     11    1      00:00        2.9   
1  2018-11-01 01:00  2018     11    1      01:00        2.0   
2  2018-11-01 02:00  2018     11    1      02:00        2.3   
3  2018-11-01 03:00  2018     11    1      03:00        2.5   
4  2018-11-01 04:00  2018     11    1      04:00        3.0   

   Dew Point Temp (°C)  Rel Hum (%)  Wind Dir (10s deg)  Wind Spd (km/h)  \
0                 -2.3         69.0                34.0             32.0   
1                 -1.4         78.0                35.0             34.0   
2                 -2.3         72.0                33.0             31.0   
3                 -2.6         69.0                33.0             31.0   
4                 -3.0         65.0                34.0             36.0   

   Visibility (km)  Stn Press (kPa)       

In [7]:
# Function to assign time periods
def assign_time_period(time_str):
    hour = int(time_str.split(":")[0])
    if 6 <= hour < 12:
        return 0
    elif 12 <= hour < 18:
        return 1
    elif 18 <= hour < 24:
        return 2
    else:
        return 3

# Assign time periods
weather_data_cleaned["Time_Period"] = weather_data_cleaned["Time (LST)"].apply(assign_time_period)

# Select numeric columns for aggregation
numeric_columns = [
    "Temp (°C)", "Dew Point Temp (°C)", "Rel Hum (%)", "Wind Dir (10s deg)",
    "Wind Spd (km/h)", "Visibility (km)", "Stn Press (kPa)"
]

# Group by Date and Time Period
aggregated_weather = (
    weather_data_cleaned.groupby(["Date", "Time_Period"])[numeric_columns]
    .agg({
        "Temp (°C)": "var",  # Variance for temperature
        "Dew Point Temp (°C)": "mean",
        "Rel Hum (%)": "mean",
        "Wind Dir (10s deg)": "mean",
        "Wind Spd (km/h)": "mean",
        "Visibility (km)": "mean",
        "Stn Press (kPa)": "mean"
    })
    .reset_index()
)

# Display a preview of the aggregated data
print("\nPreview of Aggregated Weather Data with Variance for Temp (°C):")
print(aggregated_weather.head())

# Optional: Export the result to CSV
if debug_mode:
    aggregated_weather.to_csv("justToCheck3.csv", index=False)



Preview of Aggregated Weather Data with Variance for Temp (°C):
         Date  Time_Period  Temp (°C)  Dew Point Temp (°C)  Rel Hum (%)  \
0  2015-01-01            0   0.786667            -8.383333    66.000000   
1  2015-01-01            1   0.809667            -5.000000    69.833333   
2  2015-01-01            2   1.049667            -6.133333    83.000000   
3  2015-01-01            3   2.957667            -8.500000    63.166667   
4  2015-01-02            0   2.687000            -6.516667    93.666667   

   Wind Dir (10s deg)  Wind Spd (km/h)  Visibility (km)  Stn Press (kPa)  
0           21.833333         8.666667            44.25        88.925000  
1           22.000000         3.833333            55.00        88.840000  
2           21.000000         4.500000            24.10        88.613333  
3           20.166667         6.833333            24.10        88.985000  
4           11.000000        16.000000             0.70        87.815000  


In [8]:
# Convert the 'Date' column to datetime format
aggregated_weather["Date"] = pd.to_datetime(aggregated_weather["Date"], errors="coerce")

# Drop rows with invalid dates
aggregated_weather = aggregated_weather.dropna(subset=["Date"])

# Add Day of the Week feature (0=Monday, 6=Sunday)
aggregated_weather["Day_Of_Week"] = aggregated_weather["Date"].dt.dayofweek

# Add Weekday/Weekend feature (1=Weekend, 0=Weekday)
aggregated_weather["Is_Weekend"] = aggregated_weather["Day_Of_Week"].apply(lambda x: 1 if x >= 5 else 0)

# Define Canadian statutory holidays for 2017-2019
canadian_holidays = [
    "2017-01-01", "2017-02-20", "2017-04-14", "2017-05-22", "2017-07-01",
    "2017-09-04", "2017-10-09", "2017-11-11", "2017-12-25", "2017-12-26",
    "2018-01-01", "2018-02-19", "2018-03-30", "2018-05-21", "2018-07-01",
    "2018-09-03", "2018-10-08", "2018-11-11", "2018-12-25", "2018-12-26",
    "2019-01-01", "2019-02-18", "2019-04-19", "2019-05-20", "2019-07-01",
    "2019-09-02", "2019-10-14", "2019-11-11", "2019-12-25", "2019-12-26",
]

# Convert the list of holidays to datetime for comparison
canadian_holidays = pd.to_datetime(canadian_holidays)

# Add a holiday indicator
aggregated_weather["Is_Holiday"] = aggregated_weather["Date"].apply(
    lambda x: 1 if x in canadian_holidays else 0
)

# Verify the new features
print(aggregated_weather[["Date", "Is_Holiday"]].query("Is_Holiday == 1").head())

# Save the updated dataset to a new CSV file
aggregated_weather.to_csv("xydata_weather.csv", index=False)
print("Updated Final Dataset saved as 'xydata_weather.csv'.")

           Date  Is_Holiday
2924 2017-01-01           1
2925 2017-01-01           1
2926 2017-01-01           1
2927 2017-01-01           1
3124 2017-02-20           1
Updated Final Dataset saved as 'xydata_weather.csv'.
