In [1]:
# all the imports you should need for this notebook
import pandas as pd
from datetime import datetime

In [None]:
# This only needs to be run ONCE to fix the broken datetimes in the hail and tornado data.
# It will create new files with the fixed datetimes and a combined file for sharing.
# If you run this again, it will overwrite the existing files, so be careful!

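# The combined file is uploaded to the Google Cloud Storage bucket of the ExtremeWeatherBench project.
# The path recorded here is gs://extremeweatherbench/datasets/AustralianLSRData_2020-2024.csv
# NOTE(review): this cell writes CanadaLSRData_2020-2024.csv, not an Australian file -- confirm the bucket path above.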

def parse_datetime(row):
    """
    Combine the separate date/time entries of one row into a single datetime.

    Reads the 'Day', 'Month', 'Year' and 'Time (UTC)' entries of ``row``,
    zero-pads day and month to two digits and the time to four digits (HHMM),
    and parses the result with the format '%m/%d/%Y %H%M'.

    Integer-valued floats (e.g. ``5.0``) are converted to ints first. pandas
    stores an integer column as float when the column contains missing values,
    and ``str(5.0)`` would otherwise produce the unparseable text ``"5.0"``.

    Args:
        row: A pandas Series (or any mapping) providing the keys
            'Day', 'Month', 'Year' and 'Time (UTC)'.

    Returns:
        A naive datetime object (no tzinfo is attached).

    Raises:
        KeyError: If one of the four required entries is absent from ``row``.
        ValueError: If the assembled string does not match '%m/%d/%Y %H%M',
            e.g. when one of the entries is missing (NaN).
    """
    def _as_text(value):
        # 5.0 -> 5 so that the text is "5" rather than "5.0"; NaN and
        # non-integer floats are left untouched and fail in strptime below.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value).strip()

    padded_day = _as_text(row['Day']).zfill(2)
    padded_month = _as_text(row['Month']).zfill(2)
    # Time is stored as HHMM without leading zeros (e.g. 930 for 09:30).
    padded_time = _as_text(row['Time (UTC)']).zfill(4)
    year = _as_text(row['Year'])

    date_string = f"{padded_month}/{padded_day}/{year} {padded_time}"
    return datetime.strptime(date_string, '%m/%d/%Y %H%M')


# # fix the broken datetimes in the australian hail data
# hail_file = "/home/amy/NHP_edited.csv"
# df_hail = pd.read_csv(hail_file, delimiter=',', engine='python', parse_dates=['Date & Time of Observation'], infer_datetime_format=True)

# # rename the date column to match the tornado data
# df_hail.rename(columns={'x': 'Longitude', 'y': 'Latitude'}, inplace=True)
# df_hail.rename(columns={'Date & Time of Observation': 'Date/Time UTC'}, inplace=True)
# df_hail = df_hail[['Date/Time UTC', 'Latitude', 'Longitude', 'Event Name', "Maximum Hail Dimension (mm)"]]

# # Create a report_type column
# df_hail['report_type'] = 'hail'

# # save the new file
# new_hail_file = '/home/amy/NHP_2020-2024_cleaned_fixed.csv'
# df_hail.to_csv(new_hail_file, index=False)

# read in the second hail report file
hail_file2 = "/home/amy/integrated_canadian_hail_db_v1.1_cleaned.csv"
# 'Start Time' is parsed into datetimes while reading. The infer_datetime_format
# argument is intentionally not passed: it is deprecated since pandas 2.0.
df_hail2 = pd.read_csv(hail_file2, delimiter=',', engine='python', parse_dates=['Start Time'])

# rename the date and hail-size columns to the names used in the combined output
df_hail2 = df_hail2.rename(
    columns={'Start Time': 'Date/Time UTC', 'Hail Diameter (mm)': 'Maximum Hail Dimension (mm)'}
)
# keep only the columns that are shared downstream; .copy() gives an independent
# frame so that adding a column below does not raise SettingWithCopyWarning
df_hail2 = df_hail2[
    ['Date/Time UTC', 'Latitude', 'Longitude', 'Reference Object', 'Maximum Hail Dimension (mm)']
].copy()

# Create a report_type column
df_hail2['report_type'] = 'hail'

# save the new file
new_hail_file2 = '/home/amy/integrated_canadian_hail_db_2020-2022_fixed.csv'
df_hail2.to_csv(new_hail_file2, index=False)

# create a new hail dataframe with only severe reports (>= 20 mm)
# (only one hail source is active at the moment, so the concat has a single input)
df_hail_combined = pd.concat([df_hail2], ignore_index=True)
df_hail_severe = df_hail_combined[df_hail_combined['Maximum Hail Dimension (mm)'] >= 20]

# save the new file
new_hail_severe_file = '/home/amy/NHP_2020-2024_severe_fixed.csv'
df_hail_severe.to_csv(new_hail_severe_file, index=False)

# fix the broken date-times in the tornado data
tor_file = "/home/amy/NTP_edited2.csv"

# read the raw table; the date is spread over the Day, Month, Year and Time (UTC) columns
df_tor = pd.read_csv(tor_file, delimiter=',', engine='python')
# Apply parse_datetime row by row to create the 'Date/Time UTC' column
df_tor['Date/Time UTC'] = df_tor.apply(parse_datetime, axis=1)
df_tor = df_tor.rename(columns={'x': 'Longitude', 'y': 'Latitude'})

df_tor = df_tor[['Date/Time UTC', 'Latitude', 'Longitude', 'Event Name', 'Event Type', "Damage"]]

# subselect only the tornado events on the ground (no waterspouts etc);
# .copy() gives an independent frame so that adding a column below does not
# raise SettingWithCopyWarning
df_tor = df_tor[df_tor['Event Type'] == 'tornado_over_land'].copy()

# Create a report_type column
df_tor['report_type'] = 'tor'

# save the new file
new_tor_file = '/home/amy/CanadaTor2020-2024_cleaned_fixed.csv'
df_tor.to_csv(new_tor_file, index=False)

# Destination of the shareable file holding both report types
combined_file = '/home/amy/CanadaLSRData_2020-2024.csv'

# Stack the severe hail reports on top of the tornado reports with a fresh index,
# then write the result to disk
df_combined = pd.concat(
    [df_hail_severe, df_tor],
    ignore_index=True,
)
df_combined.to_csv(combined_file, index=False)

print("DONE")
# Show the column dtypes of the combined frame as a quick sanity check
print(df_combined.dtypes)