# short notebook to deal with data cleanup for Australia's hail and tornado data

In [1]:
# all the imports you should need for this notebook
import pandas as pd
import numpy as np

In [None]:
# This only needs to be run ONCE to fix the broken datetimes in the hail and tornado data.
# It will create new files with the fixed datetimes and a combined file for sharing.
# If you run this again, it will overwrite the existing files, so be careful!

# the combined file is uploaded to the google bucket in the ExtremeWeatherBench project
# and can be accessed at gs://extremeweatherbench/datasets/AustralianLSRData_2020-2024.csv

# fix the broken datetimes in the australian hail data
hail_file = '/home/amy/AustralianHailData_2020-2024_cleaned.csv'
df_hail = pd.read_csv(hail_file, delimiter=',', engine='python', parse_dates=['UTC'], infer_datetime_format=True)
df_hail = df_hail[['UTC', 'Latitude', 'Longitude', 'Nearest town', 'State', "Hail size"]]
# rename the date column to match the tornado data
df_hail.rename(columns={'UTC': 'Date/Time UTC'}, inplace=True)

# Crate a report_type column
df_hail['report_type'] = 'hail'

# save the new file
new_hail_file = '/home/amy/AustralianHailData_2020-2024_cleaned_fixed.csv'
df_hail.to_csv(new_hail_file, index=False)

# fix the broken date-times in the tornado data
tor_file = '/home/amy/AustralianTornadoData_2020-2024_cleaned.csv'
df_tor = pd.read_csv(tor_file, delimiter=',', engine='python', parse_dates=['Date/Time UTC'], infer_datetime_format=True)
df_tor = df_tor[['Date/Time UTC', 'Latitude', 'Longitude', 'Nearest town', 'State', "Fujita scale"]]

# Crate a report_type column
df_tor['report_type'] = 'tor'

# save the new file
new_tor_file = '/home/amy/AustralianTornadoData_2020-2024_cleaned_fixed.csv'
df_tor.to_csv(new_tor_file, index=False)

# create a combined file for sharing
df_combined = pd.concat([df_hail, df_tor], ignore_index=True)
combined_file = '/home/amy/AustralianLSRData_2020-2024.csv'

# save the combined file
df_combined.to_csv(combined_file, index=False)

print ("DONE")
#print the types of the dataframe
print(df_combined.dtypes)

DONE
Date/Time UTC    datetime64[ns]
Latitude                float64
Longitude               float64
Nearest town             object
State                    object
Hail size               float64
report_type              object
Fujita scale            float64
dtype: object


  df_hail = pd.read_csv(hail_file, delimiter=',', engine='python', parse_dates=['UTC'], infer_datetime_format=True)
  df_hail = pd.read_csv(hail_file, delimiter=',', engine='python', parse_dates=['UTC'], infer_datetime_format=True)
  df_tor = pd.read_csv(tor_file, delimiter=',', engine='python', parse_dates=['Date/Time UTC'], infer_datetime_format=True)
  df_tor = pd.read_csv(tor_file, delimiter=',', engine='python', parse_dates=['Date/Time UTC'], infer_datetime_format=True)
