In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams["figure.dpi"] = 120
import datetime as dt
import ftplib
import requests
import os
import re

# The file types and their meaning are described in the docs:
https://www.ncdc.noaa.gov/swdiws/csv/nx3tvs and https://www.ncdc.noaa.gov/swdiws

1) Hail: events where hail has been recorded
2) Meso and MDA: legacy mesocyclone and Mesocyclone Detection Algorithm. Legacy stops at 2010, and MDA doesn't start until 2007. Maybe use Meso until 2010 and continue with MDA after
3) TVS: Tornadic Vortex Signature Overlay- A product which shows an intense gate to gate azimuthal shear associated with tornadic-scale rotation.
4) Structure: Storm Cell Structure Information
5) nldn: Lightning-strike data. Each row gives the count of lightning strikes within 0.1° in latitude and longitude of a given location.
All the "tiles" files are lightning-strike data. Open question: can we tell what type of event provoked each strike?

# Connect and get all the file names

In [2]:
# Open an anonymous FTP session only for as long as it takes to list the
# archive directory. The context manager closes the connection when the block
# exits (even if a command fails), so no socket stays open for the rest of
# the kernel session.
with ftplib.FTP('ftp.ncdc.noaa.gov', timeout=30) as ftp:  # host name only, without protocol
    ftp.login()  # anonymous login; pass credentials if anonymous access is not allowed

    # switch to the directory containing the data
    ftp.cwd('/pub/data/swdi/database-csv/v2/')

    # get the list of files in this ftp dir
    all_files = ftp.nlst()

# Base URL that download_file() appends a file name to when fetching over HTTPS.
httpurl = 'https://www.ncei.noaa.gov/pub/data/swdi/database-csv/v2/'

## Selecting all tornado files

In [3]:
# Keep only the names that start with "tvs-<four digits>." and list them in
# sorted order.
# NOTE: despite its name, hail_files holds the TVS (tornado) file names.
pattern = r"tvs-[0-9]{4}\."
hail_files = sorted(name for name in all_files if re.match(pattern, name))
print(hail_files)

['tvs-1995.csv.gz', 'tvs-1996.csv.gz', 'tvs-1997.csv.gz', 'tvs-1998.csv.gz', 'tvs-1999.csv.gz', 'tvs-2000.csv.gz', 'tvs-2001.csv.gz', 'tvs-2002.csv.gz', 'tvs-2003.csv.gz', 'tvs-2004.csv.gz', 'tvs-2005.csv.gz', 'tvs-2006.csv.gz', 'tvs-2007.csv.gz', 'tvs-2008.csv.gz', 'tvs-2009.csv.gz', 'tvs-2010.csv.gz', 'tvs-2011.csv.gz', 'tvs-2012.csv.gz', 'tvs-2013.csv.gz', 'tvs-2014.csv.gz', 'tvs-2015.csv.gz', 'tvs-2016.csv.gz', 'tvs-2017.csv.gz', 'tvs-2018.csv.gz', 'tvs-2019.csv.gz', 'tvs-2020.csv.gz', 'tvs-2021.csv.gz', 'tvs-2022.csv.gz', 'tvs-2023.csv.gz']


Create a dictionary mapping each event type to the name used for it in the file names, then create one directory per event type to store the downloaded files.

In [4]:
# Maps each local folder name (key) to the prefix used for that event type in
# the remote file names (value).
event_types = {'hail':'hail', 'storm_structure':'structure', 'tornados':'tvs', 'lightning':'nldn-tiles', 'mesocyclone':'mda'}

# One sub-folder per event type. makedirs(..., exist_ok=True) also creates the
# parent ../weather_data folder when it is missing and does nothing for
# folders that already exist, so this cell works on a fresh checkout and can
# be re-run safely.
for event in event_types:
    path = '../weather_data/'+event
    os.makedirs(path, exist_ok=True)

### Given a year and event type, download and save the file

In [9]:
def download_file(year, event_type, all_files, httpurl):
    """Download one yearly archive into ``../weather_data/<event_type>/``.

    Parameters
    ----------
    year : int or str
        Year to look for in the remote file names.
    event_type : str
        Key of the module-level ``event_types`` dict (e.g. ``'tornados'``).
        It selects both the file-name prefix and the destination folder.
    all_files : list of str
        Names of the files available on the server.
    httpurl : str
        Base URL; the selected file name is appended to it.

    Returns
    -------
    None
        Progress and failures are reported with ``print``. Nothing is written
        when no file matches, when the file is already on disk, or when the
        server answers with an error status.

    Raises
    ------
    KeyError
        If ``event_type`` is not a key of ``event_types``.
    requests.exceptions.RequestException
        If the request itself cannot be completed (connection error, timeout).
    """
    event_name = event_types[event_type]
    pattern = event_name+"-"+str(year)
    matches = [fname for fname in all_files if pattern in fname]
    if not matches:
        print("No file in that year for that event type")
        return
    # When several names contain the pattern, the first one listed is used.
    file_name = matches[0]
    print("Considering file ", file_name)

    target = '../weather_data/{}/{}'.format(event_type, file_name)
    if os.path.exists(target):
        print("file already exists")
        return

    query_parameters = {"downloadformat": "csv"}
    print("Getting the response from the URL .....")
    # The timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(httpurl+file_name, params=query_parameters, timeout=60)
    if not response.ok:
        # Do not write an error body to disk: the saved file would look like a
        # finished download and every later call would stop at
        # "file already exists".
        print("Download failed with HTTP status", response.status_code)
        return
    print("Downloaded succesfully")
    with open(target, "wb") as f:
        f.write(response.content)
    print('Saved in folder')

In [8]:
# Fetch the 2023 TVS (tornado) archive; download_file returns early when the
# file is already on disk.
download_file(year=2023, event_type='tornados', all_files=all_files, httpurl=httpurl)

Considering file  tvs-2023.csv.gz
file already exists


### Short function that reads the CSV file for a given year and event type

In [11]:
def read_weather(year, event_type):
    """Load the locally stored CSV of one event type for one year.

    The folder ``../weather_data/<event_type>`` is searched for file names
    that contain ``year``.

    Parameters
    ----------
    year : int or str
        Year that must appear in the file name.
    event_type : str
        Name of the sub-folder to search (e.g. ``'tornados'``).

    Returns
    -------
    pandas.DataFrame or str
        The parsed file when exactly one name matches. When no name or more
        than one name matches, a message string is returned instead (no
        exception is raised). The first two lines of the file are skipped,
        and the ``#ZDAY`` column (for ``'lightning'``) or the ``#ZTIME``
        column (for every other event type) is parsed as dates.
    """
    folder = '../weather_data/' + event_type
    year_text = str(year)
    matches = [entry for entry in os.listdir(folder) if year_text in entry]

    if not matches:
        return "No file in that year for that event type"
    if len(matches) > 1:
        return "Multiple files with that year in their name"

    # Lightning files carry their timestamp in #ZDAY, all others in #ZTIME.
    date_column = '#ZDAY' if event_type == 'lightning' else '#ZTIME'
    return pd.read_csv(folder + '/' + matches[0], skiprows=2,
                       parse_dates=[date_column])

In [93]:
# Load the 2023 TVS file and preview its first rows.
tornados = read_weather(year=2023, event_type='tornados')
tornados.head()

Unnamed: 0,#ZTIME,LON,LAT,WSR_ID,CELL_ID,CELL_TYPE,RANGE,AZIMUTH,AVGDV,LLDV,MXDV,MXDV_HEIGHT,DEPTH,BASE,TOP,MAX_SHEAR,MAX_SHEAR_HEIGHT
0,2023-01-01 00:16:48,-120.53788,36.36179,KHNX,U6,TVS,44,274,39,30,86,9,6.9,5.3,12.2,33,9.2
1,2023-01-01 00:40:51,-119.87524,38.91239,KRGX,D2,TVS,54,201,31,51,51,7,8.5,6.9,15.4,15,6.9
2,2023-01-01 03:23:28,-119.97364,34.86271,KVBX,G7,TVS,21,86,41,63,84,2,5.8,1.5,7.3,65,2.3
3,2023-01-01 04:59:07,-117.62836,34.29869,KEYX,K3,TVS,48,184,35,62,62,6,8.3,6.4,14.7,21,6.4
4,2023-01-01 07:27:29,-117.82177,34.34357,KEYX,G8,TVS,47,196,31,50,50,4,9.1,4.2,13.2,17,4.2


### Get the date, and then group by date and place

In [94]:
# assign() returns a new frame, so the raw `tornados` frame is left untouched.
# DATE is the calendar date taken from the #ZTIME timestamp.
tor_copy = tornados.assign(DATE=tornados['#ZTIME'].dt.date)
# One group per (DATE, WSR_ID, CELL_ID) combination.
groups = tor_copy.groupby(by=['DATE', 'WSR_ID', 'CELL_ID'])


### Aggregate over max of weather properties and min/max/mean of coordinates

In [96]:
# Aggregation spec: mean/min/max for the coordinates, and the per-group
# maximum for every measurement column listed below.
agg_spec = {'LAT': ['mean', 'min', 'max'], 'LON': ['mean', 'min', 'max']}
agg_spec.update({
    measure: 'max'
    for measure in ['AVGDV', 'LLDV', 'MXDV', 'MXDV_HEIGHT', 'DEPTH',
                    'MAX_SHEAR', 'MAX_SHEAR_HEIGHT']
})
tor_agg = groups.agg(agg_spec)
# Flatten the two-level column labels into "<column>_<statistic>" names.
tor_agg.columns = ['_'.join(pair) for pair in tor_agg.columns.to_flat_index()]
tor_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LAT_mean,LAT_min,LAT_max,LON_mean,LON_min,LON_max,AVGDV_max,LLDV_max,MXDV_max,MXDV_HEIGHT_max,DEPTH_max,MAX_SHEAR_max,MAX_SHEAR_HEIGHT_max
DATE,WSR_ID,CELL_ID,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-01-01,KEMX,I7,32.39305,32.39305,32.39305,-110.68147,-110.68147,-110.68147,36,49,56,7,6.5,29,6.6
2023-01-01,KEMX,L7,32.18141,32.18141,32.18141,-110.52664,-110.52664,-110.52664,52,96,96,2,5.9,86,1.8
2023-01-01,KEYX,G8,34.34357,34.34357,34.34357,-117.82177,-117.82177,-117.82177,31,50,50,4,9.1,17,4.2
2023-01-01,KEYX,K3,34.29869,34.29869,34.29869,-117.62836,-117.62836,-117.62836,35,62,62,6,8.3,21,6.4
2023-01-01,KEYX,S2,35.0269,35.0269,35.0269,-118.24596,-118.24596,-118.24596,39,52,52,4,5.0,24,2.6


In [97]:
# Turn the group keys (DATE, WSR_ID, CELL_ID) from index levels into ordinary
# columns. The guard makes the cell idempotent: once the keys are columns the
# index is no longer a MultiIndex, so re-running the cell does not insert a
# spurious "index" column.
if isinstance(tor_agg.index, pd.MultiIndex):
    tor_agg = tor_agg.reset_index()
tor_agg.head()

Unnamed: 0,DATE,WSR_ID,CELL_ID,LAT_mean,LAT_min,LAT_max,LON_mean,LON_min,LON_max,AVGDV_max,LLDV_max,MXDV_max,MXDV_HEIGHT_max,DEPTH_max,MAX_SHEAR_max,MAX_SHEAR_HEIGHT_max
0,2023-01-01,KEMX,I7,32.39305,32.39305,32.39305,-110.68147,-110.68147,-110.68147,36,49,56,7,6.5,29,6.6
1,2023-01-01,KEMX,L7,32.18141,32.18141,32.18141,-110.52664,-110.52664,-110.52664,52,96,96,2,5.9,86,1.8
2,2023-01-01,KEYX,G8,34.34357,34.34357,34.34357,-117.82177,-117.82177,-117.82177,31,50,50,4,9.1,17,4.2
3,2023-01-01,KEYX,K3,34.29869,34.29869,34.29869,-117.62836,-117.62836,-117.62836,35,62,62,6,8.3,21,6.4
4,2023-01-01,KEYX,S2,35.0269,35.0269,35.0269,-118.24596,-118.24596,-118.24596,39,52,52,4,5.0,24,2.6


In [117]:
# Make sure the output folder exists first: none of the visible cells creates
# ../weather_data/cleaned, and to_csv does not create missing directories.
os.makedirs('../weather_data/cleaned', exist_ok=True)
# The default index=True is kept on purpose so the file layout is unchanged:
# the row index is written as an unnamed first column.
tor_agg.to_csv('../weather_data/cleaned/tvs-2023.csv')