# Shear Data
This dataset is necessary for rainband lightning burst detection.

In [1]:
import os
import glob
import pandas as pd
import polars as pl
from scipy.io import loadmat
import numpy as np

We start by importing the filtered trackfile data to extract the list of unique storm codes for storms of category 1 or higher. We use this list to restrict which shear files we process, which reduces the workload.

In [2]:
# Load the filtered trackfile and keep one row per distinct (storm_code, storm_name) pair
trackfile = pd.read_csv("../data/Filtered_Reduced_Trackfile.csv")
tc_list = (
    trackfile.loc[:, ["storm_code", "storm_name"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
tc_list.head()

Unnamed: 0,storm_code,storm_name
0,ATL_20_28,Zeta
1,ATL_20_28,Twenty-Eig
2,SHEM_20_4,Sarai
3,SHEM_16_7,Victor
4,ATL_17_8,Gert


In [None]:
# Save the storm code / storm name table (without the row index) so it can be reloaded later
tc_list.to_csv(path_or_buf="filtered_tc_list.csv", index=False)

In [None]:
# After restarting the kernel, reload the storm list written by the export cell.
# The file name must match the one used in `tc_list.to_csv(...)`.
tc_list = pd.read_csv("filtered_tc_list.csv")

Using the storm codes, we look for each TC's shear file in the base path (refers to thumb drive). If the TC has a shear file, we append it to the dataframe. We drop rows with null shear values.

In [3]:
# Root folder of the WWLLN TC dataset. The default is the drive mount used in the
# original run; set the WWLLN_TC_DATA_DIR environment variable to use another location
# without editing the notebook.
base_path = os.environ.get("WWLLN_TC_DATA_DIR", "/mnt/d/WWLLN_TC_Data_2010_2020/")

# tc_list holds one row per (storm_code, storm_name) pair, so a storm recorded under
# several names repeats its code; keep each storm code once.
storm_codes = tc_list["storm_code"].drop_duplicates()

In [None]:
# Column names assigned to the `cg_IntenShear` array read from each shear .mat file
shear_columns = [
    'year', 'month', 'day', 'hour', 'min', 'second', 'lat',
    'long', 'distance_east', 'distance_north', 'category', 'intensity_change',
    'trackfile_id', 'shear_magnitude', 'shear_angle',
]

# One pandas DataFrame per storm; they are concatenated once after the loop
data_list = []
# Storm codes already handled, so a code that appears more than once is only read once
processed_storms = set()

for storm_code in storm_codes:
    if storm_code in processed_storms:
        continue
    processed_storms.add(storm_code)

    # A storm code is "<basin>_<year>_<number>"; files live in <base>/<year>/<basin>/<number>/
    basin, year, number = storm_code.split("_")
    folder_path = os.path.join(base_path, year, basin, number)
    file_pattern = os.path.join(folder_path, "*_Intensity_Shear.mat")
    # Sort so that "the first match" is deterministic if several files match
    files = sorted(glob.glob(file_pattern))

    if not files:
        print(f"{storm_code} does not have a shear file.")
        continue

    filename = files[0]
    mat_data = loadmat(filename)
    print(f"Loaded file: {filename}")

    if 'cg_IntenShear' not in mat_data:
        print("Variable not found in .mat file")
        continue

    # Build the per-storm table, drop rows containing NaN, and tag every row with its storm
    df_temp = (
        pd.DataFrame(mat_data['cg_IntenShear'], columns=shear_columns)
        .dropna()
        .assign(storm_code=storm_code)
    )
    data_list.append(df_temp)

# pd.concat raises an opaque "No objects to concatenate" on an empty list; fail with context instead
if not data_list:
    raise ValueError(f"No shear files were loaded from {base_path}; check that the data drive is mounted.")

# Concatenate all per-storm pandas DataFrames in a single pass
shear_data = pd.concat(data_list, ignore_index=True)

ATL_20_28 does not have a shear file.
ATL_20_28 does not have a shear file.
SHEM_20_4 does not have a shear file.
Loaded file: /mnt/d/WWLLN_TC_Data_2010_2020/16/SHEM/7/VictorWWLLN_Intensity_Shear.mat
Loaded file: /mnt/d/WWLLN_TC_Data_2010_2020/17/ATL/8/GertWWLLN_Intensity_Shear.mat
Loaded file: /mnt/d/WWLLN_TC_Data_2010_2020/13/WPAC/12/TramiWWLLN_Intensity_Shear.mat
Loaded file: /mnt/d/WWLLN_TC_Data_2010_2020/16/ATL/5/EarlWWLLN_Intensity_Shear.mat
Loaded file: /mnt/d/WWLLN_TC_Data_2010_2020/15/WPAC/6/NoulWWLLN_Intensity_Shear.mat
WPAC_18_15 does not have a shear file.
SHEM_19_6 does not have a shear file.
Loaded file: /mnt/d/WWLLN_TC_Data_2010_2020/10/ATL/13/KarlWWLLN_Intensity_Shear.mat
WPAC_18_22 does not have a shear file.
Loaded file: /mnt/d/WWLLN_TC_Data_2010_2020/14/ATL/7/FayWWLLN_Intensity_Shear.mat
Loaded file: /mnt/d/WWLLN_TC_Data_2010_2020/17/SHEM/15/ErnieWWLLN_Intensity_Shear.mat
Loaded file: /mnt/d/WWLLN_TC_Data_2010_2020/15/IO/4/ChapalaWWLLN_Intensity_Shear.mat
Loaded file

In [None]:
# Preview the first five rows of the combined shear table
shear_data.head(n=5)

Unnamed: 0,year,month,day,hour,min,second,lat,long,distance_east,distance_north,category,intensity_change,trackfile_id,shear_magnitude,shear_angle,storm_code
0,2016.0,1.0,14.0,3.0,0.0,6.4994,-7.2386,-170.1965,-591.885,623.735,0.0,1.0,121.0,221.0,147.0,SHEM_16_7
1,2016.0,1.0,14.0,3.0,0.0,6.6831,-7.1552,-170.2882,-602.111,633.008,0.0,1.0,121.0,221.0,147.0,SHEM_16_7
2,2016.0,1.0,14.0,3.0,0.0,6.8064,-7.2494,-170.2057,-592.885,622.534,0.0,1.0,121.0,221.0,147.0,SHEM_16_7
3,2016.0,1.0,14.0,3.0,0.0,6.9387,-7.2733,-170.1481,-586.501,619.876,0.0,1.0,121.0,221.0,147.0,SHEM_16_7
4,2016.0,1.0,14.0,3.0,0.0,6.719,-7.2224,-170.1888,-591.057,625.536,0.0,1.0,121.0,221.0,147.0,SHEM_16_7


In [6]:
# Number of rows in the combined shear table
shear_data.shape[0]

29726435

We export the unbinned shear data for future reference.

In [7]:
# Persist the full, unbinned shear table as tab-delimited text, leaving out the row index
shear_data.to_csv(path_or_buf="unbinned_shear_data.txt", sep="\t", index=False)

We can restart the kernel here and read in the data again to free up space.

In [None]:
# Reload the tab-delimited unbinned shear table as a Polars DataFrame
shear_data = pl.read_csv(source="unbinned_shear_data.txt", separator="\t")

In [4]:
# Preview the first 20 rows of the reloaded table
shear_data.head(n=20)

year,month,day,hour,min,second,lat,long,distance_east,distance_north,category,intensity_change,trackfile_id,shear_magnitude,shear_angle,storm_code
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
2016.0,1.0,14.0,3.0,0.0,6.4994,-7.2386,-170.1965,-591.885,623.735,0.0,1.0,121.0,221.0,147.0,"""SHEM_16_7"""
2016.0,1.0,14.0,3.0,0.0,6.6831,-7.1552,-170.2882,-602.111,633.008,0.0,1.0,121.0,221.0,147.0,"""SHEM_16_7"""
2016.0,1.0,14.0,3.0,0.0,6.8064,-7.2494,-170.2057,-592.885,622.534,0.0,1.0,121.0,221.0,147.0,"""SHEM_16_7"""
2016.0,1.0,14.0,3.0,0.0,6.9387,-7.2733,-170.1481,-586.501,619.876,0.0,1.0,121.0,221.0,147.0,"""SHEM_16_7"""
2016.0,1.0,14.0,3.0,0.0,6.719,-7.2224,-170.1888,-591.057,625.536,0.0,1.0,121.0,221.0,147.0,"""SHEM_16_7"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2016.0,1.0,14.0,3.0,0.0,49.8983,-5.3808,-159.2284,620.206,830.313,0.0,1.0,121.0,221.0,147.0,"""SHEM_16_7"""
2016.0,1.0,14.0,3.0,0.0,49.9636,-5.4211,-159.3323,608.664,825.831,0.0,1.0,121.0,221.0,147.0,"""SHEM_16_7"""
2016.0,1.0,14.0,3.0,1.0,10.8907,-7.305,-167.6051,-305.989,616.351,0.0,1.0,121.0,221.0,147.0,"""SHEM_16_7"""
2016.0,1.0,14.0,3.0,1.0,14.4077,-9.4506,-160.2372,503.844,377.771,0.0,1.0,121.0,221.0,147.0,"""SHEM_16_7"""


Assuming each row is a WWLLN lightning event, we can drop the lightning-related columns to free up space and deduplicate. We only want the shear data for now. The lightning events are retained in the separate lightning dataset.

In [None]:
# Each row is assumed to be one lightning event, so the per-event columns (seconds,
# position, distances, category, intensity change, trackfile id) are not needed here.
lightning_columns = [
    "second",
    "lat",
    "long",
    "distance_east",
    "distance_north",
    "category",
    "intensity_change",
    "trackfile_id",
]
shear_data = shear_data.drop(lightning_columns)

In [None]:
# Write the reduced shear table to a tab-separated text file.
# Polars writes the data columns only. The previous pandas `to_csv` call (without
# `index=False`) also wrote a row-index column whose values are all distinct, which
# makes every row unique and defeats the deduplication step that reads this file.
# Writing directly from Polars also avoids copying the whole table into pandas.
shear_data.write_csv("unbinned_shear_data_temp.txt", separator="\t")

Once again, we write the data out to a tab-separated text file so that the kernel can be restarted to free up memory. After restarting, run the import cell again. Next, we deduplicate the rows to get one unique record per minute. We'll use these to create 30-minute time bins later.

In [2]:
# Columns that identify one per-minute shear record
shear_record_columns = [
    "year", "month", "day", "hour", "min",
    "shear_magnitude", "shear_angle", "storm_code",
]

# Scan lazily so the file is not loaded into memory up front
shear_data_lazy = pl.scan_csv("unbinned_shear_data_temp.txt", separator="\t")

# Select the data columns explicitly before deduplicating: if the file was written by
# pandas with its row index, that extra column is distinct on every row and `unique()`
# would not remove anything.
shear_data_lazy_dedup = shear_data_lazy.select(shear_record_columns).unique()

# Collect the result (i.e., execute the lazy computation)
shear_data_dedup = shear_data_lazy_dedup.collect()

shear_data_dedup.head(20)

: 

In [2]:
# After a kernel restart, load the reduced shear table eagerly and retry the deduplication
shear_data = pl.read_csv(source="unbinned_shear_data_temp.txt", separator="\t")

In [3]:
# Deduplicate on the data columns only. A file written by pandas with its row index
# carries an extra column that is distinct on every row, so `unique()` over all columns
# would keep every row.
shear_data_dedup = shear_data.select(
    ["year", "month", "day", "hour", "min", "shear_magnitude", "shear_angle", "storm_code"]
).unique()

: 

Next, we bin the data into 30-minute time bins and average the shear values within each bin.

In [9]:
# Define a function to apply the 30-minute binning for each storm_code group
def add_time_bin(group):
    """Return a copy of ``group`` with a ``time_bin`` column added.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a datetime-typed ``datetime`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with ``time_bin`` holding each ``datetime`` value floored to the
        start of its 30-minute interval. The input frame is left unmodified.
    """
    # '30min' is the supported spelling of the 30-minute frequency; the older 'T'
    # alias is deprecated in recent pandas releases.
    return group.assign(time_bin=group['datetime'].dt.floor('30min'))

In [10]:
# Build a timestamp for every shear record, then average shear per storm and 30-minute bin.

# The reload cells read the data with Polars; everything below uses the pandas API.
if isinstance(shear_data, pl.DataFrame):
    shear_data = shear_data.to_pandas()

# The date/time parts are stored as floats (e.g. 2016.0), so concatenating them as
# strings yields text such as "2016.0-1.0-14.0 3.0:0.0:..." that cannot be parsed.
# Cast the parts to integers and let pandas assemble the timestamp from named
# component columns instead ('min' must be called 'minute' for that).
datetime_parts = (
    shear_data[['year', 'month', 'day', 'hour', 'min']]
    .astype('int64')
    .rename(columns={'min': 'minute'})
)

# `second` is dropped by an earlier cell, so it may be absent; 30-minute bins do not
# depend on it, so fall back to 0. When present, keep the rule that 60 is mapped to 0.
if 'second' in shear_data.columns:
    shear_data['second'] = shear_data['second'].where(shear_data['second'] != 60, 0)
    seconds = shear_data['second']
else:
    seconds = pd.Series(0, index=shear_data.index)

# Create a datetime column
shear_data['datetime'] = pd.to_datetime(datetime_parts) + pd.to_timedelta(seconds, unit='s')

# Flooring to 30 minutes is row-wise, so no per-storm groupby/apply is needed. That
# also avoids `storm_code` ending up as both an index level and a column, which makes
# the grouping below ambiguous.
shear_data = add_time_bin(shear_data)

# Average shear per storm and 30-minute bin.
# NOTE(review): `shear_angle` is averaged arithmetically; if the angle can wrap around
# (e.g. 350 and 10 degrees in the same bin) a circular mean would be needed - confirm.
shear_data_binned = (
    shear_data
    .groupby(['storm_code', 'time_bin'])[['shear_magnitude', 'shear_angle']]
    .mean()
    .reset_index()
    .sort_values(by=['storm_code', 'time_bin'])
)
shear_data_binned.head()

: 