## NND declustering technique

https://github.com/florentaden/nnd_declustering/tree/master

**Data formatting - transforming the columns for running the nnd algo**

This code will transform the CSV data to correct any overflow hours and calculate a continuous _time_ column.

Continuous time in the context of the _time_ column means that the time values represent a continuous sequence starting from 0 for the first event and increasing in a uniform manner. This is typically measured in seconds, but it could be in other units such as minutes, hours, or days, depending on the context. The key point is that it represents the elapsed time from the first event to subsequent events, creating a continuous timeline.

To clarify, here is how the continuous _time_ column is calculated and what it represents:

**1. Start Time**: The first event in the dataset is assigned a time of 0.

**2. Elapsed Time**: Each subsequent event's time is calculated as the difference in seconds (or another unit) from the first event's time.

**3. Uniform Increments**: The _time_ values increase uniformly, providing a continuous measure of time elapsed since the first event

This allows for easy comparison and analysis of events in a temporal sequence.

In [5]:
import pandas as pd

# Load the CSV data
csv_file_path = '/Users/zuhair/Library/CloudStorage/OneDrive-NanyangTechnologicalUniversity/Research/current-projects/strain-rate-seismicity/data/Cleaned_ISC_Data.csv'
csv_data = pd.read_csv(csv_file_path)

# Correct overflow hours by converting them into additional days using pd.to_datetime
def correct_datetime(row):
    # Combine date and time parts
    datetime_str = f"{int(row['year'])}-{int(row['month'])}-{int(row['day'])} {int(row['hour'])}:{int(row['minute'])}:{row['seconde']}"
    try:
        corrected_datetime = pd.to_datetime(datetime_str, errors='coerce')
    except ValueError:
        corrected_datetime = pd.NaT  # Handle invalid dates
    
    return corrected_datetime

# Extract year, month, day from the DATE column
csv_data['DATE'] = pd.to_datetime(csv_data['DATE'])
csv_data['year'] = csv_data['DATE'].dt.year
csv_data['month'] = csv_data['DATE'].dt.month
csv_data['day'] = csv_data['DATE'].dt.day

# Extract hour, minute, and second from the TIME column
time_parts = csv_data['TIME'].str.split(':', expand=True)
csv_data['hour'] = time_parts[0].astype(float)
csv_data['minute'] = time_parts[1].astype(float)
csv_data['seconde'] = time_parts[2].astype(float)

# Apply the correction function
csv_data['corrected_datetime'] = csv_data.apply(correct_datetime, axis=1)

# Drop rows with invalid dates
csv_data = csv_data.dropna(subset=['corrected_datetime'])

# Calculate the elapsed time in seconds from the first event
csv_data['time'] = (csv_data['corrected_datetime'] - csv_data['corrected_datetime'].min()).dt.total_seconds()

# Extract year, month, day, hour, minute, and second from the corrected datetime
csv_data['year'] = csv_data['corrected_datetime'].dt.year
csv_data['month'] = csv_data['corrected_datetime'].dt.month
csv_data['day'] = csv_data['corrected_datetime'].dt.day
csv_data['hour'] = csv_data['corrected_datetime'].dt.hour
csv_data['minute'] = csv_data['corrected_datetime'].dt.minute
csv_data['seconde'] = csv_data['corrected_datetime'].dt.second + csv_data['corrected_datetime'].dt.microsecond / 1e6

# Rename columns to match the Southern California Earthquake Data structure
csv_data.rename(columns={
    'LAT': 'latitude',
    'LON': 'longitude',
    'DEPTH': 'depth',
    'MAG': 'magnitude'
}, inplace=True)

# Select and reorder the columns to match the target structure
target_columns_with_corrected_time = ['year', 'month', 'day', 'hour', 'minute', 'seconde', 'time', 'latitude', 'longitude', 'depth', 'magnitude']
transformed_csv_data_with_corrected_time = csv_data[target_columns_with_corrected_time]

# Display the updated DataFrame to the user
# import ace_tools as tools; tools.display_dataframe_to_user(name="Corrected CSV Data with Continuous Time Column", dataframe=transformed_csv_data_with_corrected_time)

transformed_csv_data_with_corrected_time.head()

  csv_data['DATE'] = pd.to_datetime(csv_data['DATE'])


Unnamed: 0,year,month,day,hour,minute,seconde,time,latitude,longitude,depth,magnitude
1,2006,8,31,14,57,30.0,873127400.0,27.0,97.0,100.0,7.0
6,2014,3,28,10,44,48.0,1112095000.0,25.0,99.0,100.0,6.9
12,2026,10,5,8,19,10.0,1507280000.0,26.0,97.0,80.0,6.2
13,2026,5,29,22,37,32.0,1496185000.0,15.0,92.0,35.0,5.6
14,2027,3,15,16,56,32.0,1521221000.0,24.5,95.0,130.0,6.5


In [7]:
# Save the cleaned DataFrame to a new CSV file
output_file_path = '/Users/zuhair/Library/CloudStorage/OneDrive-NanyangTechnologicalUniversity/Research/current-projects/strain-rate-seismicity/data/transformed_cleaned_data.csv'

transformed_csv_data_with_corrected_time.to_csv(output_file_path, index=False)

print(f"Cleaned data saved to {output_file_path}")

Cleaned data saved to /Users/zuhair/Library/CloudStorage/OneDrive-NanyangTechnologicalUniversity/Research/current-projects/strain-rate-seismicity/data/transformed_cleaned_data.csv


In [13]:
import pandas as pd

# Load the transformed data
file_path = '/Users/zuhair/Library/CloudStorage/OneDrive-NanyangTechnologicalUniversity/Research/current-projects/strain-rate-seismicity/data/transformed_cleaned_data.csv'
transformed_data = pd.read_csv(file_path)

In [None]:
import numpy as np
from mpi4py import MPI
import pandas as pd

# Initialize MPI
doublesize = MPI.DOUBLE.Get_size()

# All CPUs communicator
ALL_COMM = MPI.COMM_WORLD
rank, size = ALL_COMM.Get_rank(), ALL_COMM.Get_size()

# All CPUs inside node communicator
NODE_COMM = ALL_COMM.Split_type(MPI.COMM_TYPE_SHARED)
node_rank, node_size = NODE_COMM.rank, NODE_COMM.size

# Get one relay node per node
value = (rank, node_rank)
values = np.array(ALL_COMM.allgather(value))
relay_ranks = values[values[:, 1] == 0, 0]
group = ALL_COMM.group.Incl(relay_ranks)
RELAY_COMM = ALL_COMM.Create_group(group)

if rank in relay_ranks:
    boss_rank, boss_size = RELAY_COMM.rank, RELAY_COMM.size

# Parameters
b_value = 1
fractal_dimension = 1.6
q = 0.5

# Catalog name
catalog_name = 'transformed_cleaned_data'
output_name = catalog_name + '_nnd'

# Read catalog
catalog_df = transformed_data
catalog_df = catalog_df[catalog_df.magnitude >= 2]
catalog_df = catalog_df.reindex(['year', 'month', 'day', 'hour', 'minute',
    'seconde', 'time', 'latitude', 'longitude', 'depth', 'magnitude'], axis=1)
catalog = np.array(catalog_df)

# Apply declustering and give the children dataframe
children_df = catalog_df[1:]
number_of_children, number_of_columns = children_df.shape

# Shared children array (output)
number_of_points = number_of_children * (number_of_columns + 5)
nbytes = doublesize * number_of_points
win = MPI.Win.Allocate_shared(nbytes if node_rank == 0 else 0, doublesize,
    comm=NODE_COMM)
buf, itemsize = win.Shared_query(0)
assert itemsize == MPI.DOUBLE.Get_size()
buf = np.array(buf, dtype='B', copy=False)
children = np.ndarray(buffer=buf, dtype='d', shape=(number_of_children,
    number_of_columns+5))

if node_rank == 0:
    children.fill(0)
    children[:, :number_of_columns] = np.array(children_df)

ALL_COMM.Barrier()

for j, child in list(enumerate(children))[rank::size]:  # for each event
    # Potential parents
    k = (catalog[:, 6] < child[6])
    parents = np.zeros((k.sum(), number_of_columns+5))
    parents[:, :number_of_columns] = catalog[k]

    # Compute temporal distance with all parents
    parents[:, 11] = child[6] - parents[:, 6]
    parents[:, 11] = parents[:, 11]*10**(-q*b_value*parents[:, 10])

    # Compute physical distance with all parents
    parents[:, 12] = np.sqrt((child[8]-parents[:, 8])**2 + \
                 (child[7]-parents[:, 7])**2)
    parents[:, 12] = (parents[:, 12]**fractal_dimension)*10**(
        (q-1)*b_value*parents[:, 10])

    # Compute nearest_neighbor_distance metric with all parents
    parents[:, 13] = parents[:, 11]*parents[:, 12]

    nearest_neighbor = np.argmin(parents[:, 13])
    children[j, 12] = parents[nearest_neighbor, 12]
    children[j, 11] = parents[nearest_neighbor, 11]
    children[j, 13] = parents[nearest_neighbor, 13]
    children[j, 14] = parents[nearest_neighbor, 10]
    children[j, 15] = nearest_neighbor

ALL_COMM.Barrier()

if rank in relay_ranks:
    if rank == 0:
        target = np.zeros_like(children)
    else:
        target = None

    RELAY_COMM.Reduce([children, MPI.DOUBLE], [target, MPI.DOUBLE], op=MPI.SUM,
        root=0)

if rank == 0:
    columns = ['year', 'month', 'day', 'hour', 'minute', 'seconde', 'time',
              'latitude', 'longitude', 'depth', 'magnitude',
              'Tij', 'Rij', 'Nij', 'parent_magnitude', 'neighbor']
    children_df = pd.DataFrame(target, columns=columns)
    children_df.to_hdf(output_name + '.h5', 'table')