In [3]:
import numpy as np
import pandas as pd
from scipy.fft import fft, fftfreq
from obspy.signal.trigger import classic_sta_lta, trigger_onset
from obspy import read
from datetime import datetime
from scipy.signal import find_peaks

# Load the miniSEED file
def load_mseed(file_path):
    st = read(file_path)
    tr = st.traces[0].copy()
    return tr

# Apply a bandpass filter to the trace
def apply_bandpass_filter(trace, min_freq, max_freq):
    tr_filt = trace.copy()
    tr_filt.filter('bandpass', freqmin=min_freq, freqmax=max_freq)
    return tr_filt

# Get the arrival time from the trace
def get_arrival(tr, time):
    starttime = tr.stats.starttime.datetime
    return (time - starttime).total_seconds()

# Function to compute STA/LTA with time
def compute_sta_lta_with_time(trace, sta_len, lta_len):
    sampling_rate = trace.stats.sampling_rate
    cft = classic_sta_lta(trace.data, int(sta_len * sampling_rate), int(lta_len * sampling_rate))
    
    max_sta_lta = np.max(cft)
    max_sta_lta_idx = np.argmax(cft)
    
    # Return the time in relative seconds
    max_sta_lta_time = max_sta_lta_idx / sampling_rate
    
    return cft, max_sta_lta, max_sta_lta_time

# Function to detect STA/LTA events
def detect_sta_lta_events(trace, sta_len, lta_len):
    thr_on = 2.0
    thr_off = 1.0
    cft, _, _ = compute_sta_lta_with_time(trace, sta_len, lta_len)
    on_off = trigger_onset(cft, thr_on, thr_off)

    if len(on_off) > 0:
        start_idx, end_idx = on_off[0]
        start_time = start_idx / trace.stats.sampling_rate
        end_time = end_idx / trace.stats.sampling_rate
        return start_time, end_time
    return None, None

# Split trace into 120-second batches
def split_into_batches(trace, batch_duration_sec=120):
    sampling_rate = trace.stats.sampling_rate
    batch_size = int(batch_duration_sec * sampling_rate)
    total_length = len(trace.data)

    # Create a list of batches
    batches = []
    for i in range(0, total_length, batch_size):
        batch_data = trace.data[i:i + batch_size]
        if len(batch_data) == batch_size:
            batch_trace = trace.copy()
            batch_trace.data = batch_data
            batch_trace.stats.starttime += i / sampling_rate
            batches.append(batch_trace)
    return batches

# Function to detect Zero-Crossing Rate (ZCR) threshold events in batches
def detect_zcr_threshold_in_batches(trace, zcr_threshold):
    batches = split_into_batches(trace)
    for batch in batches:
        zcr_exceed_time = detect_zcr_threshold_events(batch, zcr_threshold)
        if zcr_exceed_time is not None:
            return (batch.stats.starttime - trace.stats.starttime) + zcr_exceed_time
    return None

# Function to detect ZCR threshold events (individual batch)
def detect_zcr_threshold_events(trace, zcr_threshold):
    data = trace.data
    sampling_rate = trace.stats.sampling_rate
    zero_crossings = np.where(np.diff(np.sign(data)))[0]
    zcr = len(zero_crossings) / len(data)

    if zcr > zcr_threshold and len(zero_crossings) > 0:
        zcr_exceed_time = zero_crossings[0] / sampling_rate
        return zcr_exceed_time
    return None

# Function to detect when the cumulative energy exceeds a threshold in batches
def detect_energy_threshold_in_batches(trace, energy_threshold):
    batches = split_into_batches(trace)
    for batch in batches:
        energy_exceed_time = detect_energy_threshold_events(batch, energy_threshold)
        if energy_exceed_time is not None:
            return (batch.stats.starttime - trace.stats.starttime) + energy_exceed_time
    return None

# Function to detect energy threshold events (individual batch)
def detect_energy_threshold_events(trace, energy_threshold):
    data = trace.data
    cumulative_energy = np.cumsum(data**2) / np.max(np.cumsum(data**2))

    exceed_indices = np.where(cumulative_energy > energy_threshold)[0]
    
    if len(exceed_indices) > 0:
        energy_exceed_time = exceed_indices[0] / trace.stats.sampling_rate
        return energy_exceed_time
    return None

# Function to detect amplitude spikes in batches
def detect_amplitude_spikes_in_batches(trace, amp_threshold):
    batches = split_into_batches(trace)
    for batch in batches:
        spike_time = detect_amplitude_spikes(batch, amp_threshold)
        if spike_time is not None:
            return (batch.stats.starttime - trace.stats.starttime) + spike_time
    return None

# Function to detect amplitude spikes (individual batch)
def detect_amplitude_spikes(trace, amp_threshold):
    data = trace.data
    sampling_rate = trace.stats.sampling_rate
    spike_indices = np.where(np.abs(data) > amp_threshold)[0]

    if len(spike_indices) > 0:
        spike_time = spike_indices[0] / sampling_rate
        return spike_time
    return None

def highest_average_amplitude(mseed_file, batch_duration_sec=180):
    st = read(mseed_file)
    trace = st.traces[0].copy()
    sampling_rate = trace.stats.sampling_rate
    batch_size = int(batch_duration_sec * sampling_rate)
    total_length = len(trace.data)

    max_average_amplitude = None
    time_of_max_amplitude = None

    for i in range(0, total_length, batch_size):
        batch_data = trace.data[i:i + batch_size]
        if len(batch_data) == batch_size:
            avg_amplitude = np.mean(np.abs(batch_data))
            if max_average_amplitude is None or avg_amplitude > max_average_amplitude:
                max_average_amplitude = avg_amplitude
                time_of_max_amplitude = i / sampling_rate
    return max_average_amplitude, time_of_max_amplitude

# Function to extract STA/LTA features with time
def extract_sta_lta_features_with_time(trace, sta_len, lta_len):
    cft, max_sta_lta, max_sta_lta_time = compute_sta_lta_with_time(trace, sta_len, lta_len)
    mean_sta_lta = np.mean(cft)
    var_sta_lta = np.var(cft)
    return max_sta_lta, max_sta_lta_time, mean_sta_lta, var_sta_lta

# Function to extract amplitude features with min and max times
def extract_time_domain_features_with_max_min_time(trace):
    data = trace.data
    sampling_rate = trace.stats.sampling_rate
    mean_amp = np.mean(data)
    rms_amp = np.sqrt(np.mean(data**2))
    sorted_amplitudes = np.argsort(data)
    max_amp_idx = sorted_amplitudes[-1]
    second_max_amp_idx = sorted_amplitudes[-2]
    max_amp = data[max_amp_idx]
    second_max_amp = data[second_max_amp_idx]
    min_amp = np.min(data)
    min_amp_idx = np.argmin(data)
    max_amp_time = max_amp_idx / sampling_rate
    second_max_amp_time = second_max_amp_idx / sampling_rate
    min_amp_time = min_amp_idx / sampling_rate
    return mean_amp, max_amp, max_amp_time, second_max_amp, second_max_amp_time, min_amp, min_amp_time, rms_amp



# Function to extract event duration features
def extract_event_duration(trace, sta_len, lta_len):
    thr_on = 4.0
    thr_off = 1.5
    cft = compute_sta_lta_with_time(trace, sta_len, lta_len)[0]
    on_off = trigger_onset(cft, thr_on, thr_off)
    if len(on_off) > 0:
        event_durations = [(end - start) / trace.stats.sampling_rate for start, end in on_off]
        return np.mean(event_durations) if event_durations else 0
    return 0

# Main function to extract features from the catalog
def extract_features_for_catalog(cat_file_path, data_directory):
    
    features_list = []
    
    try:
        filename = data_directory
        mseed_file = f'{cat_file_path}.mseed'
            
        if load_mseed(mseed_file):
            tr = load_mseed(mseed_file)
            tr_filt = apply_bandpass_filter(tr, 0.5, 1.0)
            max_sta_lta, max_sta_lta_time, mean_sta_lta, var_sta_lta = extract_sta_lta_features_with_time(tr_filt, 0.5, 10)
            mean_amp, max_amp, max_amp_time, second_max_amp, second_max_amp_time, min_amp, min_amp_time, rms_amp = extract_time_domain_features_with_max_min_time(tr)

            zcr_threshold_time = detect_zcr_threshold_in_batches(tr_filt, zcr_threshold=0.1)
            energy_threshold_time = detect_energy_threshold_in_batches(tr_filt, energy_threshold=0.7)
            amp_spike_time = detect_amplitude_spikes_in_batches(tr_filt, amp_threshold=0.4)
            highest_avg_amplitude1, time_of_occurrence1 = highest_average_amplitude(mseed_file, batch_duration_sec=200)
            highest_avg_amplitude2, time_of_occurrence2 = highest_average_amplitude(mseed_file, batch_duration_sec=475)


                # STA/LTA event detection
            sta_lta_start_time, sta_lta_end_time = detect_sta_lta_events(tr_filt, sta_len=200, lta_len=475)


            features_list.append({
                'filename': filename,
                'max_sta_lta': max_sta_lta,
                'max_sta_lta_time': max_sta_lta_time,  # In relative seconds
                'mean_sta_lta': mean_sta_lta,
                'var_sta_lta': var_sta_lta,
                'mean_amp': mean_amp,
                'max_amp': max_amp,
                'max_amp_time': max_amp_time,  # In relative seconds
                'second_max_amp': second_max_amp,
                'second_max_amp_time': second_max_amp_time,  # In relative seconds
                'min_amp': min_amp,
                'min_amp_time': min_amp_time,  # In relative seconds
                'rms_amp': rms_amp,
                'zcr_threshold_time': zcr_threshold_time,  # In relative seconds
                'energy_threshold_time': energy_threshold_time,  # In relative seconds
                'amp_spike_time': amp_spike_time,  # In relative seconds
                'highest_avg_amplitude1': highest_avg_amplitude1,
                'time_of_occurrence1': time_of_occurrence1,  # In relative seconds
                'highest_avg_amplitude2': highest_avg_amplitude2,
                'time_of_occurrence2': time_of_occurrence2,  # In relative seconds
                'sta_lta_start_time': sta_lta_start_time,  # In relative seconds
                'sta_lta_end_time': sta_lta_end_time  # In relative seconds
            })
                
    except Exception as e:
        print(f'Error processing row {i}: {e}')
    
    return pd.DataFrame(features_list)


In [None]:
import os
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the initial dataset and preprocess
df = pd.read_csv("Features.csv")
df = df.dropna(axis=1)
features_df_clean = df.drop(columns=['arrival','zcr_threshold_time', 'sta_lta_start_time'])
scaler = StandardScaler()
features_df_scaled = scaler.fit_transform(features_df_clean)
X = features_df_scaled
y_absolute = df['arrival']  
X_train_abs, X_test_abs, y_train_abs, y_test_abs = train_test_split(X, y_absolute, test_size=0.2, random_state=42)

# Define the path to the data directory
data_dir = r'C:\Users\akshi\Machine Learning Projects\Space Apps Challenge\data\lunar\test\data'

# Initialize an empty list to store the concatenated DataFrame
all_features = []

# Recursively find all .mseed files in the subdirectories
for root, dirs, files in os.walk(data_dir):
    for file in files:
        if file.endswith('.mseed'):
            # Get the full filepath without the .mseed extension
            filepath_without_extension = os.path.join(root, os.path.splitext(file)[0])
            
            # Extract features for the current .mseed file
            features_df = extract_features_for_catalog(filepath_without_extension, os.path.splitext(file)[0])
            
            # Append the features for the current file to the main list
            all_features.append(features_df)

# Concatenate all extracted feature data into a single DataFrame
final_dataset = pd.concat(all_features, ignore_index=True)

# Prepare the input features for prediction by removing columns not used for prediction
X_test = final_dataset.drop(columns=['filename', 'zcr_threshold_time', 'sta_lta_start_time']).dropna(axis=1).values

# Train the KNN model and make predictions using the dataset
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train_abs, y_train_abs)  # Ensure X_train_abs and y_train_abs are defined elsewhere in your script
predictions = knn_model.predict(X_test)

# Store the results with filename and the predicted time_rel values
results = []
for i, pred in enumerate(predictions):
    results.append({
        'filename': final_dataset['filename'].iloc[i],  # Use the corresponding filename
        'time_rel': pred
    })

# Create a DataFrame from the results
output_df = pd.DataFrame(results)

In [56]:
final_dataset

Unnamed: 0,filename,max_sta_lta,max_sta_lta_time,mean_sta_lta,var_sta_lta,mean_amp,max_amp,max_amp_time,second_max_amp,second_max_amp_time,...,rms_amp,zcr_threshold_time,energy_threshold_time,amp_spike_time,highest_avg_amplitude1,time_of_occurrence1,highest_avg_amplitude2,time_of_occurrence2,sta_lta_start_time,sta_lta_end_time
0,xa.s12.00.mhz.1969-12-16HR00_evid00006,21.607633,34359.849057,1.302176,2.751604,-5.366267e-13,3.502370e-09,1925.735849,3.158105e-09,1873.660377,...,2.224150e-10,0.000000,109.283019,,9.383250e-10,1800.0,7.524711e-10,1899.471698,474.716981,1002.716981
1,xa.s12.00.mhz.1970-01-09HR00_evid00007,21.994447,37582.037736,1.231040,2.836403,1.038720e-12,3.844195e-09,77571.018868,3.534276e-09,77569.962264,...,2.554970e-10,0.603774,113.056604,,6.422508e-10,75600.0,5.591219e-10,7597.886792,474.716981,1765.283019
2,xa.s12.00.mhz.1970-02-07HR00_evid00014,21.069734,53580.377358,1.110658,1.595738,-2.010854e-12,1.111206e-08,68167.094340,1.102741e-08,68167.245283,...,3.669408e-10,0.603774,108.528302,,1.979431e-09,68000.0,9.618759e-10,67906.113208,474.716981,1393.207547
3,xa.s12.00.mhz.1970-02-18HR00_evid00016,21.442240,23699.018868,1.177809,2.341165,6.435200e-13,7.313207e-09,52925.283019,7.278236e-09,52944.603774,...,4.277362e-10,0.603774,108.679245,,1.529351e-09,44600.0,1.492483e-09,44637.584906,474.716981,1214.943396
4,xa.s12.00.mhz.1970-03-14HR00_evid00018,20.801690,74701.584906,1.191739,2.341085,1.082810e-12,2.448268e-09,45678.339623,2.399423e-09,45671.547170,...,2.350381e-10,0.603774,105.962264,,6.132671e-10,45600.0,5.119293e-10,44162.716981,474.716981,1616.150943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,xa.s16.00.mhz.1977-04-17HR00_evid00249,21.576572,79461.433962,1.130128,1.813773,-3.205922e-14,7.390628e-09,85452.679245,7.103239e-09,85628.679245,...,3.024341e-10,0.603774,117.283019,,2.004712e-09,85400.0,1.492130e-09,85476.226415,474.716981,1709.433962
92,xa.s16.00.mhz.1977-06-02HR00_evid00255,21.484198,60075.169811,1.122312,1.778231,-4.263289e-12,6.727760e-08,63309.433962,6.674749e-08,63309.283019,...,1.258833e-09,0.603774,105.962264,,9.709842e-09,52200.0,8.107811e-09,52235.471698,474.716981,1461.433962
93,xa.s16.00.mhz.1973-08-25HR00_evid00443,20.941993,41033.207547,1.131335,1.849867,-1.882280e-12,2.788716e-09,73211.169811,2.787885e-09,73211.018868,...,2.952137e-10,0.603774,104.000000,,6.698406e-10,30600.0,4.476122e-10,37514.566038,474.716981,1744.452830
94,xa.s16.00.mhz.1973-12-18HR00_evid00487,21.077222,15788.377358,2.638555,7.940278,1.655550e-13,2.376602e-09,76632.452830,2.355352e-09,76497.660377,...,1.975482e-10,0.000000,118.490566,,6.096911e-10,76400.0,5.955764e-10,76453.735849,749.283019,1141.132075


In [54]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
knn_model = RandomForestRegressor(n_estimators=200, random_state=42)
knn_model.fit(X_train_abs, y_train_abs)  # Ensure X_train_abs and y_train_abs are defined elsewhere in your script
predictions = knn_model.predict(X_test)

# Store the results with filename and the predicted time_rel values
results = []
for i, pred in enumerate(predictions):
    results.append({
        'filename': final_dataset['filename'].iloc[i],  # Use the corresponding filename
        'time_rel': pred
    })

# Create a DataFrame from the results
output_df = pd.DataFrame(results)
output_df

Unnamed: 0,filename,time_rel
0,xa.s12.00.mhz.1969-12-16HR00_evid00006,82293.261665
1,xa.s12.00.mhz.1970-01-09HR00_evid00007,82293.261665
2,xa.s12.00.mhz.1970-02-07HR00_evid00014,82293.261665
3,xa.s12.00.mhz.1970-02-18HR00_evid00016,82293.261665
4,xa.s12.00.mhz.1970-03-14HR00_evid00018,82293.261665
...,...,...
91,xa.s16.00.mhz.1977-04-17HR00_evid00249,82293.261665
92,xa.s16.00.mhz.1977-06-02HR00_evid00255,82293.261665
93,xa.s16.00.mhz.1973-08-25HR00_evid00443,82293.261665
94,xa.s16.00.mhz.1973-12-18HR00_evid00487,82293.261665


In [48]:
output_df.to_csv('output_catalog.csv', index=False)

In [45]:
final_dataset

Unnamed: 0,filename,max_sta_lta,max_sta_lta_time,mean_sta_lta,var_sta_lta,mean_amp,max_amp,max_amp_time,second_max_amp,second_max_amp_time,...,rms_amp,zcr_threshold_time,energy_threshold_time,amp_spike_time,highest_avg_amplitude1,time_of_occurrence1,highest_avg_amplitude2,time_of_occurrence2,sta_lta_start_time,sta_lta_end_time
0,xa.s12.00.mhz.1969-12-16HR00_evid00006,21.607633,34359.849057,1.302176,2.751604,-5.366267e-13,3.502370e-09,1925.735849,3.158105e-09,1873.660377,...,2.224150e-10,0.000000,109.283019,,9.383250e-10,1800.0,7.524711e-10,1899.471698,474.716981,1002.716981
1,xa.s12.00.mhz.1970-01-09HR00_evid00007,21.994447,37582.037736,1.231040,2.836403,1.038720e-12,3.844195e-09,77571.018868,3.534276e-09,77569.962264,...,2.554970e-10,0.603774,113.056604,,6.422508e-10,75600.0,5.591219e-10,7597.886792,474.716981,1765.283019
2,xa.s12.00.mhz.1970-02-07HR00_evid00014,21.069734,53580.377358,1.110658,1.595738,-2.010854e-12,1.111206e-08,68167.094340,1.102741e-08,68167.245283,...,3.669408e-10,0.603774,108.528302,,1.979431e-09,68000.0,9.618759e-10,67906.113208,474.716981,1393.207547
3,xa.s12.00.mhz.1970-02-18HR00_evid00016,21.442240,23699.018868,1.177809,2.341165,6.435200e-13,7.313207e-09,52925.283019,7.278236e-09,52944.603774,...,4.277362e-10,0.603774,108.679245,,1.529351e-09,44600.0,1.492483e-09,44637.584906,474.716981,1214.943396
4,xa.s12.00.mhz.1970-03-14HR00_evid00018,20.801690,74701.584906,1.191739,2.341085,1.082810e-12,2.448268e-09,45678.339623,2.399423e-09,45671.547170,...,2.350381e-10,0.603774,105.962264,,6.132671e-10,45600.0,5.119293e-10,44162.716981,474.716981,1616.150943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,xa.s16.00.mhz.1977-04-17HR00_evid00249,21.576572,79461.433962,1.130128,1.813773,-3.205922e-14,7.390628e-09,85452.679245,7.103239e-09,85628.679245,...,3.024341e-10,0.603774,117.283019,,2.004712e-09,85400.0,1.492130e-09,85476.226415,474.716981,1709.433962
92,xa.s16.00.mhz.1977-06-02HR00_evid00255,21.484198,60075.169811,1.122312,1.778231,-4.263289e-12,6.727760e-08,63309.433962,6.674749e-08,63309.283019,...,1.258833e-09,0.603774,105.962264,,9.709842e-09,52200.0,8.107811e-09,52235.471698,474.716981,1461.433962
93,xa.s16.00.mhz.1973-08-25HR00_evid00443,20.941993,41033.207547,1.131335,1.849867,-1.882280e-12,2.788716e-09,73211.169811,2.787885e-09,73211.018868,...,2.952137e-10,0.603774,104.000000,,6.698406e-10,30600.0,4.476122e-10,37514.566038,474.716981,1744.452830
94,xa.s16.00.mhz.1973-12-18HR00_evid00487,21.077222,15788.377358,2.638555,7.940278,1.655550e-13,2.376602e-09,76632.452830,2.355352e-09,76497.660377,...,1.975482e-10,0.000000,118.490566,,6.096911e-10,76400.0,5.955764e-10,76453.735849,749.283019,1141.132075


In [None]:
import os
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv("Features_New_Expanded_Final.csv")
df = df.dropna(axis=1)
features_df_clean = df.drop(columns=['arrival'])
scaler = StandardScaler()
features_df_scaled = scaler.fit_transform(features_df_clean)
X = features_df_scaled  
y_absolute = df['arrival']  
X_train_abs, X_test_abs, y_train_abs, y_test_abs = train_test_split(X, y_absolute, test_size=0.2, random_state=42)

# Standardize features (zero mean and unit variance)
scaler = StandardScaler()
features_df_scaled = scaler.fit_transform(features_df_clean)

# Define the path to the data directory
data_dir = r'C:\\Users\\akshi\\Machine Learning Projects\\Space Apps Challenge\\data\\lunar\\test\\data'

# Initialize an empty list to store the concatenated DataFrame
all_features = []

# Recursively find all .mseed files in the subdirectories
for root, dirs, files in os.walk(data_dir):
    for file in files:
        if file.endswith('.mseed'):
            filepath = os.path.join(root, file)
            filename = os.path.splitext(file)[0]  # Get filename without extension
            
            # Assuming there's a way to map filenames to corresponding catalog information
            # Create a small dummy catalog row for each file (adjust based on actual catalog structure)
            catalog_data = pd.DataFrame([{
                'filename': filename
            }])

            # Save the catalog data to a temporary CSV file
            temp_catalog_path = os.path.join(root, 'temp_catalog.csv')
            catalog_data.to_csv(temp_catalog_path, index=False)

            # Extract features for the current file
            features_df = extract_features_for_catalog(temp_catalog_path, root)
            
            # Append the features for the current file to the main list
            all_features.append(features_df)

# Concatenate all extracted feature data into a single DataFrame
final_dataset = pd.concat(all_features, ignore_index=True)

# Prepare the input features for prediction by removing columns not used for prediction
print(final_dataset.drop(columns=['filename', 'arrival']).dropna(axis=1).columns)
X_test = final_dataset.drop(columns=['filename', 'arrival', 'zcr_threshold_time', 'sta_lta_start_time']).dropna(axis=1).values

# Make predictions using the KNN model
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train_abs, y_train_abs)  # Ensure X_train_abs and y_train_abs are defined elsewhere in your script
predictions = knn_model.predict(X_test)

# Store the results with filename and the predicted time_rel values
results = []
for i, pred in enumerate(predictions):
    results.append({
        'filename': final_dataset['filename'].iloc[i],  # Use the corresponding filename
        'time_rel': pred - predictions[0]  # Calculate relative time
    })

# Create a DataFrame from the results
output_df = pd.DataFrame(results)