In [15]:
import numpy as np
import pandas as pd
from scipy.fft import fft, fftfreq
from obspy.signal.trigger import classic_sta_lta, trigger_onset
from obspy import read
from datetime import datetime
from scipy.signal import find_peaks

# Load the miniSEED file
def load_mseed(file_path):
    """Read a miniSEED file and return a copy of its first trace.

    Args:
        file_path: Path handed to ``read``.

    Returns:
        A copy of the first entry in the stream's ``traces`` list.
    """
    stream = read(file_path)
    first_trace = stream.traces[0]
    return first_trace.copy()

# Apply a bandpass filter to the trace
def apply_bandpass_filter(trace, min_freq, max_freq):
    """Return a band-pass filtered copy of *trace*.

    The filter is applied to the object returned by ``trace.copy()``, not to
    *trace* itself.

    Args:
        trace: Object exposing ``copy()`` whose result exposes ``filter()``.
        min_freq: Forwarded to ``filter`` as ``freqmin``.
        max_freq: Forwarded to ``filter`` as ``freqmax``.

    Returns:
        The filtered copy.
    """
    passband = {'freqmin': min_freq, 'freqmax': max_freq}
    filtered = trace.copy()
    filtered.filter('bandpass', **passband)
    return filtered

# Get the arrival time from the trace
def get_arrival(tr, time):
    """Return how far *time* lies after the start of *tr*, in seconds.

    Args:
        tr: Trace whose ``stats.starttime.datetime`` is the reference instant.
        time: ``datetime`` to measure; values before the start give a
            negative result.

    Returns:
        The offset ``time - start`` as a float number of seconds.
    """
    trace_start = tr.stats.starttime.datetime
    offset = time - trace_start
    return offset.total_seconds()

# Function to compute STA/LTA with time
def compute_sta_lta_with_time(trace, sta_len, lta_len):
    """Compute the classic STA/LTA characteristic function and its peak.

    Args:
        trace: Trace providing ``data`` and ``stats.sampling_rate``.
        sta_len: Short-term window length; multiplied by the sampling rate
            and truncated to an integer sample count.
        lta_len: Long-term window length, converted the same way.

    Returns:
        Tuple ``(cft, max_value, max_time)`` where ``cft`` is the
        characteristic function, ``max_value`` its maximum and ``max_time``
        the index of that maximum divided by the sampling rate.
    """
    fs = trace.stats.sampling_rate
    nsta = int(sta_len * fs)
    nlta = int(lta_len * fs)
    cft = classic_sta_lta(trace.data, nsta, nlta)

    peak_value = np.max(cft)
    peak_index = np.argmax(cft)

    # Sample index -> seconds relative to the first sample.
    return cft, peak_value, peak_index / fs

# Function to detect STA/LTA events
def detect_sta_lta_events(trace, sta_len, lta_len, thr_on=2.0, thr_off=1.0):
    """Locate the first STA/LTA trigger window in *trace*.

    Args:
        trace: Trace providing ``data`` and ``stats.sampling_rate``.
        sta_len: Short-term window length passed to
            ``compute_sta_lta_with_time``.
        lta_len: Long-term window length passed to
            ``compute_sta_lta_with_time``.
        thr_on: Trigger-on threshold handed to ``trigger_onset``.
            Defaults to the previously hard-coded 2.0.
        thr_off: Trigger-off threshold handed to ``trigger_onset``.
            Defaults to the previously hard-coded 1.0.

    Returns:
        ``(start_time, end_time)`` of the first on/off pair, each being the
        sample index divided by the sampling rate, or ``(None, None)`` when
        no trigger is found.
    """
    cft, _, _ = compute_sta_lta_with_time(trace, sta_len, lta_len)
    on_off = trigger_onset(cft, thr_on, thr_off)

    # len() is used (not truthiness) because on_off may be a NumPy array.
    if len(on_off) == 0:
        return None, None

    fs = trace.stats.sampling_rate
    first_on, first_off = on_off[0]
    return first_on / fs, first_off / fs

# Split trace into 120-second batches
def split_into_batches(trace, batch_duration_sec=120):
    """Cut *trace* into consecutive, equally sized batches.

    Only complete batches are returned; a shorter remainder at the end of the
    data is discarded.

    Args:
        trace: Trace providing ``data``, ``stats.sampling_rate``,
            ``stats.starttime`` and ``copy()``.
        batch_duration_sec: Batch length; multiplied by the sampling rate and
            truncated to an integer sample count.

    Returns:
        List of trace copies, each holding one slice of the data and a
        ``stats.starttime`` advanced by the slice's offset in seconds.
    """
    fs = trace.stats.sampling_rate
    samples_per_batch = int(batch_duration_sec * fs)
    n_samples = len(trace.data)

    batches = []
    for offset in range(0, n_samples, samples_per_batch):
        chunk = trace.data[offset:offset + samples_per_batch]
        if len(chunk) != samples_per_batch:
            # Incomplete trailing slice: skip it.
            continue
        # NOTE(review): copy() presumably duplicates the whole trace before
        # its data is replaced below -- confirm if this becomes a hot spot.
        piece = trace.copy()
        piece.data = chunk
        piece.stats.starttime += offset / fs
        batches.append(piece)
    return batches

# Function to detect Zero-Crossing Rate (ZCR) threshold events in batches
def detect_zcr_threshold_in_batches(trace, zcr_threshold):
    """Find the first batch whose zero-crossing rate exceeds the threshold.

    Args:
        trace: Trace to split with ``split_into_batches`` (default length).
        zcr_threshold: Threshold forwarded to
            ``detect_zcr_threshold_events``.

    Returns:
        The detection time reported for the first matching batch, shifted by
        that batch's start offset from the start of *trace*, or ``None`` when
        no batch matches.
    """
    for batch in split_into_batches(trace):
        hit_in_batch = detect_zcr_threshold_events(batch, zcr_threshold)
        if hit_in_batch is None:
            continue
        # Offset of this batch from the beginning of the full trace.
        batch_offset = batch.stats.starttime - trace.stats.starttime
        return batch_offset + hit_in_batch
    return None

# Function to detect ZCR threshold events (individual batch)
def detect_zcr_threshold_events(trace, zcr_threshold):
    """Return the time of the first zero crossing if the ZCR is high enough.

    The zero-crossing rate is the number of positions where the sign of
    consecutive samples differs, divided by the number of samples.

    Args:
        trace: Trace providing ``data`` and ``stats.sampling_rate``.
        zcr_threshold: The rate must be strictly greater than this value.

    Returns:
        Index of the first sign change divided by the sampling rate, or
        ``None`` when the data is empty, has no sign change, or the rate does
        not exceed the threshold.
    """
    data = trace.data
    n_samples = len(data)
    if n_samples == 0:
        # Nothing to analyse; avoids dividing by zero below.
        return None

    # Non-zero entries of the sign difference mark a sign change between
    # sample i and sample i + 1.
    crossings = np.where(np.diff(np.sign(data)))[0]
    if len(crossings) == 0:
        return None

    zcr = len(crossings) / n_samples
    if zcr > zcr_threshold:
        return crossings[0] / trace.stats.sampling_rate
    return None

# Function to detect when the cumulative energy exceeds a threshold in batches
def detect_energy_threshold_in_batches(trace, energy_threshold):
    """Find the first batch in which the energy threshold is exceeded.

    Args:
        trace: Trace to split with ``split_into_batches`` (default length).
        energy_threshold: Threshold forwarded to
            ``detect_energy_threshold_events``.

    Returns:
        The detection time reported for the first matching batch, shifted by
        that batch's start offset from the start of *trace*, or ``None`` when
        no batch matches.
    """
    for batch in split_into_batches(trace):
        hit_in_batch = detect_energy_threshold_events(batch, energy_threshold)
        if hit_in_batch is None:
            continue
        # Offset of this batch from the beginning of the full trace.
        batch_offset = batch.stats.starttime - trace.stats.starttime
        return batch_offset + hit_in_batch
    return None

# Function to detect energy threshold events (individual batch)
def detect_energy_threshold_events(trace, energy_threshold):
    """Return when the normalised cumulative energy first exceeds a threshold.

    The running sum of squared samples is divided by its own maximum, so the
    curve is a fraction of the energy contained in *this* trace.

    NOTE(review): because of that normalisation the curve reaches 1.0 for any
    trace with non-zero energy, so every threshold below 1 is exceeded
    somewhere in it -- confirm this is the intended meaning of the threshold.

    Args:
        trace: Trace providing ``data`` and ``stats.sampling_rate``.
        energy_threshold: The normalised energy must be strictly greater than
            this value.

    Returns:
        Index of the first sample exceeding the threshold divided by the
        sampling rate, or ``None`` when the data is empty, carries no
        (finite, positive) energy, or never exceeds the threshold.
    """
    data = trace.data
    if len(data) == 0:
        # np.max would raise on an empty array.
        return None

    # Computed once and reused for both numerator and normaliser.
    running_energy = np.cumsum(data ** 2)
    total_energy = np.max(running_energy)
    if not total_energy > 0:
        # Zero (or NaN) total: the normalised curve is undefined.
        return None

    exceed_indices = np.where(running_energy / total_energy > energy_threshold)[0]
    if len(exceed_indices) == 0:
        return None
    return exceed_indices[0] / trace.stats.sampling_rate

# Function to detect amplitude spikes in batches
def detect_amplitude_spikes_in_batches(trace, amp_threshold):
    """Find the first batch that contains an amplitude spike.

    Args:
        trace: Trace to split with ``split_into_batches`` (default length).
        amp_threshold: Threshold forwarded to ``detect_amplitude_spikes``.

    Returns:
        The spike time reported for the first matching batch, shifted by that
        batch's start offset from the start of *trace*, or ``None`` when no
        batch contains a spike.
    """
    for batch in split_into_batches(trace):
        hit_in_batch = detect_amplitude_spikes(batch, amp_threshold)
        if hit_in_batch is None:
            continue
        # Offset of this batch from the beginning of the full trace.
        batch_offset = batch.stats.starttime - trace.stats.starttime
        return batch_offset + hit_in_batch
    return None

# Function to detect amplitude spikes (individual batch)
def detect_amplitude_spikes(trace, amp_threshold):
    """Return the time of the first sample whose magnitude exceeds a threshold.

    Args:
        trace: Trace providing ``data`` and ``stats.sampling_rate``.
        amp_threshold: The absolute sample value must be strictly greater
            than this value.

    Returns:
        Index of the first such sample divided by the sampling rate, or
        ``None`` when no sample qualifies.
    """
    fs = trace.stats.sampling_rate
    above_threshold = np.abs(trace.data) > amp_threshold
    hits = np.nonzero(above_threshold)[0]

    if len(hits) == 0:
        return None
    return hits[0] / fs

# Function to extract STA/LTA features with time
def extract_sta_lta_features_with_time(trace, sta_len, lta_len):
    """Summarise the STA/LTA characteristic function of *trace*.

    Args:
        trace: Trace forwarded to ``compute_sta_lta_with_time``.
        sta_len: Short-term window length, forwarded unchanged.
        lta_len: Long-term window length, forwarded unchanged.

    Returns:
        Tuple ``(max, max_time, mean, variance)`` of the characteristic
        function, with ``max`` and ``max_time`` taken from
        ``compute_sta_lta_with_time``.
    """
    cft, peak_value, peak_time = compute_sta_lta_with_time(trace, sta_len, lta_len)
    return peak_value, peak_time, np.mean(cft), np.var(cft)

# Function to extract amplitude features with min and max times
def extract_time_domain_features_with_max_min_time(trace):
    """Extract amplitude statistics and the times of the extreme samples.

    Times are sample indices divided by the sampling rate.

    Args:
        trace: Trace providing ``data`` (at least two samples) and
            ``stats.sampling_rate``.

    Returns:
        Tuple ``(mean_amp, max_amp, max_amp_time, second_max_amp,
        second_max_amp_time, min_amp, min_amp_time, rms_amp)``.
    """
    samples = trace.data
    fs = trace.stats.sampling_rate

    # Ascending sort order: the last two positions are the two largest samples.
    order = np.argsort(samples)
    idx_max, idx_second = order[-1], order[-2]
    idx_min = np.argmin(samples)

    return (
        np.mean(samples),
        samples[idx_max],
        idx_max / fs,
        samples[idx_second],
        idx_second / fs,
        np.min(samples),
        idx_min / fs,
        np.sqrt(np.mean(samples**2)),
    )

# Function to extract event duration features
def extract_event_duration(trace, sta_len, lta_len, thr_on=4.0, thr_off=1.5):
    """Return the mean duration of all STA/LTA trigger windows in *trace*.

    Args:
        trace: Trace providing ``data`` and ``stats.sampling_rate``.
        sta_len: Short-term window length passed to
            ``compute_sta_lta_with_time``.
        lta_len: Long-term window length passed to
            ``compute_sta_lta_with_time``.
        thr_on: Trigger-on threshold handed to ``trigger_onset``.
            Defaults to the previously hard-coded 4.0.
        thr_off: Trigger-off threshold handed to ``trigger_onset``.
            Defaults to the previously hard-coded 1.5.

    Returns:
        Mean of ``(end - start) / sampling_rate`` over every on/off pair, or
        ``0`` when no trigger is found.
    """
    cft = compute_sta_lta_with_time(trace, sta_len, lta_len)[0]
    on_off = trigger_onset(cft, thr_on, thr_off)

    # len() is used (not truthiness) because on_off may be a NumPy array.
    if len(on_off) == 0:
        return 0

    fs = trace.stats.sampling_rate
    durations = [(end - start) / fs for start, end in on_off]
    return np.mean(durations)

# Feature extraction loop
# Input locations: the event catalog and the directory holding one miniSEED
# file per catalog row.
cat_file_path = r'C:\Users\akshi\Machine Learning Projects\Space Apps Challenge\data\lunar\training\catalogs\apollo12_catalog_GradeA_final.csv'
data_directory = r'C:\Users\akshi\Machine Learning Projects\Space Apps Challenge\data\lunar\training\data\S12_GradeA'
cat = pd.read_csv(cat_file_path)

# STA/LTA window lengths shared by every STA/LTA-based feature below; the
# helpers multiply them by the sampling rate to obtain sample counts.
STA_LEN = 200
LTA_LEN = 475

features_list = []
for _, row in cat.iterrows():
    time = datetime.strptime(row['time_abs(%Y-%m-%dT%H:%M:%S.%f)'], '%Y-%m-%dT%H:%M:%S.%f')
    filename = row['filename']

    # Raw f-string: keeps the literal backslash separator without relying on
    # the invalid escape sequence "\{".
    mseed_file = rf'{data_directory}\{filename}.mseed'

    # Read each file exactly once; only the load itself is guarded.
    try:
        tr = load_mseed(mseed_file)
    except FileNotFoundError:
        print(f"File not found: {filename}")
        continue
    if not tr:
        # Same truthiness check as before; presumably skips empty traces.
        continue

    # Arrival is measured on the raw trace; all features use the filtered one.
    arrival = get_arrival(tr, time)
    tr_filt = apply_bandpass_filter(tr, 0.5, 1.0)

    # STA/LTA and amplitude features
    max_sta_lta, max_sta_lta_time, mean_sta_lta, var_sta_lta = extract_sta_lta_features_with_time(
        tr_filt, sta_len=STA_LEN, lta_len=LTA_LEN
    )
    (
        mean_amp,
        max_amp,
        max_amp_time,
        second_max_amp,
        second_max_amp_time,
        min_amp,
        min_amp_time,
        rms_amp,
    ) = extract_time_domain_features_with_max_min_time(tr_filt)
    event_duration = extract_event_duration(tr_filt, sta_len=STA_LEN, lta_len=LTA_LEN)

    # First time each detector fires (ZCR, cumulative energy, amplitude spike)
    zcr_threshold_time = detect_zcr_threshold_in_batches(tr_filt, zcr_threshold=0.5)
    energy_threshold_time = detect_energy_threshold_in_batches(tr_filt, energy_threshold=0.7)
    amp_spike_time = detect_amplitude_spikes_in_batches(tr_filt, amp_threshold=0.8)

    # First STA/LTA trigger window
    sta_lta_start_time, sta_lta_end_time = detect_sta_lta_events(
        tr_filt, sta_len=STA_LEN, lta_len=LTA_LEN
    )

    features_list.append({
        'filename': filename,
        'arrival': arrival,
        'max_sta_lta': max_sta_lta,
        'max_sta_lta_time': max_sta_lta_time,  # In relative seconds
        'mean_sta_lta': mean_sta_lta,
        'var_sta_lta': var_sta_lta,
        'mean_amp': mean_amp,
        'max_amp': max_amp,
        'max_amp_time': max_amp_time,  # In relative seconds
        'second_max_amp': second_max_amp,
        'second_max_amp_time': second_max_amp_time,  # In relative seconds
        'min_amp': min_amp,
        'min_amp_time': min_amp_time,  # In relative seconds
        'rms_amp': rms_amp,
        'event_duration': event_duration,
        'zcr_threshold_time': zcr_threshold_time,  # In relative seconds
        'energy_threshold_time': energy_threshold_time,  # In relative seconds
        'amp_spike_time': amp_spike_time,  # In relative seconds
        'sta_lta_start_time': sta_lta_start_time,  # In relative seconds
        'sta_lta_end_time': sta_lta_end_time,  # In relative seconds
    })

# Collect the per-event feature dicts into a DataFrame and display it.
df_features = pd.DataFrame(features_list)
df_features


Unnamed: 0,filename,arrival,max_sta_lta,max_sta_lta_time,mean_sta_lta,var_sta_lta,mean_amp,max_amp,max_amp_time,second_max_amp,second_max_amp_time,min_amp,min_amp_time,rms_amp,event_duration,zcr_threshold_time,energy_threshold_time,amp_spike_time,sta_lta_start_time,sta_lta_end_time
0,xa.s12.00.mhz.1970-01-19HR00_evid00002,73499.335,2.374340,37625.811321,1.015039,0.326191,2.723003e-20,7.168520e-09,73991.547170,6.700034e-09,73910.943396,-7.097313e-09,73990.943396,3.000852e-10,0.0,,114.566038,,474.716981,882.867925
1,xa.s12.00.mhz.1970-03-25HR00_evid00003,12719.560,2.295762,16873.962264,0.995377,0.039389,2.426280e-21,4.337924e-09,13448.905660,4.200714e-09,13575.396226,-4.348940e-09,13449.509434,2.318099e-10,0.0,,107.773585,,474.716981,1021.283019
2,xa.s12.00.mhz.1970-03-26HR00_evid00004,73019.435,2.374340,24264.150943,0.998502,0.096869,1.216101e-20,5.446156e-09,73572.377358,4.957813e-09,73571.169811,-5.906712e-09,73571.773585,2.508909e-10,0.0,,115.471698,,474.716981,1388.226415
3,xa.s12.00.mhz.1970-04-25HR00_evid00006,4439.804,2.310864,474.867925,0.994062,0.077505,-4.978128e-21,6.482011e-09,5300.679245,6.272693e-09,5301.886792,-6.443661e-09,5301.283019,2.935849e-10,0.0,,111.547170,,474.716981,1627.018868
4,xa.s12.00.mhz.1970-04-26HR00_evid00007,52139.340,2.296394,474.716981,0.993744,0.050529,-3.539701e-22,3.570016e-09,52970.113208,3.569771e-09,52405.283019,-3.922370e-09,52514.264151,2.289006e-10,0.0,,113.811321,,474.716981,1026.264151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,xa.s12.00.mhz.1974-10-14HR00_evid00156,63779.004,2.374246,1219.773585,1.001939,0.458038,1.885967e-23,5.891982e-09,64182.943396,5.762565e-09,64357.283019,-6.044107e-09,64491.471698,2.325307e-10,0.0,,94.641509,,474.716981,818.867925
71,xa.s12.00.mhz.1975-04-12HR00_evid00191,65699.813,2.363714,696.754717,0.998241,0.258681,-1.125688e-20,5.288960e-08,66322.566038,5.157103e-08,66322.415094,-4.998219e-08,66220.377358,1.837439e-09,0.0,,101.886792,,537.509434,944.603774
72,xa.s12.00.mhz.1975-05-04HR00_evid00192,36299.543,2.369645,36504.754717,0.989006,0.034921,3.310777e-23,1.034582e-07,37448.301887,9.815816e-08,37443.773585,-1.041873e-07,37447.698113,3.437509e-09,0.0,,113.811321,,474.716981,1996.679245
73,xa.s12.00.mhz.1975-06-24HR00_evid00196,57779.761,2.373947,723.924528,0.998972,0.174482,-6.691174e-21,6.624310e-09,57892.226415,6.442783e-09,57892.377358,-6.706438e-09,57891.773585,1.961781e-10,0.0,,48.754717,,474.716981,940.075472


In [16]:
# Keep only the columns without any missing value, remove the identifier and
# event_duration columns, then write the remaining features to CSV.
complete_columns = df_features.dropna(axis=1)
export_frame = complete_columns.drop(['filename', 'event_duration'], axis=1)
export_frame.to_csv('Features_New_Expanded.csv', index=False)