### Imports

In [None]:
import sys
import os
sys.path.append('../utils/analysis/')

import pandas as pd
import random
import time
import matplotlib.pyplot as plt
import seaborn as sns

from filtering import File_Filter, Subject_Filter

### Variables

In [None]:
# Toggles and sample sizes for the benchmark cells in this notebook
## TEST: time File_Filter.search_subject, one subject at a time
run_subject_search_analysis = True  # gates the File_Filter (chartevents) benchmark cell
n_sub = 5000  # number of subject IDs randomly sampled for that benchmark

## TEST: time Subject_Filter.get_all_subject_data, one subject at a time
run_subject_all_files = True  # gates the Subject_Filter (all files) benchmark cell
n_sub_2 = 5000  # number of subject IDs randomly sampled for that benchmark

### Tests

In [None]:
# Location of the CSV that lists the ICU subject IDs.
file = "../data/icu_unique_subject_ids.csv"

# Load the CSV, then pull the "subject_id" column out as a plain Python list;
# the benchmark cells further down draw their random samples from it.
icu_subjects = pd.read_csv(file)
subject_ids = icu_subjects["subject_id"].tolist()
print(f"Total unique subject IDs in ICU: {len(subject_ids)}")

In [None]:
# Smoke test: search chartevents for one hard-coded subject ID with debug=True.
# The instance is named `file_filter` rather than `filter` so that the builtin
# `filter()` is not shadowed for the rest of the notebook session.
file_filter = File_Filter("chartevents", debug=True)
ff = file_filter.search_subject(10000690)

### Performance Analysis of Subject Search Operations (File_Filter)

The following code cell measures the execution time of `search_subject` for a random sample of `n_sub` subject IDs from the ICU dataset, using the new `File_Filter` class. It times each search individually, stores the durations in a list, and draws a boxplot to visualize their distribution. This gives an approximate estimate of the time required to retrieve the data for a single subject from one specific file (`chartevents`), highlighting performance characteristics such as the median time, the variability, and potential outliers. The plot is saved as an image file for further analysis.

In [None]:
if run_subject_search_analysis:
    print("\n=== Performance Analysis for File_Filter (chartevents) ===")
    # Draw n_sub distinct subject IDs at random. random.sample raises
    # ValueError when asked for more items than the population holds, so the
    # request is capped at the number of IDs that were actually loaded.
    num_samples = min(n_sub, len(subject_ids))
    if num_samples < n_sub:
        print(f"Requested {n_sub} subjects but only {num_samples} are available; using {num_samples}.")
    selected_ids = random.sample(subject_ids, num_samples)

    # Per-call durations, in seconds
    times = []

    # A single File_Filter instance is reused for every search, so its
    # construction is not part of any measured interval
    ff = File_Filter("chartevents", debug=False)

    # Time each search_subject call. perf_counter is monotonic and
    # high-resolution; time.time() follows the system clock and can jump.
    for subject_id in selected_ids:
        start_time = time.perf_counter()
        ff.search_subject(subject_id)
        end_time = time.perf_counter()
        times.append(end_time - start_time)

    # Create a seaborn styled boxplot of the collected durations
    sns.set_theme(style="whitegrid")
    fig, ax = plt.subplots(figsize=(10, 6))

    sns.boxplot(data=times, ax=ax, width=0.5, palette="Set2")

    ax.set_title(f'Time Taken for File_Filter Subject Searches - chartevents (n={num_samples})', 
                fontsize=14, fontweight='bold', pad=20)
    ax.set_ylabel('Time (seconds)', fontsize=12, fontweight='bold')
    ax.set_xticklabels(['Search Times'], fontsize=11)

    plt.tight_layout()
    # savefig does not create missing directories, so make sure the target
    # folder exists before writing the image
    output_path = '../data/graphs/FileFilterSearchTimeBoxPlot.png'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.show()
    
    print(f"Completed performance analysis for {num_samples} subjects on chartevents")
else:
    print("Subject search analysis not run. Set 'run_subject_search_analysis' to True to execute.")

### Testing New Architecture (Refactor)

Testing the new `File_Filter` and `Subject_Filter` classes.

#### Testing File_Filter

In [None]:
# Time one end-to-end lookup: constructing the File_Filter plus a single
# search_subject call. perf_counter is used because it is monotonic and
# high-resolution, whereas time.time() follows the adjustable system clock.
start_time = time.perf_counter()
ff = File_Filter("chartevents", debug=False)
df_ff = ff.search_subject(10000690)
end_time = time.perf_counter()

print(f"File_Filter search took {end_time - start_time:.4f}s")
print(f"Rows found: {len(df_ff)}")
# Last expression of the cell: the notebook renders the first rows
df_ff.head()

#### Testing Subject_Filter (Cross-File)

In [None]:
# Time one end-to-end cross-file lookup: constructing the Subject_Filter plus
# a single get_all_subject_data call. perf_counter is used because it is
# monotonic and high-resolution, whereas time.time() follows the adjustable
# system clock.
start_time = time.perf_counter()
sf = Subject_Filter(debug=False)
data_dict = sf.get_all_subject_data(10000690)
end_time = time.perf_counter()

print(f"Subject_Filter search took {end_time - start_time:.4f}s")
# Report the row count per file; a None entry is counted as 0 rows
for file_id, df_data in data_dict.items():
    rows = len(df_data) if df_data is not None else 0
    print(f"{file_id}: {rows} rows")

In [None]:
# Performance analysis for Subject_Filter across multiple subjects
if run_subject_all_files:
    # flush=True pushes the banner out immediately, before the long loop starts
    print("\n=== Performance Analysis for Subject_Filter (All Files) ===", flush=True)
    # Draw n_sub_2 distinct subject IDs at random. random.sample raises
    # ValueError when asked for more items than the population holds, so the
    # request is capped at the number of IDs that were actually loaded.
    num_samples = min(n_sub_2, len(subject_ids))
    if num_samples < n_sub_2:
        print(f"Requested {n_sub_2} subjects but only {num_samples} are available; using {num_samples}.", flush=True)
    selected_ids = random.sample(subject_ids, num_samples)
    
    # Per-call durations, in seconds
    times = []
    # NOTE: `sf` is the Subject_Filter instance created in the preceding cell;
    # that cell must have been run first.
    for i, subject_id in enumerate(selected_ids, start=1):
        # perf_counter is monotonic and high-resolution; time.time() follows
        # the system clock and can jump.
        start_time = time.perf_counter()
        sf.get_all_subject_data(subject_id)
        end_time = time.perf_counter()
        times.append(end_time - start_time)
        # Progress report: running average every 25 subjects
        if i % 25 == 0:
            avg_time = sum(times) / len(times)
            print(f"Average time after {i} subjects: {avg_time:.4f}s", flush=True)
    
    # Create a seaborn styled boxplot of the collected durations
    sns.set_theme(style="whitegrid")
    fig, ax = plt.subplots(figsize=(10, 6))
    
    sns.boxplot(data=times, ax=ax, width=0.5, palette="Set2")
    
    ax.set_title(f'Time Taken for Subject_Filter All Files Searches (n={num_samples})', 
                fontsize=14, fontweight='bold', pad=20)
    ax.set_ylabel('Time (seconds)', fontsize=12, fontweight='bold')
    ax.set_xticklabels(['Search Times'], fontsize=11)
    
    plt.tight_layout()
    # savefig does not create missing directories, so make sure the target
    # folder exists before writing the image
    output_path = '../data/graphs/SubjectFilterAllFilesBoxPlot.png'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.show()
    
    print(f"Completed performance analysis for {num_samples} subjects", flush=True)
else:
    print("Subject all files analysis not run. Set 'run_subject_all_files' to True to execute.")