### Imports

In [6]:
import sys
import os
sys.path.append('../utils/analysis/')

import pandas as pd
import random
import time
import matplotlib.pyplot as plt
import seaborn as sns

from filtering import Filter

### Variables

In [7]:
# Run Specific Tests in Notebook
## Toggle for the subject search timing analysis; the analysis cell is skipped when False
run_subject_search_analysis = False
## Number of subject IDs to randomly sample for the subject search analysis
n_sub = 500

### Tests

In [8]:
# Read the ICU subject-ID CSV and keep its `subject_id` column as a plain
# Python list so it can be sampled from later in the notebook.
file = "../data/icu_unique_subject_ids.csv"
icu_subjects = pd.read_csv(file)
subject_ids = icu_subjects["subject_id"].tolist()
print(f"Total unique subject IDs in ICU: {len(subject_ids)}")

Total unique subject IDs in ICU: 65366


In [9]:
# Build a Filter for "chartevents" with debug disabled, then look up a single
# subject as a quick check that the search works.
# NOTE(review): `filter` shadows Python's built-in filter() for the rest of the
# notebook; a later cell calls `filter.search_subject`, so rename both together.
# NOTE(review): the recorded output still shows [Filter]/[search_subject] log
# lines even though debug=False — confirm what the flag controls.
filter = Filter("chartevents", debug=False)
df = filter.search_subject(10000690)

[Filter] Initialized for chartevents with 313645063 rows, sorted by subject_id
[Filter] Loaded optimized lookup table from /home/bdg20b/mimic-project/notebooks/../data/icu_unique_subject_ids.csv with columns for chartevents
[search_subject] Searching for subject_id: 10000690
[search_subject] Using byte-offset lookup: offset=46525, length=363134 bytes
[search_subject] Successfully loaded 3842 rows for subject 10000690


### Performance Analysis of Subject Search Operations

The following code cell measures the execution time of `search_subject` for a random sample of `n_sub` subject IDs from the ICU dataset. It runs only when `run_subject_search_analysis` is set to `True`. Each search is timed individually, the durations are collected in a list, and a boxplot visualizes their distribution. This gives a rough estimate of the time required to retrieve data for a single subject and highlights performance characteristics such as the median time, variability, and potential outliers. The plot is saved as an image file for further analysis.

In [10]:
if run_subject_search_analysis:
    # Benchmark `filter.search_subject` on a random sample of ICU subject IDs
    # and summarise the per-call latency as a boxplot saved to disk.

    # Sample size comes from `n_sub`, capped at the number of available IDs
    # because random.sample raises ValueError when asked for more items than
    # the population holds.
    num_samples = min(n_sub, len(subject_ids))
    selected_ids = random.sample(subject_ids, num_samples)

    # Duration of each search_subject call, in seconds.
    times = []

    for subject_id in selected_ids:
        # perf_counter is monotonic and uses the highest-resolution clock
        # available; time.time() can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        filter.search_subject(subject_id)
        times.append(time.perf_counter() - start_time)

    # Create a seaborn styled boxplot
    sns.set_theme(style="whitegrid")
    fig, ax = plt.subplots(figsize=(10, 6))

    # One vertical box. The colour is passed explicitly (first "Set2" entry)
    # because `palette` without `hue` is deprecated in seaborn >= 0.13.
    sns.boxplot(y=times, ax=ax, width=0.5, color=sns.color_palette("Set2")[0])

    ax.set_title(f'Time Taken for Subject Searches (n={num_samples})',
                 fontsize=14, fontweight='bold', pad=20)
    ax.set_ylabel('Time (seconds)', fontsize=12, fontweight='bold')
    # Pin the single tick before labelling it; set_xticklabels on its own
    # depends on whatever ticks happen to exist and can trigger a warning.
    ax.set_xticks([0])
    ax.set_xticklabels(['Search Times'], fontsize=11)

    fig.tight_layout()

    # Make sure the output directory exists so savefig cannot fail on a
    # fresh checkout.
    output_path = '../data/graphs/SearchTimeBoxPlot.png'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    fig.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("Subject search analysis not run. Set 'run_subject_search_analysis' to True to execute.")

Subject search analysis not run. Set 'run_subject_search_analysis' to True to execute.
