In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
all_input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in all_input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, explode, array
from pyspark.sql.types import DoubleType, ArrayType, StringType
import numpy as np
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

# Create the Spark session for this notebook (getOrCreate reuses a running one)
spark = (
    SparkSession.builder
    .appName("EEG Analysis")
    .getOrCreate()
)

# Competition data root, and the single EEG parquet file read below
BASE_PATH = '/kaggle/input/hms-harmful-brain-activity-classification/'
FILE_PATH = BASE_PATH + 'train_eegs/1000913311.parquet'



# # Shutdown Spark
# spark.stop()

# **About the Problem**

There are 6 patterns to be identified:

seizure (SZ)
generalized periodic discharges (GPD)
lateralized periodic discharges (LPD)
lateralized rhythmic delta activity (LRDA)
generalized rhythmic delta activity (GRDA)
other

The annotations were made by a group of experts; however, the challenge is that even the experts cannot fully agree on every case. Hence, the competition creates a second set of labels:

where there are high levels of agreement => “idealized” patterns
where ~1/2 of the experts give a label as “other” and ~1/2 give one of the remaining five labels => “proto” patterns
where experts are approximately split between 2 of the 5 named patterns => “edge” cases


In [3]:
df_eeg = spark.read.parquet(FILE_PATH)
df_eeg

DataFrame[Fp1: float, F3: float, C3: float, P3: float, F7: float, T3: float, T5: float, O1: float, Fz: float, Cz: float, Pz: float, Fp2: float, F4: float, C4: float, P4: float, F8: float, T4: float, T6: float, O2: float, EKG: float]

In [4]:
 df_eeg.show(5)


+-------+-------+------+------+-------+-------+------+------+------+------+------+------+------+-------+------+------+------+------+-------+--------+
|    Fp1|     F3|    C3|    P3|     F7|     T3|    T5|    O1|    Fz|    Cz|    Pz|   Fp2|    F4|     C4|    P4|    F8|    T4|    T6|     O2|     EKG|
+-------+-------+------+------+-------+-------+------+------+------+------+------+------+------+-------+------+------+------+------+-------+--------+
|-105.85| -89.23|-79.46|-49.23| -99.73| -87.77|-53.33|-50.74|-32.25| -42.1|-43.27|-88.73|-74.41| -92.46|-58.93|-75.74|-59.47|  8.21|  66.49| 1404.93|
| -85.47| -75.07|-60.26|-38.92| -73.08| -87.51|-39.68|-35.63|-76.84|-62.74|-43.04|-68.63|-61.69| -69.32|-35.79| -58.9|-41.66|196.19| 230.67| 3402.67|
|   8.84|  34.85| 56.43| 67.97|   48.1|  25.35| 80.25| 48.06|  6.72| 37.88|  61.0| 16.58| 55.06|  45.02| 70.53| 47.82| 72.03|-67.18|-171.31| -3565.8|
| -56.32| -37.28| -28.1| -2.82| -43.43| -35.05|  3.91|-12.66|  8.65|  3.83|  4.18| -51.9|-21.89| -41

In [5]:
 df = spark.read.csv("/kaggle/input/hms-harmful-brain-activity-classification/train.csv", 
                     header=True, inferSchema=True)

# Display the first few rows
df.show(5)

+----------+----------+------------------------+--------------+------------------+--------------------------------+----------+----------+----------------+------------+--------+--------+---------+---------+----------+
|    eeg_id|eeg_sub_id|eeg_label_offset_seconds|spectrogram_id|spectrogram_sub_id|spectrogram_label_offset_seconds|  label_id|patient_id|expert_consensus|seizure_vote|lpd_vote|gpd_vote|lrda_vote|grda_vote|other_vote|
+----------+----------+------------------------+--------------+------------------+--------------------------------+----------+----------+----------------+------------+--------+--------+---------+---------+----------+
|1628180742|         0|                     0.0|        353733|                 0|                             0.0| 127492639|     42516|         Seizure|           3|       0|       0|        0|        0|         0|
|1628180742|         1|                     6.0|        353733|                 1|                             6.0|3887563113|     4

In [6]:
# if "eval" in FLAGS:
#     import os

#     # Set the environment variable
#     os.environ["PYSPARK_PIN_THREAD"] = "False"
#     # spark.builder.config("spark.jars.packages", "org.mlflow.mlflow-spark")
#     import mlflow

#     # mlflow.set_tracking_uri("http://127.0.0.0:5000")
#     mlflow.set_tracking_uri("http://localhost:5000")
#     mlflow.autolog()



In [7]:
 # Extract column names
columns = df.columns
# Pick the vote columns by name instead of by position, so that an extra column
# appended to `df` (for example an index column) cannot shift the target list.
TARGETS = [c for c in columns if c.endswith("_vote")]

# Print shape (row count and column count)
print("Train shape:", (df.count(), len(columns)))

# Display target column names
print("Target Labels:", TARGETS)


Train shape: (106800, 15)
Target Labels: ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']


In [8]:
from pyspark.sql.functions import col

# For every target column, show how often each vote value occurs, most frequent first
for vote_col in TARGETS:
    value_counts = df.groupBy(vote_col).count()
    value_counts.orderBy("count", ascending=False).show()


+------------+-----+
|seizure_vote|count|
+------------+-----+
|           0|73906|
|           3|19520|
|           1| 6475|
|           2| 2329|
|           5| 1825|
|           4| 1745|
|           6|  336|
|           7|  313|
|           8|   91|
|           9|   57|
|          10|   54|
|          15|   36|
|          13|   30|
|          11|   29|
|          14|   25|
|          12|   22|
|          19|    4|
|          16|    3|
+------------+-----+

+--------+-----+
|lpd_vote|count|
+--------+-----+
|       0|77675|
|       1| 9680|
|       2| 4618|
|       3| 4011|
|       4| 2290|
|       5| 1323|
|       6| 1065|
|       7|  863|
|      13|  769|
|      14|  739|
|      10|  629|
|       8|  616|
|      12|  589|
|       9|  574|
|      15|  557|
|      11|  545|
|      17|  120|
|      16|   92|
|      18|   45|
+--------+-----+

+--------+-----+
|gpd_vote|count|
+--------+-----+
|       0|82027|
|       1| 5643|
|       2| 4352|
|       3| 2756|
|      10| 2052|
|      11

In [10]:
from pyspark.sql.functions import first, min, max, sum, col

# Build one row per eeg_id in a single aggregation pass:
#   spec_id - a spectrogram_id of the group (first() returns whichever row Spark
#             sees first; presumably every eeg_id maps to a single spectrogram_id,
#             in which case the choice does not matter - TODO confirm)
#   min/max - earliest and latest spectrogram_label_offset_seconds of the group
# Computing min and max together avoids a second groupBy plus a self-join on eeg_id.
train = df.groupBy("eeg_id").agg(
    first("spectrogram_id").alias("spec_id"),
    min("spectrogram_label_offset_seconds").alias("min"),
    max("spectrogram_label_offset_seconds").alias("max"),
)


In [11]:
# One patient_id per eeg_id (first() keeps a single value from each group).
tmp = df.groupBy("eeg_id").agg(first("patient_id").alias("patient_id"))
# Left join on eeg_id so every row already in `train` is kept.
train = train.join(tmp, on="eeg_id", how="left")


In [12]:
# Per-eeg_id totals of every target column, keeping the original column names
summed_targets = [sum(col(label)).alias(label) for label in TARGETS]
target_agg = df.groupBy("eeg_id").agg(*summed_targets)

# Attach the totals to the per-eeg_id frame
train = train.join(target_agg, on="eeg_id", how="left")

In [13]:
train.show()

+----------+---------+------+------+----------+------------+--------+--------+---------+---------+----------+
|    eeg_id|  spec_id|   min|   max|patient_id|seizure_vote|lpd_vote|gpd_vote|lrda_vote|grda_vote|other_vote|
+----------+---------+------+------+----------+------------+--------+--------+---------+---------+----------+
|2508460624| 14160642|1244.0|1278.0|      1218|           0|      24|       0|        0|        0|         0|
|3900790925| 21532485| 464.0| 464.0|     42739|           0|       0|       0|        0|        0|         1|
|2080477284| 55011312|   0.0| 128.0|     50648|           0|       0|       0|        0|       87|         0|
|2186024367| 58286428|   0.0|  14.0|      7041|           0|       6|       0|        0|       60|        24|
|2970840697| 61462353|   0.0|  68.0|      8376|           0|       0|       0|        0|       63|         0|
| 589472925|112782480|   0.0| 152.0|      3507|          80|       0|       0|        0|        0|         0|
|116698644

In [39]:
# NOTE(review): this rebinds `train` to the raw frame `df`, replacing the
# per-eeg_id aggregate assembled in the cells above - confirm this is intended.
train = df
train.show()

+----------+----------+------------------------+--------------+------------------+--------------------------------+----------+----------+----------------+------------+--------+--------+---------+---------+----------+------------+
|    eeg_id|eeg_sub_id|eeg_label_offset_seconds|spectrogram_id|spectrogram_sub_id|spectrogram_label_offset_seconds|  label_id|patient_id|expert_consensus|seizure_vote|lpd_vote|gpd_vote|lrda_vote|grda_vote|other_vote|index_column|
+----------+----------+------------------------+--------------+------------------+--------------------------------+----------+----------+----------------+------------+--------+--------+---------+---------+----------+------------+
|1628180742|         0|                     0.0|        353733|                 0|                             0.0| 127492639|     42516|         Seizure|           3|       0|       0|        0|        0|         0|           0|
|1628180742|         1|                     6.0|        353733|                 

In [40]:
from pyspark.sql.functions import sum as spark_sum

from pyspark.sql.functions import monotonically_increasing_id

 

# Vote columns are the ones whose name contains '_vote'
vote_cols = [name for name in train.columns if '_vote' in name]
print("vote cols:", vote_cols)
colss = ["eeg_id", "spectrogram_id", "patient_id"]

# Sum the vote columns within each (eeg_id, spectrogram_id, patient_id) group,
# then tag every grouped row with a generated id
vote_totals = [spark_sum(name).alias(name) for name in vote_cols]
train_group = (
    train.groupBy(*colss)
    .agg(*vote_totals)
    .withColumn("index_column", monotonically_increasing_id())
)

train_group.show(7)


vote cols: ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
+----------+--------------+----------+------------+--------+--------+---------+---------+----------+------------+
|    eeg_id|spectrogram_id|patient_id|seizure_vote|lpd_vote|gpd_vote|lrda_vote|grda_vote|other_vote|index_column|
+----------+--------------+----------+------------+--------+--------+---------+---------+----------+------------+
| 736446371|      10397461|     29441|           0|       1|       0|        0|        0|         0|           0|
|2609269581|      27417697|     44246|           0|       0|       6|        0|        0|         9|           1|
|2148709110|      43261985|     25986|           0|       2|       0|        0|        0|         0|           2|
|4067106618|     114346114|     53681|           2|      26|       0|        2|        0|         0|           3|
|1457886247|     120589953|     26524|           6|       0|       0|        0|        0|         0|          

In [59]:
from pyspark.sql.functions import sum as spark_sum, monotonically_increasing_id

# Tag each input row with a generated id (the grouping below does not use it)
train = train.withColumn("index_column", monotonically_increasing_id())

# Vote columns are the ones whose name contains '_vote'
vote_cols = [name for name in train.columns if '_vote' in name]
print("Vote cols:", vote_cols)

# Sum the vote columns within each (eeg_id, spectrogram_id, patient_id) group
group_keys = ("eeg_id", "spectrogram_id", "patient_id")
train_group = train.groupBy(*group_keys).agg(
    *(spark_sum(name).alias(name) for name in vote_cols)
)

# Preview the first 7 grouped rows
train_group.show(7)


Vote cols: ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
+----------+--------------+----------+------------+--------+--------+---------+---------+----------+
|    eeg_id|spectrogram_id|patient_id|seizure_vote|lpd_vote|gpd_vote|lrda_vote|grda_vote|other_vote|
+----------+--------------+----------+------------+--------+--------+---------+---------+----------+
| 736446371|      10397461|     29441|           0|       1|       0|        0|        0|         0|
|2609269581|      27417697|     44246|           0|       0|       6|        0|        0|         9|
|2148709110|      43261985|     25986|           0|       2|       0|        0|        0|         0|
|4067106618|     114346114|     53681|           2|      26|       0|        2|        0|         0|
|1457886247|     120589953|     26524|           6|       0|       0|        0|        0|         0|
|3307432033|     158642283|     28415|           0|       0|       0|        0|        4|        56|

In [61]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
import numpy, pandas
import pyspark.pandas as ps
import numpy as np
ps.set_option('compute.ops_on_diff_frames', True)

def categorize_votes(row):
    """Label the level of expert agreement for one row of vote counts.

    Parameters
    ----------
    row : pandas.Series-like
        Must support ``row[list_of_names]`` (the result providing ``max()`` and
        ``sum()``) and ``row['other_vote']`` for the six ``*_vote`` columns.

    Returns
    -------
    str
        'idealized' when the most-voted label holds at least 70% of the votes;
        'proto'     when 'other' holds at least 40% of the votes and the
                    most-voted label holds at least 40%;
        'edge'      when 'other' received no votes and the most-voted label
                    holds at least 40%;
        'undecided' otherwise, including rows that contain no votes at all.
    """
    col_names = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
    votes = row[col_names]
    max_vote = votes.max()
    total_votes = votes.sum()

    # Without any votes no share can be computed; return early instead of
    # dividing by zero.
    if total_votes == 0:
        return 'undecided'

    high_agreement_threshold = 70
    equal_splitting_threshold = 40

    # Share of the most-voted label, as a percentage of all votes
    percentage = max_vote / total_votes * 100

    if percentage >= high_agreement_threshold:
        return 'idealized'
    elif row['other_vote'] / total_votes >= 0.4 and percentage >= equal_splitting_threshold:
        return 'proto'
    elif row['other_vote'] == 0 and percentage >= equal_splitting_threshold:
        return 'edge'
    else:
        return 'undecided'


 
# Wrap the grouped frame in a pandas-on-Spark DataFrame so the row-wise apply below can be used.
# NOTE(review): this rebinds `train_group` to a different frame type than the one
# produced by the groupBy cell - cells that expect the Spark API will no longer work on it.
train_group= ps.DataFrame(train_group)
# axis=1 passes each row to categorize_votes; the returned label is stored in 'pattern'.
train_group['pattern'] = train_group.apply(categorize_votes, axis=1)
train_group.head(7)





Unnamed: 0,eeg_id,spectrogram_id,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,pattern
0,736446371,10397461,29441,0,1,0,0,0,0,idealized
1,2609269581,27417697,44246,0,0,6,0,0,9,proto
2,2148709110,43261985,25986,0,2,0,0,0,0,idealized
3,4067106618,114346114,53681,2,26,0,2,0,0,idealized
4,1457886247,120589953,26524,6,0,0,0,0,0,idealized
5,3307432033,158642283,28415,0,0,0,0,4,56,idealized
6,3691913454,172731638,35225,0,24,0,0,0,14,undecided


In [17]:
# from functools import reduce
# from pyspark.sql.functions import col

# # Compute total votes as the sum of all target columns
# train = train.withColumn(
#     "total_votes",
#     reduce(lambda x, y: x + y, [col(t) for t in TARGETS])  # Sum up all target columns
# )


# from pyspark.sql.functions import when

# # Avoid division by zero using `when`
# for t in TARGETS:
#     train = train.withColumn(t, when(col("total_votes") > 0, col(t) / col("total_votes")).otherwise(0))

# # Drop `total_votes` after normalization
# # train = train.drop("total_votes")
# train.show()

In [38]:
# tmp = df.groupBy("eeg_id").agg(first("expert_consensus").alias("target"))
# train = train.join(tmp, on="eeg_id", how="left")
# train.show()

In [20]:
# train = train.orderBy("eeg_id")  # Optional: Order by EEG ID
# train.show(5)

# print("Train non-overlapping EEG ID shape:", (train.count(), len(train.columns)))


+------+----------+------+------+----------+------------+------------------+--------+-------------------+-------------------+-------------------+------+
|eeg_id|   spec_id|   min|   max|patient_id|seizure_vote|          lpd_vote|gpd_vote|          lrda_vote|          grda_vote|         other_vote|target|
+------+----------+------+------+----------+------------+------------------+--------+-------------------+-------------------+-------------------+------+
|568657| 789577333|   0.0|  16.0|     20654|         0.0|               0.0|    0.25|                0.0|0.16666666666666666| 0.5833333333333334| Other|
|582999|1552638400|   0.0|  38.0|     20230|         0.0|0.8571428571428571|     0.0|0.07142857142857142|                0.0|0.07142857142857142|   LPD|
|642382|  14960202|1008.0|1032.0|      5955|         0.0|               0.0|     0.0|                0.0|                0.0|                1.0| Other|
|751790| 618728447| 908.0| 908.0|     38549|         0.0|               0.0|     1

In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, min
import os

# Initialize Spark session 

# Directory that holds the spectrogram parquet files
PATH = "/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/"

# Keep only the directory entries whose name ends in .parquet
files = []
for entry in os.listdir(PATH):
    if entry.endswith(".parquet"):
        files.append(entry)

print(f"There are {len(files)} spectrogram parquet files.")

 

There are 11138 spectrogram parquet files.


In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import numpy as np

 
# NOTE: READ_SPEC_FILES is assigned in a later cell of this notebook; that cell has
# to be executed before this one, otherwise the next line raises NameError.
if READ_SPEC_FILES:
    # Build {spectrogram id: 2-D array}, one entry per parquet file. Each file is
    # loaded on its own so that all rows of a spectrogram stay together. The id is
    # the file name without its extension; the first column is dropped and the
    # remaining columns form the array.
    spectrograms = {}
    for parquet_name in files:
        spec_frame = pd.read_parquet(os.path.join(PATH, parquet_name))
        spectrograms[int(parquet_name.split('.')[0])] = spec_frame.iloc[:, 1:].values

else:
    # SECURITY NOTE: allow_pickle=True unpickles arbitrary Python objects; only
    # load this file from a source you trust.
    spectrograms = np.load('/kaggle/input/brain-spectrograms/specs.npy', allow_pickle=True).item()


In [23]:


# Switch read by the spectrogram-loading cell: True reads the individual parquet files.
READ_SPEC_FILES = False # If READ_SPEC_FILES is False, the code reads the combined file instead of individual files.
# NOTE(review): FEATURE_ENGINEER is only referenced by commented-out code in the
# visible part of the notebook - confirm it is still needed.
FEATURE_ENGINEER = True



In [24]:


# %time
# # ENGINEER FEATURES
# import warnings
# warnings.filterwarnings('ignore')

# # The code generates features from the spectrogram data for use in a model 
# # The features are derived by calculating the mean and minimum values over time for each of the 400 spectrogram frequencies.
# # Two types of windows are used for these calculations:
# # A 10-minute window (_mean_10m, _min_10m).
# # A 20-second window (_mean_20s, _min_20s).
# # This process results in 1600 features (400 features × 4 calculations) for each EEG ID.

#  # = pd.read_parquet(f'{PATH}1000086677.parquet').columns[1:]
# SPEC_COLS = spark.read.parquet(f'{PATH}1000086677.parquet').columns[1:]

# # # Get all columns except the first one
# # SPEC_COLS = df

# FEATURES = [f'{c}_mean_10m' for c in SPEC_COLS]
# FEATURES += [f'{c}_min_10m' for c in SPEC_COLS]
# FEATURES += [f'{c}_mean_20s' for c in SPEC_COLS]
# FEATURES += [f'{c}_min_20s' for c in SPEC_COLS]
# print(f'We are creating { len(FEATURES)} features for { (train).count()} rows... ',end='')


# # A data matrix data is initialized to store the new features for each eeg_id in the train DataFrame.
# # For each row in train, the code calculates the mean and minimum values within the specified 10-minute and 20-second windows.
# # These calculated values are then stored in the data matrix.
# # Finally, the matrix is added to the train DataFrame as new columns.

# if FEATURE_ENGINEER:
#     data = np.zeros(( (train.count()),len(FEATURES)))
#     for k in range( (train).count()):
#         if k%100==0: print(k,', ',end='')
#         row = train.collect()[k]  # ✅ Correct for retrieving a specific row

#         r = int( (row['min'] + row['max'])//4 ) 
        
#         # 10 MINUTE WINDOW FEATURES (MEANS and MINS)
#         x = np.nanmean(spectrograms[row.spec_id][r:r+300,:],axis=0)
#         data[k,:400] = x
#         x = np.nanmin(spectrograms[row.spec_id][r:r+300,:],axis=0)
#         data[k,400:800] = x
        
#         # 20 SECOND WINDOW FEATURES (MEANS and MINS)
#         x = np.nanmean(spectrograms[row.spec_id][r+145:r+155,:],axis=0)
#         data[k,800:1200] = x
#         x = np.nanmin(spectrograms[row.spec_id][r+145:r+155,:],axis=0)
#         data[k,1200:1600] = x

#     train[FEATURES] = data
# else:
#     train = pd.read_parquet('/kaggle/input/brain-spectrograms/train.pqt')
# print()
# print('New train shape:',train.shape)



In [41]:
print("Train shape:", (train.count(), len(train.columns)), "\n")  # PySpark equivalent of shape
print("Unique eeg_ids: ", train.select("eeg_id").distinct().count())
# show() prints the table itself and returns None, so it is called directly
# rather than wrapped in print(), which would emit a stray "None".
train.groupBy("eeg_id").count().describe().show()
print()
print("Unique spectrogram_ids: ", train.select("spec_id").distinct().count())

print("Unique patient_ids: ", train.select("patient_id").distinct().count(), "\n")


Train shape: (17089, 12) 

Unique eeg_ids:  17089
+-------+--------------------+-----+
|summary|              eeg_id|count|
+-------+--------------------+-----+
|  count|               17089|17089|
|   mean|2.1352255234347825E9|  1.0|
| stddev|1.2357121085973365E9|  0.0|
|    min|              568657|    1|
|    max|          4294958358|    1|
+-------+--------------------+-----+

None 

Unique spectrogram_ids:  11138
Unique patient_ids:  1950 



In [44]:
# Vote columns are the ones whose name contains '_vote'
vote_cols = [name for name in train.columns if '_vote' in name]
print("vote cols:", vote_cols)

from pyspark.sql.functions import sum

# Sum every vote column within each (eeg_id, spec_id, patient_id) group
summed_votes = [sum(name).alias(name) for name in vote_cols]
train_group = train.groupBy("eeg_id", "spec_id", "patient_id").agg(*summed_votes)

train_group.show(7)  # Preview the first 7 grouped rows


vote cols: ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
+----------+---------+----------+------------+-------------------+--------+---------+------------------+-------------------+
|    eeg_id|  spec_id|patient_id|seizure_vote|           lpd_vote|gpd_vote|lrda_vote|         grda_vote|         other_vote|
+----------+---------+----------+------------+-------------------+--------+---------+------------------+-------------------+
|2508460624| 14160642|      1218|         0.0|                1.0|     0.0|      0.0|               0.0|                0.0|
|3900790925| 21532485|     42739|         0.0|                0.0|     0.0|      0.0|               0.0|                1.0|
|2080477284| 55011312|     50648|         0.0|                0.0|     0.0|      0.0|               1.0|                0.0|
|2186024367| 58286428|      7041|         0.0|0.06666666666666667|     0.0|      0.0|0.6666666666666666|0.26666666666666666|
|2970840697| 61462353|      8376|

In [48]:
from pyspark.sql.functions import col, greatest, sum, when

# Column expressions for the six vote columns
vote_names = ["seizure_vote", "lpd_vote", "gpd_vote", "lrda_vote", "grda_vote", "other_vote"]
vote_exprs = [col(name) for name in vote_names]

# Left-to-right sum of the six vote columns
total_expr = vote_exprs[0]
for vote_expr in vote_exprs[1:]:
    total_expr = total_expr + vote_expr

# Helper columns: largest single vote, total votes, and the winner's share in percent
train_group = (
    train_group
    .withColumn("max_vote", greatest(*vote_exprs))
    .withColumn("total_votes", total_expr)
    .withColumn("percentage", (col("max_vote") / col("total_votes")) * 100)
)

# Agreement pattern; the first matching rule wins
winner_share = col("percentage")
other_share = col("other_vote") / col("total_votes")
pattern_expr = (
    when(winner_share >= 70, "idealized")
    .when((other_share >= 0.4) & (winner_share >= 40), "proto")
    .when((col("other_vote") == 0) & (winner_share >= 40), "edge")
    .otherwise("undecided")
)
train_group = train_group.withColumn("pattern", pattern_expr)

train_group.show(7)


+----------+---------+----------+------------+-------------------+--------+---------+------------------+-------------------+---------+------------------+-----------+-----------------+
|    eeg_id|  spec_id|patient_id|seizure_vote|           lpd_vote|gpd_vote|lrda_vote|         grda_vote|         other_vote|  pattern|          max_vote|total_votes|       percentage|
+----------+---------+----------+------------+-------------------+--------+---------+------------------+-------------------+---------+------------------+-----------+-----------------+
|2508460624| 14160642|      1218|         0.0|                1.0|     0.0|      0.0|               0.0|                0.0|idealized|               1.0|        1.0|            100.0|
|3900790925| 21532485|     42739|         0.0|                0.0|     0.0|      0.0|               0.0|                1.0|idealized|               1.0|        1.0|            100.0|
|2080477284| 55011312|     50648|         0.0|                0.0|     0.0|     

In [51]:
# Show the row(s) of one specific EEG recording
train.filter(train.eeg_id == 722738444).show()

+---------+-------+---+----+----------+------------+--------+--------+---------+---------+----------+------+
|   eeg_id|spec_id|min| max|patient_id|seizure_vote|lpd_vote|gpd_vote|lrda_vote|grda_vote|other_vote|target|
+---------+-------+---+----+----------+------------+--------+--------+---------+---------+----------+------+
|722738444| 999431|0.0|24.0|     56885|         0.0|  0.0625|     0.0|    0.875|      0.0|    0.0625|  LRDA|
+---------+-------+---+----+----------+------------+--------+--------+---------+---------+----------+------+



In [52]:
# Spectrogram to inspect
spectrogram_id = 789577333

# Load that spectrogram's parquet file into a pandas DataFrame
spec_base_path = "/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/"
spec_file = spec_base_path + str(spectrogram_id) + ".parquet"
spec_data = pd.read_parquet(spec_file)

print(spec_data.shape)

spec_data.head()

(308, 401)


Unnamed: 0,time,LL_0.59,LL_0.78,LL_0.98,LL_1.17,LL_1.37,LL_1.56,LL_1.76,LL_1.95,LL_2.15,...,RP_18.16,RP_18.36,RP_18.55,RP_18.75,RP_18.95,RP_19.14,RP_19.34,RP_19.53,RP_19.73,RP_19.92
0,1,180.660004,211.25,209.270004,159.089996,113.300003,97.82,39.529999,49.080002,69.209999,...,0.84,1.51,1.25,1.74,2.39,2.04,3.48,2.67,2.81,1.77
1,3,134.839996,210.960007,272.209991,206.259995,187.899994,139.059998,111.860001,99.080002,107.290001,...,0.47,0.58,0.6,0.55,0.6,0.47,0.57,0.54,0.56,0.61
2,5,94.389999,130.330002,138.309998,159.229996,132.679993,160.929993,112.449997,141.649994,90.589996,...,0.32,0.32,0.44,0.45,0.43,0.42,0.51,0.81,1.4,1.43
3,7,227.080002,307.809998,282.970001,265.160004,275.470001,166.610001,138.5,149.380005,91.410004,...,2.43,4.47,6.61,8.48,9.24,10.46,11.84,12.83,12.84,11.61
4,9,243.559998,352.869995,353.369995,298.890015,263.920013,141.300003,130.360001,108.129997,107.110001,...,2.64,3.7,6.23,9.01,10.95,11.67,11.74,11.89,11.27,10.14


In [1]:
# print("Frequency Columns:", freq_cols)
# print("Available Columns:", df.columns)
# # Identify spectrogram feature columns (exclude non-numeric and timestamp)
# freq_cols = [c for c in df.columns if c.startswith(("LL_", "RL_", "LP_", "RP_"))]

# # Debug: Print extracted frequency columns
# print("✅ Frequency Columns:", freq_cols)

# # If still empty, raise an error
# if not freq_cols:
#     raise ValueError("❌ No frequency columns found! Check column naming pattern.")


In [57]:
# Keep only the rows whose agreement pattern is "edge"
is_edge = train_group.pattern == "edge"
edge_df = train_group.where(is_edge)

edge_df.show()

+----------+----------+----------+-------------------+-------------------+-------------------+-------------------+------------------+----------+-------+------------------+-----------+-----------------+
|    eeg_id|   spec_id|patient_id|       seizure_vote|           lpd_vote|           gpd_vote|          lrda_vote|         grda_vote|other_vote|pattern|          max_vote|total_votes|       percentage|
+----------+----------+----------+-------------------+-------------------+-------------------+-------------------+------------------+----------+-------+------------------+-----------+-----------------+
|1443873668| 279961928|      5512|                0.0| 0.3333333333333333| 0.6666666666666666|                0.0|               0.0|       0.0|   edge|0.6666666666666666|        1.0|66.66666666666666|
|3500954630| 447919898|     57480| 0.4166666666666667|0.16666666666666666|                0.0| 0.4166666666666667|               0.0|       0.0|   edge|0.4166666666666667|        1.0|41.666666

In [1]:
# pprint is used at the end of this cell but is not imported anywhere in the
# visible part of the notebook, so import it here.
from pprint import pprint

# number of spectrograms for each category
N = 5

# One entry per vote column; each placeholder value is replaced in the loop below.
spec_dict = {
    "seizure_vote": 0,
    "lpd_vote": 0,
    "gpd_vote": 0,
    "lrda_vote": 0,
    "grda_vote": 0,
    "other_vote": 0
}

# NOTE(review): `train_group` must already exist when this cell runs. The code uses
# the pandas-style API (reset_index, sort_values, .loc) and the 'pattern' and
# 'spectrogram_id' columns, so it presumably expects the pandas-on-Spark frame
# built by the categorize_votes cell - confirm.
idealized_df = train_group[train_group.pattern=="idealized"].reset_index(drop=True)

for key in spec_dict:
    # Index labels of the N rows with the highest count for this vote column
    col_idx = idealized_df[key].sort_values(ascending=False).head(N).index
    spec_dict[key] = idealized_df.loc[col_idx, "spectrogram_id"].values


pprint(spec_dict)



NameError: name 'train_group' is not defined