In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statistics
import os
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Set CUDA_VISIBLE_DEVICES so that only devices 0 and 1 are visible to CUDA libraries.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
# Not referenced anywhere else in the visible notebook; presumably a version tag — TODO confirm.
VER = 2

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Competition tables: training labels, test metadata and the submission template.
df = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')
test = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/test.csv')
sub = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/sample_submission.csv')

# The last six columns of train.csv are treated as the target columns.
TARGETS = df.columns[-6:]
# True: read every training parquet listed in `files`; False: load one pre-built .npy dictionary instead.
READ_SPEC_FILES = False

# List the training spectrogram parquet files (nothing is read here; only the file names are collected).
PATH = '/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/'
files = os.listdir(PATH)
print(f'There are {len(files)} spectrogram parquets')

ModuleNotFoundError: No module named 'pandas'

In [None]:
# (rows, columns) of the frame read from train.csv.
df.shape

In [None]:
# Preview the first five rows of the frame read from train.csv.
df.head(5)

In [None]:
# Collapse the label table to one row per eeg_id. For every recording keep:
#   spec_id    - the first spectrogram_id listed for it
#   min / max  - the smallest / largest spectrogram_label_offset_seconds
#   patient_id - the first patient_id listed for it
by_eeg = df.groupby('eeg_id')
train = by_eeg.agg(
    spec_id=('spectrogram_id', 'first'),
    min=('spectrogram_label_offset_seconds', 'min'),
    max=('spectrogram_label_offset_seconds', 'max'),
    patient_id=('patient_id', 'first'),
)

# Sum the vote columns over all rows of a recording, then divide each row by
# its own total so the six target columns add up to 1 per eeg_id.
y_data = by_eeg[TARGETS].agg('sum').values
y_data = y_data / y_data.sum(axis=1,keepdims=True)
for position, t in enumerate(TARGETS):
    train[t] = y_data[:, position]

# Hard label: the first expert_consensus value listed for the recording.
train['target'] = by_eeg['expert_consensus'].agg('first')

train = train.reset_index()
print('Train non-overlapp eeg_id shape:', train.shape )
train.head()

In [None]:
# Build `spectrograms`: a dict keyed by the integer spectrogram id whose values
# are 2-D arrays of spectrogram values.
if not READ_SPEC_FILES:
    # Fast path: one pre-built dictionary stored as a pickled .npy object.
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load this file from a trusted source.
    spectrograms = np.load('/kaggle/input/brain-spectrograms/specs.npy',allow_pickle=True).item()
else:
    spectrograms = {}
    for i, f in enumerate(files):
        # progress marker every 100 files
        if i % 100 == 0:
            print(i,', ',end='')
        # the file stem is the spectrogram id; the first parquet column is dropped
        spec_key = int(f.split('.')[0])
        spectrograms[spec_key] = pd.read_parquet(f'{PATH}{f}').iloc[:, 1:].values

In [None]:
FEATURE_ENGINEER = True

# FEATURE NAMES: four groups (10-minute mean/min, 20-second mean/min), each with
# one name per spectrogram column (every parquet column except the first).
SPEC_COLS = pd.read_parquet(f'{PATH}1000086677.parquet').columns[1:]
FEATURES = (
    [f'{c}_mean_10m' for c in SPEC_COLS]
    + [f'{c}_min_10m' for c in SPEC_COLS]
    + [f'{c}_mean_20s' for c in SPEC_COLS]
    + [f'{c}_min_20s' for c in SPEC_COLS]
)
print(f'We are creating {len(FEATURES)} features for {len(train)} rows... ',end='')

if not FEATURE_ENGINEER:
    # Use the pre-computed feature table instead of rebuilding it.
    train = pd.read_parquet('/kaggle/input/brain-spectrograms/train.pqt')
else:
    data = np.zeros((len(train),len(FEATURES)))
    for k in range(len(train)):
        if k%100==0: print(k,', ',end='')
        row = train.iloc[k]
        # first spectrogram row of the window; presumably (min+max)/2 seconds
        # converted to rows at 2 s per row — TODO confirm
        r = int( (row['min'] + row['max'])//4 )

        spec_values = spectrograms[row.spec_id]
        window_10m = spec_values[r:r+300,:]      # 300 rows: the 10 minute window
        window_20s = spec_values[r+145:r+155,:]  # 10 rows in the middle: the 20 second window

        # NaN-aware mean and min per spectrogram column, for both windows
        data[k,:400] = np.nanmean(window_10m,axis=0)
        data[k,400:800] = np.nanmin(window_10m,axis=0)
        data[k,800:1200] = np.nanmean(window_20s,axis=0)
        data[k,1200:1600] = np.nanmin(window_20s,axis=0)

    train[FEATURES] = data
print()
print('New train shape:',train.shape)

In [None]:
# Preview the first two rows of `train`, including the feature columns.
train.head(2)

In [None]:
# Look only at the first 12 columns of `train` (positional slice), i.e. the
# leading columns before the bulk of the feature columns.
selected_columns = train.iloc[:, :12]
selected_columns.head(2)

In [None]:
from typing import Optional
import pandas as pd
import pandas.api.types

class ParticipantVisibleError(Exception):
    """Raised by the metric code when the solution or submission input is invalid."""

def kl_divergence(solution: pd.DataFrame, submission: pd.DataFrame, epsilon: float, micro_average: bool, sample_weights: Optional[pd.Series]):
    """Compute the KL divergence of `submission` from `solution`.

    Rows of the two frames are paired by position, not by index label, so the
    frames may carry different indexes as long as they have the same length.
    Neither input frame is modified.

    Args:
        solution: True values, one column per class.
        submission: Predicted values; must contain every column of `solution`
            (extra columns are ignored).
        epsilon: Predictions are clipped to ``[epsilon, 1 - epsilon]`` before
            the logarithm is taken.
        micro_average: If True, return the (optionally weighted) average over
            rows of the per-row divergence. If False, return the average over
            columns of the per-column mean.
        sample_weights: Optional per-row weights; only used when
            `micro_average` is True.

    Returns:
        The divergence as a float.

    Raises:
        ParticipantVisibleError: If a column of `solution`, or of the clipped
            `submission`, contains a non-finite value.
    """
    columns = list(solution.columns)
    # Per-cell contributions are collected in a fresh array, so the caller's
    # frames are never written to and repeated calls give the same answer.
    divergence = np.zeros((len(solution), len(columns)), dtype=float)

    for position, col in enumerate(columns):
        y_true = solution[col].to_numpy(dtype=float)
        y_pred = np.clip(submission[col].to_numpy(dtype=float), epsilon, 1 - epsilon)

        # Check for non-finite values in both inputs
        if not np.isfinite(y_true).all() or not np.isfinite(y_pred).all():
            raise ParticipantVisibleError(f"Non-finite values detected in column {col}")

        # Cells where the true value is 0 contribute 0; they keep the zero
        # the array was initialised with.
        nonzero = y_true != 0
        divergence[nonzero, position] = y_true[nonzero] * np.log(y_true[nonzero] / y_pred[nonzero])

    if micro_average:
        return np.average(divergence.sum(axis=1), weights=sample_weights)
    return np.average(divergence.mean(axis=0))

def score(solution: pd.DataFrame, submission: pd.DataFrame, epsilon: float=10**-15, micro_average: bool=True, sample_weights_column_name: Optional[str]=None) -> float:
    """Validate the inputs and return the KL divergence of `submission` from `solution`.

    Args:
        solution: True values, one column per class, plus optionally the
            sample-weight column named by `sample_weights_column_name`.
        submission: Predicted values; must contain every class column of
            `solution`.
        epsilon: Clipping bound forwarded to `kl_divergence`.
        micro_average: Averaging mode forwarded to `kl_divergence`.
        sample_weights_column_name: Name of a column in `solution` holding
            per-row weights. Only allowed when `micro_average` is True.

    Returns:
        The value computed by `kl_divergence`.

    Raises:
        ParticipantVisibleError: If the weight column is missing from
            `solution`, if weights are requested with `micro_average=False`,
            or if `submission` lacks one of the class columns.
    """
    sample_weights = None
    if sample_weights_column_name:
        if sample_weights_column_name not in solution.columns:
            raise ParticipantVisibleError(f'{sample_weights_column_name} not found in solution columns')
        if not micro_average:
            raise ParticipantVisibleError('Sample weights are only valid if `micro_average` is `True`')
        # All argument checks are done before the weights are split off, and
        # the split works on a new frame: the weight column stays in the
        # caller's `solution`, so the same frame can be scored again.
        sample_weights = solution[sample_weights_column_name]
        solution = solution.drop(columns=[sample_weights_column_name])
    for col in solution.columns:
        if col not in submission.columns:
            raise ParticipantVisibleError(f'Missing submission column {col}')
    return kl_divergence(solution, submission, epsilon, micro_average, sample_weights)

In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import log_loss

#### CATBOOST PARAMETERS ####
params = dict(
    verbose=False,
    random_seed=42,
    iterations=2000,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    task_type='GPU',
)

# Label column and the integer id given to each of its values.
target = 'target'
classes = {'Seizure':0, 'LPD':1, 'GPD':2, 'LRDA':3, 'GRDA':4, 'Other':5}

# Feature matrix: `train` without ids, window bookkeeping, vote columns and the label.
X = train.drop(columns=[
    'eeg_id', 'spec_id', 'patient_id',
    'min', 'max',
    'seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote',
    'target',
])
feature_names = X.columns.tolist()

# Labels, both as a plain array and as a Series sharing X's index.
y = train[target].values
y_series = pd.Series(y, index=X.index)

# gkf = GroupKFold(n_splits=5)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Accumulators filled in by the cross-validation loop.
fold_num = 1
models = []
kl_divergence_scores_train, kl_divergence_scores_valid = [], []
feature_importances = np.zeros(len(feature_names))

# Cross-validated training: one CatBoost model per fold, scored with the KL
# divergence between one-hot encoded labels and predicted class probabilities.
# NOTE(review): folds are stratified by label only, so rows of one patient_id
# can fall on both sides of a split; the same validation fold is also passed
# as eval_set for early stopping, which may make its score optimistic.
class_ids = sorted(classes.values())

# Alternative splitter kept for reference:
# for train_index, valid_index in gkf.split(X, y_series, groups=train['patient_id']):
for train_index, valid_index in skf.split(X, train['target']):
    xtrain, xvalid = X.iloc[train_index], X.iloc[valid_index]
    # split() yields positional indices, so the labels are selected by
    # position too; plain [] would be a label lookup on the Series index.
    ytrain, yvalid = y_series.iloc[train_index], y_series.iloc[valid_index]
    ytrain_ids, yvalid_ids = ytrain.map(classes), yvalid.map(classes)

    train_pool = Pool(xtrain, ytrain_ids, feature_names=feature_names)
    valid_pool = Pool(xvalid, yvalid_ids, feature_names=feature_names)

    cat_model = CatBoostClassifier(**params)
    cat_model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=200)
    models.append(cat_model)

    # summed here, divided by the number of folds after the loop
    feature_importances += cat_model.get_feature_importance()

    # One-hot encode the true labels; reindex puts the columns in class-id
    # order and adds an all-zero column for any class absent from the fold.
    ytrain_encoded = pd.get_dummies(ytrain_ids).reindex(columns=class_ids, fill_value=0)
    yvalid_encoded = pd.get_dummies(yvalid_ids).reindex(columns=class_ids, fill_value=0)

    # Predicted probabilities with the same column labels as the encoded truth
    train_preds_df = pd.DataFrame(cat_model.predict_proba(xtrain), columns=class_ids)
    valid_preds_df = pd.DataFrame(cat_model.predict_proba(xvalid), columns=class_ids)

    # Calculate KL Divergence
    kl_div_train = score(ytrain_encoded, train_preds_df, epsilon=10**-15, micro_average=True)
    kl_div_valid = score(yvalid_encoded, valid_preds_df, epsilon=10**-15, micro_average=True)

    kl_divergence_scores_train.append(kl_div_train)
    kl_divergence_scores_valid.append(kl_div_valid)

    print(f"Fold {fold_num} - Training KL Divergence: {kl_div_train:.4f}, Validation KL Divergence: {kl_div_valid:.4f}")
    fold_num += 1

# Mean of the per-fold scores
overall_kl_div_train = np.mean(kl_divergence_scores_train)
overall_kl_div_valid = np.mean(kl_divergence_scores_valid)

print(f"\nOverall KL Divergence for Training: {overall_kl_div_train:.4f}, Validation: {overall_kl_div_valid:.4f}")

# feature_importances holds the sum over folds; divide by the fold count to get the mean.
# (with the GroupKFold splitter this would be gkf.get_n_splits())
feature_importances /= skf.get_n_splits()

# The 20 features with the highest mean importance, largest first
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
    .head(20)
)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=importance_df, y='Feature', x='Importance', palette='GnBu_r', ax=ax)
ax.set_title('Top 20 Most Important Features')
plt.show()

In [None]:
# Fold 1 - Training KL Divergence: 0.1964, Validation KL Divergence: 0.7484
# Fold 2 - Training KL Divergence: 0.2075, Validation KL Divergence: 0.7401
# Fold 3 - Training KL Divergence: 0.1525, Validation KL Divergence: 0.7860
# Fold 4 - Training KL Divergence: 0.1458, Validation KL Divergence: 0.7348
# Fold 5 - Training KL Divergence: 0.1816, Validation KL Divergence: 0.7741
# Fold 6 - Training KL Divergence: 0.1672, Validation KL Divergence: 0.7503
# Fold 7 - Training KL Divergence: 0.1778, Validation KL Divergence: 0.7313
# Fold 8 - Training KL Divergence: 0.1925, Validation KL Divergence: 0.7695
# Fold 9 - Training KL Divergence: 0.1501, Validation KL Divergence: 0.7411
# Fold 10 - Training KL Divergence: 0.1565, Validation KL Divergence: 0.7917

# Overall KL Divergence for Training: 0.1728, Validation: 0.7567

In [None]:
# FEATURE ENGINEER TEST
# Build the same four feature groups for every test row, reading each test
# spectrogram parquet from disk.
PATH2 = '/kaggle/input/hms-harmful-brain-activity-classification/test_spectrograms/'
data = np.zeros((len(test),len(feature_names)))

for k, spec_id in enumerate(test['spectrogram_id']):
    s = int( spec_id )
    spec = pd.read_parquet(f'{PATH2}{s}.parquet')

    # every column except the first, as one array shared by all four feature groups
    spec_values = spec.iloc[:,1:].values
    center_rows = spec_values[145:155,:]

    # 10 MINUTE WINDOW FEATURES (all rows of the file)
    data[k,:400] = np.nanmean( spec_values, axis=0)
    data[k,400:800] = np.nanmin( spec_values, axis=0)

    # 20 SECOND WINDOW FEATURES (rows 145-154)
    data[k,800:1200] = np.nanmean( center_rows, axis=0)
    data[k,1200:1600] = np.nanmin( center_rows, axis=0)

test[FEATURES] = data
print('New test shape',test.shape)

In [None]:
# Average the class probabilities predicted by every fold model on the test rows.
num_classes = len(classes)
columns_order = X.columns.tolist()

# Select the model inputs into a separate frame, in training column order.
# `test` itself is left untouched so the cells that build it and this cell
# can each be re-run without the id columns having been dropped.
test_features = test[columns_order]

test_preds_cat = np.zeros((test_features.shape[0], num_classes))

for cat_model in models:
    test_preds_cat += cat_model.predict_proba(test_features)

test_preds_cat /= len(models)

In [None]:
# Submission vote columns, listed in the same order as the class ids 0..5,
# so column i of test_preds_cat lands in the i-th name below.
columns_to_fill = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
sub[columns_to_fill] = test_preds_cat

In [None]:
# Display the submission frame.
sub

In [None]:
# SANITY CHECK TO CONFIRM PREDICTIONS SUM TO ONE
# Row-wise sum over the last six columns of `sub`; every value shown should be ~1.0.
sub.iloc[:,-6:].sum(axis=1)

In [None]:
# Write the submission frame to submission.csv (relative path), without the index column.
sub.to_csv('submission.csv', index=False)