In [2]:
# Install required packages with SciPy upgrade
# NOTE(review): the cell output captured further down in this file shows this
# unpinned upgrade replacing scipy 1.15.2 with 1.16.0, pip reporting dependency
# conflicts (gensim vs. numpy/scipy), and then a ModuleNotFoundError for
# 'array_api_compat'. Presumably the in-place upgrade left the running kernel
# in an inconsistent state -- confirm whether a pinned version or a kernel
# restart after installation is required.
!pip install pyriemann scikit-learn seaborn tqdm joblib
!pip install --upgrade scipy  # Fixes the ufunc error

import numpy as np
import pandas as pd
import os
import joblib
from scipy.linalg import sqrtm, inv
from pyriemann.estimation import Covariances
from pyriemann.utils.mean import mean_riemann
from pyriemann.tangentspace import tangent_space
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Configuration
BASE_PATH = '/kaggle/input/preprocessed/mtc-aic3_dataset_preprocessed'
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Load and filter data
print("Loading and filtering data...")
train_df = pd.read_csv(os.path.join(BASE_PATH, 'train.csv'))
val_df = pd.read_csv(os.path.join(BASE_PATH, 'validation.csv'))

# Filter for MI tasks only.
# .copy() detaches the filtered rows from the frames read above, so the
# 'set_type' column added below is written to independent DataFrames instead
# of to a slice (which triggers pandas' SettingWithCopyWarning).
train_df = train_df[train_df['task'] == 'MI'].copy()
val_df = val_df[val_df['task'] == 'MI'].copy()

print(f"Training set size (MI only): {len(train_df)}")
print(f"Validation set size (MI only): {len(val_df)}")

# Tag each row with its split so the trial loader can pick the right folder,
# then pool both splits into a single training table.
train_df['set_type'] = 'train'
val_df['set_type'] = 'validation'
train_val_df = pd.concat([train_df, val_df], ignore_index=True)
print(f"Combined training+validation size: {len(train_val_df)}")

# Label encoder (fitted on the labels of the pooled train+validation rows)
le = LabelEncoder()
le.fit(train_val_df['label'])

# Function to load EEG trial data with NaN handling using MEDIAN
def load_trial_data(row, base_path=None, samples_per_trial=375):
    """Load the samples of one trial and impute missing values per channel.

    Args:
        row: Mapping-like record (dict or pandas Series) providing
            'set_type', 'subject_id', 'trial_session' and 'trial'
            ('trial' is 1-based within the session file).
        base_path: Dataset root directory. Defaults to the module-level
            BASE_PATH when None.
        samples_per_trial: Number of consecutive CSV rows that make up one
            trial (375 by default, the value this script has always used).

    Returns:
        A new float ndarray with one row per time sample and one column per
        CSV column other than 'Time'. NaNs are replaced by the median of the
        same column within the trial; a column that is entirely NaN within
        the trial is filled with 0.0.

    Raises:
        ValueError: If row['set_type'] is not 'train', 'validation' or 'test'.
    """
    set_type = row['set_type']
    # 'test' must be accepted here: the inference part of the script tags its
    # rows with set_type='test' and calls this function.
    # NOTE(review): assumes the test split is stored under MI/test/ with the
    # same layout as MI/train/ and MI/validation/ -- confirm against the dataset.
    if set_type not in ('train', 'validation', 'test'):
        raise ValueError(f"Unknown set type: {set_type}")

    root = BASE_PATH if base_path is None else base_path
    eeg_path = f"{root}/MI/{set_type}/{row['subject_id']}/{row['trial_session']}/EEGdata.csv"

    # Load the session file and cut out this trial's rows
    eeg_data = pd.read_csv(eeg_path)
    start_idx = (row['trial'] - 1) * samples_per_trial
    end_idx = start_idx + samples_per_trial
    trial_data = eeg_data.iloc[start_idx:end_idx].drop(columns=['Time'])

    # Explicit float copy: the in-place imputation below must never write
    # into (or fail on a read-only view of) the DataFrame's own buffer.
    data = trial_data.to_numpy(dtype=float, copy=True)

    # Replace NaNs with channel MEDIAN (better for EEG data)
    missing = np.isnan(data)
    if missing.any():
        channel_medians = np.zeros(data.shape[1])
        # Only take medians of columns that contain at least one value; an
        # all-NaN column has no median, so it keeps the 0.0 fill value.
        has_values = ~missing.all(axis=0)
        if has_values.any():
            channel_medians[has_values] = np.nanmedian(data[:, has_values], axis=0)
        data[missing] = np.broadcast_to(channel_medians, data.shape)[missing]

    return data

# Load every train/validation trial (with a progress bar) and collect its label
print("\nLoading training/validation trials...")
X_train_val, y_train_val = [], []
trial_records = (record for _, record in train_val_df.iterrows())
for record in tqdm(trial_records, total=len(train_val_df)):
    trial = load_trial_data(record)

    # Safety net: if the loader's imputation still left NaNs behind,
    # report the trial and fall back to zero replacement.
    nans_remaining = np.isnan(trial).any()
    if nans_remaining:
        print(f"Warning: NaNs still present in trial {record['trial']} of session {record['subject_id']}/{record['trial_session']}")
        trial = np.nan_to_num(trial)

    X_train_val.append(trial)
    y_train_val.append(record['label'])

# Integer-encode the collected string labels with the already fitted encoder
y_train_val_encoded = le.transform(y_train_val)

# Compute covariance matrices
def _trial_covariance(estimator, trial):
    """Estimate the channel-by-channel covariance matrix of a single trial.

    Args:
        estimator: pyriemann ``Covariances`` instance.
        trial: Array with one row per time sample and one column per channel
            (the layout produced by ``load_trial_data``).

    Returns:
        Array of shape (n_channels, n_channels).
    """
    # Last-resort NaN guard before the covariance estimate
    if np.isnan(trial).any():
        trial = np.nan_to_num(trial)
    # pyriemann expects (n_matrices, n_channels, n_times). Trials are stored
    # sample-major, so they must be transposed; without the transpose the
    # estimator would treat every time sample as a "channel" and return an
    # (n_samples x n_samples) matrix.
    return estimator.fit_transform(trial.T[np.newaxis, :, :])[0]


def _align_to_reference(covs, ref_sqrt, ref_inv_sqrt):
    """Apply the script's ACM alignment to a stack of covariance matrices.

    Each matrix C is mapped to ``ref_sqrt @ sqrtm(ref_inv_sqrt @ C @
    ref_inv_sqrt) @ ref_sqrt``; only the real part is kept because ``sqrtm``
    may return a complex-typed result.

    Args:
        covs: Iterable of square covariance matrices.
        ref_sqrt: Matrix square root of the reference matrix.
        ref_inv_sqrt: Inverse of ``ref_sqrt``.

    Returns:
        ndarray stacking the aligned matrices in input order.
    """
    aligned = []
    for cov in tqdm(covs):
        whitened = ref_inv_sqrt @ cov @ ref_inv_sqrt
        half_step = np.real(sqrtm(whitened))
        aligned.append(np.real(ref_sqrt @ half_step @ ref_sqrt))
    return np.array(aligned)


print("\nComputing covariance matrices...")
cov_estimator = Covariances(estimator='lwf')
covs_train_val = np.array(
    [_trial_covariance(cov_estimator, x) for x in tqdm(X_train_val)]
)

# Compute Riemannian mean
print("\nCalculating Riemannian mean...")
M = mean_riemann(covs_train_val)

# Align covariance matrices using ACM
print("\nAligning covariance matrices (ACM)...")
# sqrtm(M) is computed once and inverted, instead of being computed twice
M_sqrt = np.real(sqrtm(M))
M_inv_sqrt = inv(M_sqrt)
covs_aligned = _align_to_reference(covs_train_val, M_sqrt, M_inv_sqrt)

# Map to tangent space
print("\nMapping to tangent space...")
features_train_val = tangent_space(covs_aligned, M)

# Train SVM model
print("\nTraining SVM classifier...")
svm = SVC(kernel='linear', C=1.0, random_state=RANDOM_STATE, probability=True)
svm.fit(features_train_val, y_train_val_encoded)

# Evaluate on training data
# NOTE: these metrics are computed on the very trials the SVM was fitted on
# (train + validation pooled), so they do not measure held-out performance.
print("\nEvaluating model...")
y_pred_encoded = svm.predict(features_train_val)
y_pred = le.inverse_transform(y_pred_encoded)

# Calculate metrics
f1 = f1_score(y_train_val, y_pred, average='weighted')
cls_report = classification_report(y_train_val, y_pred)
conf_matrix = confusion_matrix(y_train_val, y_pred)

print(f"\nF1 Score: {f1:.4f}")
print("\nClassification Report:")
print(cls_report)
print("\nConfusion Matrix:")
print(conf_matrix)

# Save training logs
with open('training_logs.txt', 'w') as f:
    f.write(f"F1 Score: {f1:.4f}\n\n")
    f.write("Classification Report:\n")
    f.write(cls_report + "\n\n")
    f.write("Confusion Matrix:\n")
    f.write(np.array2string(conf_matrix))

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')
plt.close()

# Process test data
test_df = pd.read_csv(os.path.join(BASE_PATH, 'test.csv'))
# .copy() so that adding 'set_type' does not write into a filtered slice
test_df = test_df[test_df['task'] == 'MI'].copy()  # Filter for MI tasks
test_df['set_type'] = 'test'

X_test = []
test_ids = []
print("\nLoading test trials...")
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    eeg_data = load_trial_data(row)

    # load_trial_data already imputes NaNs with the per-channel median;
    # repeating that here could not change anything, so any NaN still left
    # (e.g. a channel with no valid sample) goes straight to the zero fallback.
    if np.isnan(eeg_data).any():
        eeg_data = np.nan_to_num(eeg_data)

    X_test.append(eeg_data)
    test_ids.append(row['id'])

# Compute test covariances (same estimator and orientation as for training)
print("\nComputing test covariance matrices...")
covs_test = np.array(
    [_trial_covariance(cov_estimator, x) for x in tqdm(X_test)]
)

# Align test covariances against the reference learned from the training data
print("\nAligning test matrices (ACM)...")
covs_test_aligned = _align_to_reference(covs_test, M_sqrt, M_inv_sqrt)

# Map test data to tangent space
print("\nMapping test data to tangent space...")
features_test = tangent_space(covs_test_aligned, M)

# Generate predictions
print("\nGenerating test predictions...")
test_preds_encoded = svm.predict(features_test)
test_preds = le.inverse_transform(test_preds_encoded)

# Create submission file
submission_df = pd.DataFrame({'id': test_ids, 'label': test_preds})
submission_df.to_csv('submission.csv', index=False)
print("\nSubmission file created: submission.csv")

# Save model components
joblib.dump({
    'M': M,
    'cov_estimator': cov_estimator,
    'svm': svm,
    'label_encoder': le
}, 'model.joblib')
print("Model components saved: model.joblib")

Collecting scipy
  Downloading scipy-1.16.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.9/61.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.16.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (35.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.3/35.3 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.15.2
    Uninstalling scipy-1.15.2:
      Successfully uninstalled scipy-1.15.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.3.1 which is incompatible.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1

ModuleNotFoundError: No module named 'array_api_compat'