# EEG Analysis

In [37]:
import pandas as pd
import numpy as np

# Step 1: Load the dataset including the first three rows
raw_eeg = pd.read_csv('Data/In Data.csv', header=None, low_memory=False)

# Step 2: Extract the three header rows
header1 = raw_eeg.iloc[0]  # First row: FFT Bands
header2 = raw_eeg.iloc[1]  # Second row: Frequencies
header3 = raw_eeg.iloc[2]  # Third row: Time format note (mainly first column)

# Step 3: Build new headers
new_columns = []

for col in range(len(header1)):
    if col == 0:
        # First column is Time
        new_columns.append('Time')
    else:
        # Concatenate: Band + "_" + Frequency
        new_header = f"{header1[col]}_{header2[col]}"
        new_columns.append(new_header)

# Step 4: Assign the new headers & clean data
clean_eeg = raw_eeg.iloc[3:].copy()  # Drop the first three rows
clean_eeg.columns = new_columns      # Assign the new combined header
clean_eeg.reset_index(drop=True, inplace=True)  # Reset index

# Step 5: Convert signal columns to numeric
for col in clean_eeg.columns[1:]:  # Skip 'Time'
    clean_eeg[col] = pd.to_numeric(clean_eeg[col], errors='coerce')

# Final check
print(clean_eeg.info())
print(clean_eeg.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18638 entries, 0 to 18637
Data columns (total 41 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Time        18638 non-null  object 
 1   Delta_'F3'  18638 non-null  float64
 2   Delta_'Fz'  18638 non-null  float64
 3   Delta_'F4'  18638 non-null  float64
 4   Delta_'C3'  18638 non-null  float64
 5   Delta_'C4'  18638 non-null  float64
 6   Delta_'Pz'  18638 non-null  float64
 7   Delta_'O1'  18638 non-null  float64
 8   Delta_'O2'  18638 non-null  float64
 9   Theta_'F3'  18638 non-null  float64
 10  Theta_'Fz'  18638 non-null  float64
 11  Theta_'F4'  18638 non-null  float64
 12  Theta_'C3'  18638 non-null  float64
 13  Theta_'C4'  18638 non-null  float64
 14  Theta_'Pz'  18638 non-null  float64
 15  Theta_'O1'  18638 non-null  float64
 16  Theta_'O2'  18638 non-null  float64
 17  Alpha_'F3'  18638 non-null  float64
 18  Alpha_'Fz'  18638 non-null  float64
 19  Alpha_'F4'  18638 non-nul

In [38]:
clean_eeg.sample(10)

Unnamed: 0,Time,Delta_'F3',Delta_'Fz',Delta_'F4',Delta_'C3',Delta_'C4',Delta_'Pz',Delta_'O1',Delta_'O2',Theta_'F3',...,Beta_'O1',Beta_'O2',Gamma_'F3',Gamma_'Fz',Gamma_'F4',Gamma_'C3',Gamma_'C4',Gamma_'Pz',Gamma_'O1',Gamma_'O2'
4363,'[15:10:14.077 31/03/2023]',0.42,1.07,0.62,1.66,0.36,0.93,0.52,0.68,1.49,...,3.19,7.12,0.11,0.28,0.13,0.39,0.08,0.26,0.14,0.22
2125,'[15:09:36.782 31/03/2023]',0.33,0.68,0.56,0.66,0.42,0.63,0.31,1.12,1.14,...,1.91,11.25,0.08,0.27,0.12,0.1,0.11,0.15,0.09,0.44
5240,'[15:10:28.693 31/03/2023]',0.58,2.12,0.65,1.25,0.61,2.16,0.56,1.35,2.02,...,1.94,6.62,0.1,0.83,0.16,0.24,0.1,0.71,0.12,0.26
5845,'[15:10:38.775 31/03/2023]',0.46,1.37,0.73,1.69,0.48,1.51,0.91,2.41,2.36,...,1.64,22.02,0.1,0.42,0.19,0.36,0.09,0.57,0.12,0.61
6254,'[15:10:45.592 31/03/2023]',0.17,0.2,0.18,0.3,0.17,0.13,0.28,0.28,0.63,...,3.6,1.53,0.11,0.11,0.09,0.17,0.09,0.12,0.16,0.12
3561,'[15:10:00.713 31/03/2023]',0.31,1.35,0.52,1.45,0.26,2.25,0.28,0.7,0.81,...,3.36,9.54,0.08,0.44,0.12,0.4,0.07,0.65,0.15,0.49
7221,'[15:11:01.707 31/03/2023]',0.43,1.01,1.59,1.65,0.52,1.22,1.5,1.67,1.37,...,6.65,7.34,0.13,0.26,0.56,0.56,0.12,0.3,0.46,0.43
10896,'[15:12:02.950 31/03/2023]',0.21,1.28,0.46,1.81,0.14,0.71,0.18,2.17,1.15,...,1.29,7.09,0.12,0.47,0.18,0.49,0.13,0.42,0.14,0.51
17412,'[15:13:51.540 31/03/2023]',0.3,0.62,0.63,1.07,0.33,0.39,0.38,1.05,2.08,...,2.19,8.1,0.08,0.14,0.17,0.22,0.1,0.14,0.15,0.27
1790,'[15:09:31.198 31/03/2023]',0.35,0.35,0.29,0.41,0.32,0.38,0.42,0.43,0.6,...,6.18,5.05,0.14,0.11,0.11,0.13,0.11,0.14,0.18,0.15


Handling Time Column

In [42]:
# Step 5: Clean the Time column (remove square brackets and strip spaces)
clean_eeg['Time'] = (
    clean_eeg['Time']
    .str.replace(r'[\[\]\']', '', regex=True)          # Remove [ ] and ' characters
    .str.replace(r'\s*\d{2}/\d{2}/\d{4}', '', regex=True)  # Remove date (e.g., 31/03/2023)
    .str.strip()                                       # Remove any remaining whitespace
)

clean_eeg.sample(20)

Unnamed: 0,Time,Delta_'F3',Delta_'Fz',Delta_'F4',Delta_'C3',Delta_'C4',Delta_'Pz',Delta_'O1',Delta_'O2',Theta_'F3',...,Beta_'O1',Beta_'O2',Gamma_'F3',Gamma_'Fz',Gamma_'F4',Gamma_'C3',Gamma_'C4',Gamma_'Pz',Gamma_'O1',Gamma_'O2'
12391,15:12:27.864,0.55,1.47,1.3,1.97,0.66,1.47,0.83,1.61,2.05,...,2.72,7.09,0.12,0.5,0.27,0.48,0.08,0.39,0.14,0.58
10697,15:11:59.634,0.41,0.79,0.89,1.68,0.39,1.09,0.6,1.49,2.76,...,4.15,13.16,0.13,0.24,0.28,0.43,0.13,0.29,0.16,0.45
3558,15:10:00.663,0.31,1.22,0.52,1.14,0.27,2.07,0.32,0.7,0.84,...,3.51,8.87,0.08,0.4,0.12,0.39,0.07,0.61,0.15,0.49
1338,15:09:23.666,0.24,0.27,0.24,0.55,0.27,0.3,0.33,0.62,0.78,...,5.04,4.26,0.11,0.12,0.1,0.17,0.12,0.12,0.12,0.2
64,15:09:02.436,1.57,3.98,5.48,6.46,1.18,4.91,4.85,2.47,7.1,...,19.77,22.66,0.27,1.02,1.58,1.33,0.31,0.91,1.39,1.02
10227,15:11:51.802,0.96,1.17,2.08,4.65,1.0,1.74,1.23,3.41,5.65,...,9.64,5.16,0.19,0.65,0.41,0.72,0.17,0.58,0.28,0.43
15821,15:13:25.025,0.83,2.52,1.44,1.6,0.98,1.54,1.02,2.51,2.54,...,4.71,11.33,0.19,0.96,0.2,0.42,0.23,0.21,0.18,0.54
12572,15:12:30.881,0.37,0.93,0.56,1.96,0.2,0.81,0.87,1.76,0.99,...,4.22,8.12,0.1,0.95,0.15,0.48,0.1,0.64,0.27,0.33
8395,15:11:21.272,0.13,0.38,0.39,0.69,0.12,0.27,0.24,0.52,1.03,...,2.02,2.81,0.13,0.2,0.1,0.21,0.1,0.21,0.17,0.18
17055,15:13:45.590,0.62,1.46,0.83,2.44,0.54,1.06,0.56,2.66,2.32,...,2.08,8.05,0.12,0.43,0.33,0.29,0.13,0.24,0.21,0.39


#### Determining Sampling Frequency useful in EDA for EEG Analysis

In [43]:
# Convert Time back to timedelta in seconds for sampling estimation
time_series = pd.to_datetime(clean_eeg['Time'].astype(str), format='%H:%M:%S.%f', errors='coerce')

# Calculate average time difference in seconds
time_diffs = time_series.diff().dropna().dt.total_seconds()
sfreq = round(1 / time_diffs.mean(), 2)  # Average samples per second

print(f"Detected Sampling Frequency: {sfreq} Hz")

Detected Sampling Frequency: 60.01 Hz


In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mne import create_info, EpochsArray
from mne.time_frequency import tfr_multitaper
from mne.viz import plot_topomap
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# Summary statistics
print("Summary statistics for each frequency band:")
display(clean_eeg.groupby('Band')['Amplitude'].describe())

# Plot distribution of amplitudes by band
plt.figure(figsize=(12, 6))
sns.boxplot(x='Time', y='Amplitude', data=clean_eeg)
plt.title('Distribution of Amplitude Values by Frequency Band')
plt.xticks(rotation=45)
plt.show()

Summary statistics for each frequency band:


KeyError: 'Band'

In [44]:
import mne
# Prepare for MNE: Extract EEG data & metadata
channel_names = clean_eeg.columns[1:].tolist()     # All columns except 'Time'
channel_types = ['eeg'] * len(channel_names)

# Transpose EEG data: shape must be [n_channels, n_samples]
eeg_values = clean_eeg.iloc[:, 1:].T.values

# Create MNE Info and RawArray
info = mne.create_info(ch_names=channel_names, sfreq=sfreq, ch_types=channel_types)
raw = mne.io.RawArray(eeg_values, info)

# Plot EEG Overview
raw.plot(n_channels=min(20, len(channel_names)), scalings='auto', title='EEG Raw Data Overview')

Creating RawArray with float64 data, n_channels=40, n_times=18638
    Range : 0 ... 18637 =      0.000 ...   310.565 secs
Ready.


<mne_qt_browser._pg_figure.MNEQtBrowser at 0x2d71398fd10>

In [10]:
import pandas as pd
import numpy as np
import mne

# Load and clean your EEG data
raw_eeg_data = pd.read_csv('Data/In Data.csv')
numeric_eeg_data = raw_eeg_data.drop(index=[0,1]).reset_index(drop=True)

for col in numeric_eeg_data.columns:
    if col != "'FFT Bands'":
        numeric_eeg_data[col] = pd.to_numeric(numeric_eeg_data[col], errors='coerce')

clean_eeg_data = numeric_eeg_data.dropna().reset_index(drop=True)

# Prepare data for MNE
channel_names = clean_eeg_data.columns[1:].tolist()  # skip 'FFT Bands'
channel_types = ['eeg'] * len(channel_names)
eeg_values = clean_eeg_data.iloc[:, 1:].T.values

# Sampling frequency assumption (adjust to match your data if you know it!)
sfreq = 128.0  

# Create MNE Info & RawArray
info = mne.create_info(ch_names=channel_names, sfreq=sfreq, ch_types=channel_types)
raw = mne.io.RawArray(eeg_values, info)

# Plot EEG overview
raw.plot(n_channels=20, scalings='auto', title='EEG Raw Data Overview')


  raw_eeg_data = pd.read_csv('Data/In Data.csv')


Creating RawArray with float64 data, n_channels=40, n_times=18638
    Range : 0 ... 18637 =      0.000 ...   145.602 secs
Ready.


<mne_qt_browser._pg_figure.MNEQtBrowser at 0x2d7013af1d0>

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'clean_eeg_data' from previous step
# Convert timestamp to datetime if needed
clean_eeg_data['Time'] = pd.to_datetime(clean_eeg_data["'FFT Bands'"].str.replace('[\[\]]', '', regex=True), errors='coerce')

# Plot: Histogram for each band type
band_prefixes = ['Delta', 'Theta', 'Alpha', 'Beta', 'Gamma']

plt.figure(figsize=(16, 8))
for idx, band in enumerate(band_prefixes):
    plt.subplot(2, 3, idx + 1)
    band_columns = [col for col in clean_eeg_data.columns if col.startswith(band)]
    sns.histplot(clean_eeg_data[band_columns].values.flatten(), bins=100, kde=True)
    plt.title(f'{band} Distribution')
plt.tight_layout()
plt.show()

# Correlation Matrix
plt.figure(figsize=(12, 10))
corr = clean_eeg_data.iloc[:, 1:].corr()  # skip timestamp
sns.heatmap(corr, cmap='coolwarm', center=0, square=True)
plt.title('Correlation Matrix of EEG Features')
plt.show()

# Time Series Plot for selected bands
plt.figure(figsize=(14, 6))
sample_columns = ['Alpha', 'Beta', 'Theta', 'Gamma', 'Delta']
for band in sample_columns:
    band_cols = [col for col in clean_eeg_data.columns if col.startswith(band)]
    plt.plot(clean_eeg_data['Time'], clean_eeg_data[band_cols].mean(axis=1), label=band)
plt.legend()
plt.xlabel('Time')
plt.ylabel('Average Band Power')
plt.title('EEG Band Power Over Time')
plt.show()

# Boxplots: Detect Outliers
plt.figure(figsize=(16, 8))
sns.boxplot(data=clean_eeg_data.iloc[:, 1:], orient='h')
plt.title('EEG Feature Distribution & Outliers')
plt.show()


  clean_eeg_data['Time'] = pd.to_datetime(clean_eeg_data["'FFT Bands'"].str.replace('[\[\]]', '', regex=True), errors='coerce')
  clean_eeg_data['Time'] = pd.to_datetime(clean_eeg_data["'FFT Bands'"].str.replace('[\[\]]', '', regex=True), errors='coerce')


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import pandas as pd

# Make sure your 'Time' column is properly converted to datetime first!
clean_eeg_data['Time'] = pd.to_datetime(clean_eeg_data['Time'], errors='coerce')

# Drop rows with invalid timestamps
clean_eeg_data = clean_eeg_data.dropna(subset=['Time']).reset_index(drop=True)

# Standardize EEG data (skip timestamp column)
X = clean_eeg_data.iloc[:, 1:].drop(columns=['Cluster'], errors='ignore').values  # exclude 'Cluster' if rerunning
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dimensionality Reduction with PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Plot PCA Components
plt.figure(figsize=(8,6))
plt.scatter(X_pca[:,0], X_pca[:,1], alpha=0.4)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('EEG Signal: PCA Projection')
plt.show()

# Apply KMeans Clustering (let’s assume 3 cognitive states as a start)
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Plot Clusters
plt.figure(figsize=(8,6))
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=clusters, palette='tab10')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('EEG Brain Activity Clusters')
plt.legend(title='Cluster')
plt.show()

# Assign clusters to original data
clean_eeg_data['Cluster'] = clusters

# Plot Clustered Brain Activity Over Time
plt.figure(figsize=(12, 6))
plt.plot(clean_eeg_data['Time'], clean_eeg_data['Cluster'], linestyle='-', marker='.', alpha=0.7)
plt.xlabel('Time')
plt.ylabel('Detected Brain State Cluster')
plt.title('EEG Brain State Evolution Over Time')

# Format the x-axis to display readable time
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))
plt.gcf().autofmt_xdate()
plt.show()

TypeError: float() argument must be a string or a real number, not 'Timestamp'