In [None]:
# =========================================
# EEG Phoneme & Vowel/Consonant Analysis
# =========================================

# --- Imports ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc

# Apply seaborn's "whitegrid" style globally so every figure below shares it.
sns.set(style="whitegrid")

# --- Load Preprocessed Data ---
from pathlib import Path

# The notebook's working directory depends on where it is launched from, so
# look for the data folder one level up first and next to the CWD second.
# If neither location has the data, fall back to the first candidate so that
# pandas still raises the usual FileNotFoundError with a readable path.
_data_dir_candidates = (Path("../processed_data"), Path("processed_data"))
data_dir = next(
    (d for d in _data_dir_candidates if (d / "X.csv").is_file()),
    _data_dir_candidates[0],
)

# X: feature matrix, y: flat vector of numeric class ids (one per row of X).
X = pd.read_csv(data_dir / "X.csv").values
y = pd.read_csv(data_dir / "y.csv").values.flatten()

# labels.txt is a single comma-separated list; the position of each name is
# the numeric class id used in y.
with open(data_dir / "labels.txt", encoding="utf-8") as f:
    # Strip every entry so "a, e, i" and a trailing newline do not leave
    # whitespace inside label names. Empty entries are kept on purpose:
    # dropping them would shift the positions that y indexes into.
    phoneme_labels = [label.strip() for label in f.read().split(",")]

# A negative id would silently wrap around when indexing a Python list and
# pick the wrong phoneme, so reject out-of-range ids explicitly.
if y.size and (y.min() < 0 or y.max() >= len(phoneme_labels)):
    raise ValueError(
        f"y contains class ids outside 0..{len(phoneme_labels) - 1}; "
        "y.csv and labels.txt do not match"
    )

# Map numeric y -> phoneme labels
y_phonemes = [phoneme_labels[label] for label in y]

# --- Vowel vs Consonant ---
# Collapse each phoneme label into one of two coarse groups, then turn the
# group names into integer codes for scikit-learn.
vowel_set = set("aeiou")


def _phoneme_group(phoneme):
    """Return "vowel" when *phoneme* is in ``vowel_set``, else "consonant".

    The membership test is exact, so it is case-sensitive.
    """
    if phoneme in vowel_set:
        return "vowel"
    return "consonant"


y_grouped = list(map(_phoneme_group, y_phonemes))

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_grouped)
# classes_ holds the group names in the order of their integer codes.
class_names = label_encoder.classes_

# =========================================
# 1️⃣ Data Exploration
# =========================================

# --- a) Single phoneme distribution ---
# One bar per phoneme label, showing how many samples carry that label.
plt.figure(figsize=(12,4))
# The labels must be passed as the keyword ``x``: the first positional
# parameter of countplot is ``data``, so a bare list is not read as the
# variable to count.
sns.countplot(x=y_phonemes)
plt.xticks(rotation=90)
plt.title("Single Phoneme Distribution")
plt.show()

# --- b) Vowel vs Consonant distribution ---
# Two bars showing how the samples split between the coarse groups.
plt.figure(figsize=(6,4))
# Pass the labels as the keyword ``x``; countplot's first positional
# parameter is ``data``, not the variable to count.
sns.countplot(x=y_grouped)
plt.title("Vowel vs Consonant Distribution")
plt.show()

# =========================================
# 2️⃣ Feature Analysis
# =========================================

# --- PCA projection ---
# Project the feature matrix onto its first two principal components and
# colour each sample by its vowel/consonant class.
# random_state keeps the projection reproducible if scikit-learn picks a
# randomized SVD solver (same seed as the random forest below).
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

plt.figure(figsize=(8,6))
points = plt.scatter(X_pca[:,0], X_pca[:,1], c=y_encoded, cmap='coolwarm', alpha=0.6)
# Without a legend the two colours cannot be told apart. legend_elements()
# yields one handle per distinct encoded value in ascending order, which is
# the same order as class_names.
handles, _ = points.legend_elements()
plt.legend(handles, class_names, title="Class")
plt.title("PCA Projection of EEG Features (Vowel vs Consonant)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

# =========================================
# 3️⃣ Model Evaluation
# =========================================

# Here you should load your model predictions.
# For example, from train.py outputs, assume:
# - `y_pred_single` for single phoneme
# - `y_pred_vowel` for vowel/consonant
# If you don't have them saved, you can compute them with a model as below.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# Random forest used only for this analysis (optional if predictions exist).
rf = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)

# Scoring the model on the very samples it was fitted on reports training
# performance, which is optimistic. Use out-of-fold predictions instead:
# every sample is predicted by a model that never saw it during fitting.
# NOTE(review): if several rows come from the same trial or subject, a
# grouped splitter may be needed to avoid leakage between folds - confirm.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_proba = cross_val_predict(rf, X, y_encoded, cv=cv, method="predict_proba")

# Column 1 is the probability of encoded class 1; the arg-max column index is
# the predicted label because the encoded classes are 0 and 1.
y_pred_prob_vowel = cv_proba[:,1]
y_pred_vowel = cv_proba.argmax(axis=1)

# cross_val_predict works on clones, so fit `rf` itself on all data; it is
# needed afterwards for the feature importances.
rf.fit(X, y_encoded)

accuracy_vowel = accuracy_score(y_encoded, y_pred_vowel)

# --- Confusion Matrix (Vowel vs Consonant) ---
# Rows are the true classes, columns the predicted ones; the overall accuracy
# is shown in the title.
cm = confusion_matrix(y_encoded, y_pred_vowel)

fig_cm, ax_cm = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax_cm,
)
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("True")
ax_cm.set_title(f"Confusion Matrix (Vowel vs Consonant, Acc={accuracy_vowel:.2f})")
plt.show()

# --- Feature Importance ---
# Bar chart of the random forest's most important features, most important
# first.
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1][:20]  # top 20 features, descending importance
plt.figure(figsize=(12,4))
# seaborn sorts numeric category values, which would re-order the bars by
# feature index and hide the ranking. String labels keep the order given.
sns.barplot(x=[str(i) for i in indices], y=importances[indices])
plt.title("Top 20 Feature Importances (Random Forest)")
plt.xlabel("Feature Index")
plt.ylabel("Importance")
plt.show()

# --- ROC Curve ---
# Plot the ROC curve for the predicted class-1 probabilities, with the
# diagonal drawn as the chance-level reference.
fpr, tpr = roc_curve(y_encoded, y_pred_prob_vowel)[:2]
roc_auc = auc(fpr, tpr)

fig_roc, ax_roc = plt.subplots(figsize=(6, 4))
ax_roc.plot(fpr, tpr, label=f"ROC curve (area = {roc_auc:.2f})")
ax_roc.plot([0, 1], [0, 1], "k--")
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.set_title("ROC Curve: Vowel vs Consonant")
ax_roc.legend()
plt.show()

# --- Classification Report ---
# Print the overall accuracy followed by the per-class report.
report_text = classification_report(y_encoded, y_pred_vowel, target_names=class_names)
print(
    f"Accuracy (Vowel vs Consonant): {accuracy_vowel:.4f}",
    "\nClassification Report:",
    report_text,
    sep="\n",
)

# =========================================
# 4️⃣ Summary & Insights
# =========================================

# NOTE: these bullet points are fixed text; none of them is computed from the
# results produced above.
summary_points = (
    "Single phoneme prediction has low accuracy due to high class imbalance and limited samples per phoneme.",
    "Vowel vs Consonant grouping significantly improves accuracy and allows easier interpretation.",
    "PCA shows clustering trends between vowels and consonants.",
    "Feature importance highlights the most informative EEG channels or statistical features.",
    "Confusion matrix and ROC curves visualize model performance clearly.",
)

# The leading and trailing empty entries reproduce the blank line before the
# heading and the newline after the last bullet.
summary_text = "\n".join(
    ["", "Summary & Insights:", ""]
    + [f"- {point}" for point in summary_points]
    + [""]
)
print(summary_text)


FileNotFoundError: [Errno 2] No such file or directory: 'processed_data/X.csv'