In [None]:
# Install required packages in Colab
!pip install opencv-python scikit-learn numpy pandas matplotlib seaborn

In [None]:
import os
import pickle

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import files
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Step 1: Define Feature Extraction (Must Match Backend)

This function must extract features EXACTLY as the backend does.

In [None]:
def extract_face_features(image_path):
    """
    Extract face features using OpenCV - MUST MATCH backend preprocessing.

    The image is read, converted to grayscale and resized to 48x48. The
    returned vector is the concatenation, in this order, of:
      * the 2304 resized pixel values divided by 255,
      * the first 64 bins of a 256-bin intensity histogram normalized to
        sum to 1 (i.e. intensities 0-63 only, not a re-binned histogram),
      * mean and standard deviation of the scaled pixels, and the fraction
        of pixels marked as edges by Canny(50, 150).

    The layout is intended to match the backend's preprocess_image
    function, so it must not be changed here without changing the backend.

    Args:
        image_path: Path of the image file to read.

    Returns:
        1-D numpy array of 2371 features (2304 + 64 + 3).

    Raises:
        ValueError: If the file cannot be read or decoded as an image.
    """
    # cv2.imread signals failure by returning None instead of raising
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Resize to standard size (48x48)
    resized = cv2.resize(gray, (48, 48))

    # Normalize pixel values
    normalized = resized / 255.0

    # Extract histogram features (256 bins over the intensity range [0, 256))
    hist = cv2.calcHist([resized], [0], None, [256], [0, 256])
    hist = hist.flatten() / hist.sum()  # Normalize histogram

    # Calculate statistical features
    mean_val = np.mean(normalized)
    std_val = np.std(normalized)

    # Edge detection features
    edges = cv2.Canny(resized, 50, 150)
    edge_density = np.sum(edges > 0) / edges.size

    # Combine all features into a single vector (MUST MATCH BACKEND)
    features = np.concatenate([
        normalized.flatten(),  # Pixel values (2304 features)
        hist[:64],  # First 64 of the 256 histogram bins (64 features)
        [mean_val, std_val, edge_density]  # Statistical features (3 features)
    ])

    return features

## Step 2: Load Your Dataset

**Option A: Upload your own labeled images**
- Create folders: `stressed/` and `not_stressed/`
- Put face images in respective folders
- Upload to Colab

**Option B: Use a public dataset (e.g., FER-2013)**
- Map emotions to stress/no-stress
- Angry, Sad, Fear → Stressed (label 1)
- Happy, Neutral, Surprise → Not Stressed (label 0)

In [None]:
# Example: Load images from folders
# Adjust paths based on your dataset structure

def load_dataset_from_folders(stressed_folder, not_stressed_folder):
    """
    Load images from the two class folders and extract features.

    Files are selected by extension (.jpg, .jpeg, .png, case-insensitive)
    and visited in sorted filename order so the resulting arrays are the
    same on every run. Images whose feature extraction raises are reported
    and skipped.

    Args:
        stressed_folder: Directory of images labelled stressed (1).
        not_stressed_folder: Directory of images labelled not stressed (0).

    Returns:
        Tuple (X, y) of numpy arrays: one feature row per loaded image
        (stressed images first) and the matching 0/1 labels.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    # (folder, label, description used in the progress message)
    class_folders = [
        (stressed_folder, 1, "stressed"),
        (not_stressed_folder, 0, "not stressed"),
    ]

    X = []
    y = []
    for folder, label, description in class_folders:
        print(f"Loading {description} images...")
        for filename in sorted(os.listdir(folder)):
            # Compare in lower case so files such as IMG_01.JPG are not skipped
            if not filename.lower().endswith(image_extensions):
                continue
            try:
                features = extract_face_features(os.path.join(folder, filename))
            except Exception as e:
                # Best effort: one unreadable image should not abort the load
                print(f"Error processing {filename}: {e}")
                continue
            X.append(features)
            y.append(label)

    return np.array(X), np.array(y)

# Load your dataset
# X, y = load_dataset_from_folders('path/to/stressed', 'path/to/not_stressed')

# OR for quick testing, create synthetic data (REPLACE WITH REAL DATA FOR PRODUCTION)
print("Creating synthetic training data for demonstration...")
print("⚠️ REPLACE THIS WITH REAL LABELED FACE IMAGES FOR PRODUCTION!")
n_samples = 500
# Must match the feature vector size: 48*48 pixels + 64 histogram bins + 3 statistics
n_features = 48 * 48 + 64 + 3
# Seeded generator so the synthetic data is identical on every run
rng = np.random.default_rng(42)
X = rng.standard_normal((n_samples, n_features))
y = rng.integers(0, 2, n_samples)  # random 0/1 labels, unrelated to X

print(f"Dataset loaded: {len(X)} samples, {X.shape[1]} features")
print(f"Class distribution: Stressed={np.sum(y==1)}, Not Stressed={np.sum(y==0)}")

## Step 3: Prepare Training Data

In [None]:
# Hold out 20% of the samples for testing; stratify=y keeps the class ratio
# the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

# Learn the scaling statistics from the training split only, then apply
# them to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Step 4: Train Model

In [None]:
# Train Random Forest Classifier
print("Training Random Forest model...")
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train_scaled, y_train)
print("✓ Model training complete!")

# Cross-validation (5 folds) on the training split only; the test split is
# not touched here.
# NOTE: the scaler was fitted on the whole training split, so the folds share
# scaling statistics - treat these scores as a rough estimate.
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
print(f"\nCross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")

## Step 5: Evaluate Model

In [None]:
# Make predictions
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.3f}")

# Fix the label order explicitly so the report and the matrix always contain
# both classes and line up with the display names, even if one class happens
# to be absent from y_test / y_pred
class_labels = [0, 1]
class_names = ['Not Stressed', 'Stressed']

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_names))

# Confusion matrix (rows: true label, columns: predicted label)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

## Step 6: Export Model as .pkl

This creates a pickle file that you can download and use in your backend.

In [None]:
# Bundle the already-fitted scaler and classifier into one pipeline so that
# predict() on the saved object accepts unscaled feature vectors
face_pipeline = Pipeline([
    ('scaler', scaler),
    ('classifier', model)
])

# Save model
model_filename = 'face_stress_model.pkl'
with open(model_filename, 'wb') as f:
    pickle.dump(face_pipeline, f)

print(f"✓ Model saved as {model_filename}")
print(f"Model size: {os.path.getsize(model_filename) / 1024:.2f} KB")

# Download the model (Colab helper: sends the file to the browser)
print("\nDownloading model...")
files.download(model_filename)
print("✓ Download complete! Place this file in backend/models/ folder")

## Step 7: Test the Model (Optional)

Verify the model works correctly before deploying.

In [None]:
# Reload the pipeline from disk to confirm the saved file round-trips.
# Only unpickle files you created yourself: pickle.load can execute code.
with open(model_filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Use the first held-out sample, unscaled: the pipeline applies the scaler
sample = X_test[:1]
probability = loaded_model.predict_proba(sample)
prediction = loaded_model.predict(sample)

print("Test prediction:")
print(f"Predicted class: {prediction[0]} ({'Stressed' if prediction[0] == 1 else 'Not Stressed'})")
print(f"Probabilities: Not Stressed={probability[0][0]:.3f}, Stressed={probability[0][1]:.3f}")
print(f"Actual class: {y_test[0]} ({'Stressed' if y_test[0] == 1 else 'Not Stressed'})")

---

## 🎉 Model Training Complete!

**Next Steps:**
1. Download the `face_stress_model.pkl` file
2. Place it in your project's `backend/models/` folder
3. The backend will automatically load and use this model
4. Train the voice model using the companion notebook

**Tips for Better Models:**
- Use a larger, diverse dataset (1000+ images)
- Balance classes (equal stressed/not stressed samples)
- Try different classifiers (SVM, XGBoost, Neural Networks)
- Tune hyperparameters using GridSearchCV
- Add data augmentation for more training samples

---