In [None]:
# Extract the dataset archive into the current working directory.
# -n: never overwrite files that already exist, so re-running the cell neither
#     prompts for confirmation nor clobbers earlier extractions.
# -q: quiet mode; suppresses the per-file listing that would flood the output.
!unzip -n -q /content/archive.zip

unzip:  cannot find or open /content/archive.zip, /content/archive.zip.zip or /content/archive.zip.ZIP.


In [None]:
# Install the Kaggle API token where the kaggle CLI looks for it.
# -p makes the cell re-runnable: plain `mkdir` fails once ~/.kaggle already exists.
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
# Restrict the token to the current user (read/write for owner only).
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# List /content to check which dataset files and folders are present
!ls /content

archive.zip					 consolidated  test
autism-2.txt					 drive	       train
autism-S-224-89.33.h5				 kaggle.json   valid
autistic-children-facial-data-set-metadata.json  sample_data


In [None]:
from torch.utils.data import Dataset, DataLoader


In [None]:
import os  # For directory and file path manipulations
from PIL import Image  # For image loading and processing
import torch  # Main PyTorch library for tensor operations
import torch.nn as nn  # Neural network related modules
import torch.optim as optim  # Optimization algorithms like Adam
from torch.utils.data import Dataset, DataLoader, random_split  # Dataset and utilities for batching and splitting
import torchvision.transforms as transforms  # Image augmentation and preprocessing
from torchvision.models import resnet18  # Pretrained ResNet-18 model
from timm.models.vision_transformer import vit_base_patch16_224  # Pretrained Vision Transformer model

# Pick the compute device: the CUDA GPU when PyTorch can see one, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


# Hyperparameters and constants
# NOTE(review): this value was originally described as the number of "emotion" classes,
# while the dataset class in this notebook gives 'ASD' / 'Non-ASD' as example class
# folders — confirm that 7 is really the intended size of the classifier output.
num_classes = 7  # Default number of output classes for the model's classifier head
batch_size = 32  # Mini-batch size used by the train/validation/test DataLoaders
learning_rate = 3e-4  # Initial learning rate passed to the Adam optimizer
num_epochs = 30  # Number of passes over the training split
image_size = 224  # Side length (pixels) images are resized to before entering the model


# Preprocessing applied to every image: fixed-size resize -> tensor -> per-channel normalisation
transform = transforms.Compose([
    # Force a square image_size x image_size input regardless of the source aspect ratio
    transforms.Resize(size=(image_size, image_size)),
    # PIL image -> float tensor
    transforms.ToTensor(),
    # Channel statistics of ImageNet, matching what the pretrained backbones expect
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])


# Custom PyTorch Dataset for images stored as <root_dir>/<class_name>/<image file>
class AutismFacialDataset(Dataset):
    """Image dataset read from a directory that holds one sub-folder per class.

    Class names are the sub-folder names of ``root_dir`` in alphabetical order;
    a class's integer label is its position in that order. Only regular files
    whose name ends in .png, .jpg or .jpeg (case-insensitive) are indexed.

    Attributes are documented inline in ``__init__``.
    """

    # Lower-case file extensions accepted as images
    _IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

    def __init__(self, root_dir, transform=None):
        """
        Index every image found under ``root_dir``; no image is opened here.

        Args:
            root_dir (str): Directory containing class-specific subfolders of images.
            transform (callable, optional): Applied to each RGB PIL image in ``__getitem__``.

        Raises:
            OSError: If ``root_dir`` (or a class folder) cannot be listed.
        """
        self.root_dir = root_dir  # Root dataset directory path
        self.transform = transform  # Transform pipeline for images

        # Class folder names, sorted so that labels are stable between runs
        with os.scandir(root_dir) as entries:
            self.classes = sorted(entry.name for entry in entries if entry.is_dir())
        # Mapping from class name to numerical label
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(self.classes)}

        # List of (image_path, label) tuples, ordered by class and then by file name
        self.samples = []
        for cls_name in self.classes:
            cls_dir = os.path.join(root_dir, cls_name)
            label = self.class_to_idx[cls_name]
            # os.listdir returns entries in arbitrary order; sorting keeps sample
            # indices identical across machines, so a seeded random_split over
            # this dataset always selects the same files.
            for fname in sorted(os.listdir(cls_dir)):
                fpath = os.path.join(cls_dir, fname)
                if os.path.isfile(fpath) and fname.lower().endswith(self._IMAGE_EXTENSIONS):
                    self.samples.append((fpath, label))

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``.

        The image is converted to RGB and, if a transform was given, passed
        through it; the label is the integer class index.
        """
        img_path, label = self.samples[idx]
        # The context manager closes the file handle deterministically;
        # convert() returns an independent, fully loaded image.
        with Image.open(img_path) as img:
            image = img.convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, label


# Root directory where dataset is extracted and arranged in class folders
dataset_path = './data'  # Change if needed according to the folder structure

# Index all images under dataset_path; the shared transform is applied on access
full_dataset = AutismFacialDataset(root_dir=dataset_path, transform=transform)

# Fail fast with an actionable message: without this check an empty dataset only
# surfaces later as a cryptic sampler error when the training DataLoader is built.
total_size = len(full_dataset)
if total_size == 0:
    raise ValueError(
        f"No .png/.jpg/.jpeg images found in the class sub-folders of {dataset_path!r}; "
        "check dataset_path and that images sit directly inside one folder per class."
    )

# 70 % / 15 % / 15 % split; the test split absorbs the rounding remainder
train_size = int(0.7 * total_size)
val_size = int(0.15 * total_size)
test_size = total_size - train_size - val_size

# Random split with a fixed seed, so the same permutation of indices is drawn on every run
train_dataset, val_dataset, test_dataset = random_split(
    full_dataset, [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(42)
)

# Only the training loader shuffles; evaluation loaders keep a fixed order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# Two-branch image classifier that fuses ResNet-18 and ViT-B/16 features
class FER_ViT(nn.Module):
    """Classifier combining a frozen ResNet-18 branch with a fine-tuned ViT branch.

    The ResNet-18 branch (final ``fc`` replaced by identity) yields 512 features
    and the ViT-B/16 branch yields 768 features per image. Both vectors are
    concatenated, projected to 512 dimensions and classified.

    Args:
        num_classes (int): Size of the output layer. The default is the value the
            module-level ``num_classes`` had when this class was defined.
    """

    def __init__(self, num_classes=num_classes):
        super().__init__()
        # Pretrained ResNet-18 as local feature extractor, classification layer removed
        self.backbone = resnet18(pretrained=True)
        self.backbone.fc = nn.Identity()

        # Pretrained Vision Transformer as global feature extractor
        self.vit = vit_base_patch16_224(pretrained=True)

        # ResNet weights stay fixed; ViT weights are fine-tuned
        for param in self.backbone.parameters():
            param.requires_grad = False
        for param in self.vit.parameters():
            param.requires_grad = True

        # Fuse ResNet (512) and ViT (768) features into a 512-dimensional vector
        self.feature_fusion = nn.Linear(512 + 768, 512)

        # Classification head with non-linearity and dropout regularization
        self.classifier = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def train(self, mode=True):
        """Switch train/eval mode while keeping the frozen ResNet in eval mode.

        Setting ``requires_grad = False`` does not stop BatchNorm layers from
        updating their running statistics in training mode, so the backbone is
        forced back to eval mode to keep its pretrained features truly fixed.
        """
        super().train(mode)
        self.backbone.eval()
        return self

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for an image batch ``x``."""
        # Local features from the frozen ResNet branch: (batch, 512)
        local_features = self.backbone(x)
        # Global features from the ViT branch
        vit_features = self.vit.forward_features(x)
        # Depending on the timm version, forward_features returns either a pooled
        # (batch, 768) vector or the full token sequence (batch, tokens, 768).
        # In the latter case keep the leading class token so the tensor can be
        # concatenated with the 2-D ResNet features.
        if vit_features.dim() == 3:
            vit_features = vit_features[:, 0]
        # Concatenate both feature vectors: (batch, 512 + 768)
        combined_features = torch.cat((local_features, vit_features), dim=1)
        # Project to the compact fused representation, then classify
        fused = self.feature_fusion(combined_features)
        out = self.classifier(fused)
        return out

# Instantiate the model with one output per class folder found in the dataset
# (instead of a hard-coded count that may not match the data) and move it to
# the computation device (GPU/CPU)
model = FER_ViT(num_classes=len(full_dataset.classes)).to(device)


# Cross-entropy loss over the class logits (multi-class classification)
criterion = nn.CrossEntropyLoss()
# Adam over the trainable parameters only; frozen backbone weights never receive
# gradients, so handing them to the optimizer would serve no purpose
optimizer = optim.Adam(
    (param for param in model.parameters() if param.requires_grad),
    lr=learning_rate,
)
# Halve the learning rate every 10 scheduler steps (stepped once per epoch)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# One optimisation pass over the training split
def train():
    """Train the global ``model`` for one epoch on ``train_loader``.

    Uses the module-level ``criterion``, ``optimizer`` and ``device``.

    Returns:
        tuple: (mean loss per sample, accuracy in percent) over the epoch.
    """
    model.train()  # enable training behaviour (dropout etc.)
    running_loss, n_correct, n_seen = 0, 0, 0

    for batch_images, batch_labels in train_loader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        # Clear stale gradients, then forward / backward / update
        optimizer.zero_grad()
        logits = model(batch_images)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        # criterion returns a batch mean, so weight it by the batch size
        running_loss += batch_loss.item() * batch_images.size(0)
        predicted = torch.max(logits, 1).indices  # index of the largest logit
        n_seen += batch_labels.size(0)
        n_correct += (predicted == batch_labels).sum().item()

    return running_loss / n_seen, 100.0 * n_correct / n_seen


# Gradient-free evaluation pass over the validation split
def validate():
    """Evaluate the global ``model`` on ``val_loader`` without updating weights.

    Uses the module-level ``criterion`` and ``device``.

    Returns:
        tuple: (mean loss per sample, accuracy in percent) over the split.
    """
    model.eval()  # inference behaviour for dropout / batchnorm
    running_loss, n_correct, n_seen = 0, 0, 0

    with torch.no_grad():  # no autograd bookkeeping needed for evaluation
        for batch_images, batch_labels in val_loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_images)
            batch_loss = criterion(logits, batch_labels)

            # criterion returns a batch mean, so weight it by the batch size
            running_loss += batch_loss.item() * batch_images.size(0)
            predicted = torch.max(logits, 1).indices  # index of the largest logit
            n_seen += batch_labels.size(0)
            n_correct += (predicted == batch_labels).sum().item()

    return running_loss / n_seen, 100.0 * n_correct / n_seen

# Epoch loop: optimise on the training split, score on the validation split, then decay the LR
for epoch in range(num_epochs):
    # train() runs first, validate() second (tuple elements evaluate left to right)
    (train_loss, train_acc), (val_loss, val_acc) = train(), validate()
    scheduler.step()  # advance the learning-rate schedule once per epoch
    # One summary line per epoch
    print(f"Epoch [{epoch+1}/{num_epochs}] "
          f"Train Loss: {train_loss:.4f} Train Acc: {train_acc:.2f}% "
          f"Val Loss: {val_loss:.4f} Val Acc: {val_acc:.2f}%")


# Persist the weights (state dict only) of the model as it stands after the last epoch
torch.save(model.state_dict(), 'fer_vit_autism.pth')
