# Emoji classification
## Imports
### Libraries

In [None]:
import os
import numpy as np
import pandas as pd 

# Deep Learning imports - using PyTorch for better compatibility
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torchvision.models as models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Select the compute device: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Sanity check: show up to five entries of the local training folder, if it exists.
train_path = "./train"
print("\nSample training files:")
if os.path.exists(train_path):
    files = os.listdir(train_path)[:5]
    for f in files:
        print(os.path.join(train_path, f))

Using device: cpu

Sample training files:
./train\00001.png
./train\00002.png
./train\00003.png
./train\00004.png
./train\00005.png


In [None]:
from pathlib import Path
from PIL import Image
from skimage import io, color
import matplotlib.pyplot as plt
from tqdm import tqdm

### Functions

In [None]:
def load_single_image(path, size=(64, 64)):
    """Load one emoji image, flatten it onto white, resize and scale to [0, 1].

    Args:
        path: Location of the image file (anything ``PIL.Image.open`` accepts).
        size: Target size passed to ``Image.resize``.

    Returns:
        A float32 NumPy array holding the RGB pixels divided by 255.
    """
    # Open inside a context manager so the file handle is released
    # deterministically; convert() yields an independent in-memory copy.
    with Image.open(path) as src:
        rgba = src.convert("RGBA")

    # Composite onto a white canvas, using the alpha channel as the paste mask,
    # so transparent regions become white instead of arbitrary RGB values.
    canvas = Image.new("RGB", rgba.size, (255, 255, 255))
    canvas.paste(rgba, mask=rgba.getchannel("A"))

    # Resize to the target size with a high-quality filter.
    resized = canvas.resize(size, Image.LANCZOS)

    return np.asarray(resized, dtype="float32") / 255.0


class EmojiDataset(Dataset):
    """PyTorch Dataset that loads emoji images from disk one item at a time.

    Args:
        file_paths: Sequence of image file paths, indexed by position.
        labels_dict: Mapping from image id to label string. A key may be the
            file stem as a string (``"00001"``) or its integer value (``1``).
            Only read when ``is_test`` is False.
        label_encoder: Object whose ``transform([label])`` returns the encoded
            class index. Only used when ``is_test`` is False.
        transform: Optional callable applied to the (C, H, W) image tensor.
        img_size: Target size forwarded to ``load_single_image``.
        is_test: When True, items are ``(image, image_id)``; otherwise they
            are ``(image, encoded_label)``.
    """

    def __init__(self, file_paths, labels_dict=None, label_encoder=None,
                 transform=None, img_size=(64, 64), is_test=False):
        self.file_paths = file_paths
        self.labels_dict = labels_dict
        self.label_encoder = label_encoder
        self.transform = transform
        self.img_size = img_size
        self.is_test = is_test

    def __len__(self):
        """Return the number of image files in the dataset."""
        return len(self.file_paths)

    def _label_for(self, img_id):
        """Return the label string stored for ``img_id``.

        The id is first looked up as given (the file stem); if that key is
        absent and the stem is numeric, its integer value is tried as well.

        Raises:
            KeyError: If no label is stored under either key form.
        """
        if img_id in self.labels_dict:
            return self.labels_dict[img_id]

        # Only attempt the integer key when the stem really is numeric, so a
        # non-numeric stem does not blow up with an unrelated ValueError.
        try:
            numeric_id = int(img_id)
        except ValueError:
            numeric_id = None

        if numeric_id is not None and numeric_id in self.labels_dict:
            return self.labels_dict[numeric_id]

        raise KeyError(f"No label found for image id {img_id!r}")

    def __getitem__(self, idx):
        """Load item ``idx`` and return it with its id (test) or encoded label."""
        img_path = self.file_paths[idx]
        img_id = Path(img_path).stem

        # Load image
        img = load_single_image(img_path, self.img_size)

        # Convert to tensor (H, W, C) -> (C, H, W)
        img = torch.tensor(img).permute(2, 0, 1)

        # Apply transforms if any
        if self.transform is not None:
            img = self.transform(img)

        # Test mode has no labels: hand back the id so predictions can be keyed.
        if self.is_test:
            return img, img_id

        label = self.label_encoder.transform([self._label_for(img_id)])[0]
        return img, label

### Dataset

In [None]:
# Root directory of the data (same directory as this notebook).
# The train/ and test/ folders and train_labels.csv are resolved relative to it.
PATH = "./"

In [None]:
# Join with pathlib so PATH works with or without a trailing slash.
train_dir = Path(PATH) / "train"
# Keep regular files only; sorted so the order is deterministic.
train_files = sorted(str(entry) for entry in train_dir.iterdir() if entry.is_file())

In [None]:
# Join with pathlib so PATH works with or without a trailing slash.
test_dir = Path(PATH) / "test"
# Keep regular files only; sorted so the order is deterministic.
test_files = sorted(str(entry) for entry in test_dir.iterdir() if entry.is_file())
# Image ids are the file names without their extension.
test_ids = [Path(name).stem for name in test_files]

In [None]:
# Join with pathlib so PATH works with or without a trailing slash.
y_train_df = pd.read_csv(Path(PATH) / "train_labels.csv")

# Lookup table: value of the "Id" column -> value of the "Label" column.
y_train_dct = dict(zip(y_train_df["Id"], y_train_df["Label"]))

In [None]:
# Fit an encoder that maps the string labels to integer class indices.
label_encoder = LabelEncoder().fit(y_train_df["Label"])
num_classes = len(label_encoder.classes_)
print(f"Number of classes: {num_classes}")
print(f"Classes: {label_encoder.classes_}")

# Hold out 15% of the training files for validation; the fixed seed keeps
# the split reproducible between runs.
train_files_split, val_files = train_test_split(
    train_files,
    test_size=0.15,
    random_state=42,
)
print(f"Training samples: {len(train_files_split)}")
print(f"Validation samples: {len(val_files)}")

Number of classes: 7
Classes: ['apple' 'facebook' 'google' 'messenger' 'mozilla' 'samsung' 'whatsapp']
Training samples: 8397
Validation samples: 1482


## Training

In [None]:
# Hyperparameters chosen for fast CPU training.
IMG_SIZE = (32, 32)  # down from 64x64, i.e. 4x fewer pixels per image
BATCH_SIZE = 128     # larger batches for efficiency

# Light augmentation, applied to the training split only.
train_transform = transforms.Compose([transforms.RandomHorizontalFlip(p=0.3)])

# Train/validation items carry encoded labels; test items carry the image id.
train_dataset = EmojiDataset(
    file_paths=train_files_split,
    labels_dict=y_train_dct,
    label_encoder=label_encoder,
    transform=train_transform,
    img_size=IMG_SIZE,
    is_test=False,
)
val_dataset = EmojiDataset(
    file_paths=val_files,
    labels_dict=y_train_dct,
    label_encoder=label_encoder,
    transform=None,
    img_size=IMG_SIZE,
    is_test=False,
)
test_dataset = EmojiDataset(
    file_paths=test_files,
    labels_dict=None,
    label_encoder=None,
    transform=None,
    img_size=IMG_SIZE,
    is_test=True,
)

# Only the training loader shuffles; evaluation order stays fixed.
_loader_kwargs = {"batch_size": BATCH_SIZE, "num_workers": 0}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")

Training batches: 66
Validation batches: 12
Test batches: 78


In [None]:
# ============================================================================
# LIGHTWEIGHT CNN MODEL - Optimized for fast CPU training
# ============================================================================

class EmojiCNN(nn.Module):
    """Small three-stage convolutional classifier for emoji images.

    Every stage is Conv(3x3, padding 1) -> BatchNorm -> ReLU -> 2x2 max-pool,
    so each one halves the spatial size while the channel count goes
    3 -> 32 -> 64 -> 128. Global average pooling followed by a two-layer
    MLP with dropout then produces ``num_classes`` logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Attribute names and layer order define the state_dict keys.
        self.block1 = self._conv_stage(3, 32)     # 32x32x3  -> 16x16x32
        self.block2 = self._conv_stage(32, 64)    # 16x16x32 -> 8x8x64
        self.block3 = self._conv_stage(64, 128)   # 8x8x64   -> 4x4x128

        # Global average pooling + classifier head.
        head_layers = [
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(64, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Build one Conv -> BatchNorm -> ReLU -> MaxPool stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
        )

    def forward(self, x):
        """Run the three conv stages, then the classifier; returns the logits."""
        for stage in (self.block1, self.block2, self.block3):
            x = stage(x)
        return self.classifier(x)

# Build the network on the selected device and report its size.
model = EmojiCNN(num_classes).to(device)
print(model)
param_count = sum(weights.numel() for weights in model.parameters())
print(f"\nTotal parameters: {param_count:,}")

EmojiCNN(
  (block1): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (block2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (block3): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): AdaptiveAvgPool2d(output_size=1)
    (1)

In [None]:
# ============================================================================
# TRAINING CONFIGURATION
# ============================================================================

# Loss for multi-class classification on raw logits.
criterion = nn.CrossEntropyLoss()

# AdamW with a relatively high learning rate for faster convergence.
optimizer = optim.AdamW(
    model.parameters(),
    lr=0.002,
    weight_decay=1e-4,
)

# Halve the learning rate when the monitored metric has not improved for
# 3 epochs; mode='max' means "higher is better".
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=3,
)

def train_epoch(model, loader, criterion, optimizer):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Network to optimise; switched to training mode.
        loader: Iterable of ``(images, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
            Assumed to return the mean loss over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).
        optimizer: Optimizer stepped once per batch.

    Returns:
        Tuple ``(mean_loss, accuracy_percent)``, both averaged per sample.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    correct = 0
    total = 0

    pbar = tqdm(loader, desc="Training", leave=False)
    for images, labels in pbar:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch by its size so a smaller final batch does not
        # count as much as a full one in the epoch average.
        batch_size = labels.size(0)
        batch_loss = loss.item()
        loss_sum += batch_loss * batch_size
        total += batch_size
        correct += outputs.argmax(dim=1).eq(labels).sum().item()

        pbar.set_postfix({'loss': f'{batch_loss:.3f}', 'acc': f'{100.*correct/total:.1f}%'})

    if total == 0:
        raise ValueError("train_epoch received a loader that yielded no samples")

    return loss_sum / total, 100. * correct / total


def validate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without updating any weights.

    Args:
        model: Network to evaluate; switched to eval mode.
        loader: Iterable of ``(images, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
            Assumed to return the mean loss over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).

    Returns:
        Tuple ``(mean_loss, accuracy_percent)``, both averaged per sample.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)

            # Weight each batch by its size so a smaller final batch does not
            # count as much as a full one in the average.
            batch_size = labels.size(0)
            loss_sum += criterion(outputs, labels).item() * batch_size
            total += batch_size
            correct += outputs.argmax(dim=1).eq(labels).sum().item()

    if total == 0:
        raise ValueError("validate received a loader that yielded no samples")

    return loss_sum / total, 100. * correct / total


In [None]:
# ============================================================================
# TRAINING LOOP - OPTIMIZED FOR SPEED
# ============================================================================

NUM_EPOCHS = 15  # Reduced from 50 - model converges fast
PATIENCE = 5     # Reduced from 10
CHECKPOINT_PATH = 'best_emoji_model.pth'
# Start below any attainable accuracy so the first epoch always writes a
# checkpoint; otherwise a run that never beats 0% would leave the final load
# to fail or to pick up a stale file from an earlier run.
best_val_acc = -1.0
patience_counter = 0

print("Starting training (optimized for CPU)...")
print("=" * 70)

for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_acc = validate(model, val_loader, criterion)

    # The scheduler monitors validation accuracy.
    scheduler.step(val_acc)
    current_lr = optimizer.param_groups[0]['lr']

    # Checkpoint on improvement; otherwise count towards early stopping.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), CHECKPOINT_PATH)
        patience_counter = 0
        marker = " *** BEST ***"
    else:
        patience_counter += 1
        marker = ""

    print(f"Epoch [{epoch+1:2d}/{NUM_EPOCHS}] | "
          f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | "
          f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}% | "
          f"LR: {current_lr:.6f}{marker}")

    # Early stopping: give up after PATIENCE epochs without improvement.
    if patience_counter >= PATIENCE:
        print(f"\nEarly stopping at epoch {epoch+1}")
        break

# Restore the best weights seen during this run, mapped onto the active device.
model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device, weights_only=True))
print("=" * 70)
print(f"Training complete! Best Validation Accuracy: {best_val_acc:.2f}%")


Starting training (optimized for CPU)...


                                                                                

Epoch [ 1/15] | Train Loss: 1.7811 | Train Acc: 25.51% | Val Loss: 1.9149 | Val Acc: 21.39% | LR: 0.002000 *** BEST ***


                                                                                

Epoch [ 2/15] | Train Loss: 1.5559 | Train Acc: 37.18% | Val Loss: 1.5853 | Val Acc: 36.64% | LR: 0.002000 *** BEST ***


                                                                                

Epoch [ 3/15] | Train Loss: 1.4284 | Train Acc: 43.27% | Val Loss: 1.5782 | Val Acc: 37.85% | LR: 0.002000 *** BEST ***


                                                                                

Epoch [ 4/15] | Train Loss: 1.3166 | Train Acc: 48.87% | Val Loss: 1.4478 | Val Acc: 45.01% | LR: 0.002000 *** BEST ***


                                                                                

Epoch [ 5/15] | Train Loss: 1.2287 | Train Acc: 53.40% | Val Loss: 1.6918 | Val Acc: 37.72% | LR: 0.002000


                                                                                

Epoch [ 6/15] | Train Loss: 1.1699 | Train Acc: 55.91% | Val Loss: 1.4481 | Val Acc: 48.04% | LR: 0.002000 *** BEST ***


                                                                                

Epoch [ 7/15] | Train Loss: 1.0883 | Train Acc: 59.59% | Val Loss: 1.7008 | Val Acc: 35.83% | LR: 0.002000


                                                                                

Epoch [ 8/15] | Train Loss: 1.0489 | Train Acc: 61.33% | Val Loss: 2.1737 | Val Acc: 27.33% | LR: 0.002000


                                                                                

Epoch [ 9/15] | Train Loss: 1.0082 | Train Acc: 62.74% | Val Loss: 3.2323 | Val Acc: 28.68% | LR: 0.002000


                                                                                

Epoch [10/15] | Train Loss: 0.9452 | Train Acc: 65.33% | Val Loss: 2.0085 | Val Acc: 42.91% | LR: 0.001000


                                                                                

Epoch [11/15] | Train Loss: 0.8745 | Train Acc: 68.08% | Val Loss: 1.3512 | Val Acc: 51.75% | LR: 0.001000 *** BEST ***


                                                                                

Epoch [12/15] | Train Loss: 0.8324 | Train Acc: 70.14% | Val Loss: 1.4396 | Val Acc: 50.54% | LR: 0.001000


                                                                                

Epoch [13/15] | Train Loss: 0.7978 | Train Acc: 71.59% | Val Loss: 1.4214 | Val Acc: 55.87% | LR: 0.001000 *** BEST ***


                                                                                

Epoch [14/15] | Train Loss: 0.7859 | Train Acc: 71.63% | Val Loss: 1.2755 | Val Acc: 55.60% | LR: 0.001000


                                                                                

Epoch [15/15] | Train Loss: 0.7769 | Train Acc: 72.69% | Val Loss: 1.2082 | Val Acc: 56.41% | LR: 0.001000 *** BEST ***
Training complete! Best Validation Accuracy: 56.41%


## Predictions


In [None]:
# ============================================================================
# MAKE PREDICTIONS ON TEST SET
# ============================================================================

def predict_test(model, test_loader, label_encoder):
    """Predict a label for every image served by ``test_loader``.

    Args:
        model: Trained network; switched to eval mode.
        test_loader: Iterable of ``(images, image_ids)`` batches.
        label_encoder: Encoder whose ``inverse_transform`` maps class indices
            back to label strings.

    Returns:
        Tuple ``(ids, labels)`` of two lists in loader order.
    """
    model.eval()
    ids_out, labels_out = [], []

    with torch.no_grad():
        for batch, batch_ids in test_loader:
            logits = model(batch.to(device))
            # Index of the highest logit is the predicted class.
            class_idx = logits.max(1)[1].cpu().numpy()
            labels_out.extend(label_encoder.inverse_transform(class_idx))
            ids_out.extend(batch_ids)

    return ids_out, labels_out

# Run the model over the test loader; predict_test returns the image ids and
# the predicted label strings as two parallel lists.
print("Making predictions on test set...")
test_ids_pred, y_test_pred = predict_test(model, test_loader, label_encoder)
print(f"Total predictions: {len(y_test_pred)}")
# Show the first ten predictions as a quick sanity check.
print(f"Sample predictions: {y_test_pred[:10]}")


Making predictions on test set...
Total predictions: 9879
Sample predictions: ['google', 'messenger', 'messenger', 'whatsapp', 'messenger', 'google', 'messenger', 'facebook', 'apple', 'mozilla']


In [None]:
# Create submission dataframe: one row per test image, pairing each image id
# with its predicted label.
submission_df = pd.DataFrame({
    "Id": test_ids_pred,
    "Label": y_test_pred
})

print(f"Submission shape: {submission_df.shape}")
# Last expression of the cell: the notebook displays the first ten rows.
submission_df.head(10)

Submission shape: (9879, 2)


Unnamed: 0,Id,Label
0,10001,google
1,10002,messenger
2,10003,messenger
3,10004,whatsapp
4,10005,messenger
5,10006,google
6,10007,messenger
7,10008,facebook
8,10009,apple
9,10010,mozilla


In [None]:
# Write the submission file; index=False leaves the row index out of the CSV.
submission_df.to_csv("final_submission.csv", index=False)