In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Load data
df = pd.read_csv("dataset.csv")

# Build character vocabulary from all plate numbers
all_text = "".join(df["plate_number"].astype(str).tolist())
unique_chars = sorted(list(set(all_text)))

# Create char <-> index mappings (reserve 0 for padding)
char_to_idx = {ch: i + 1 for i, ch in enumerate(unique_chars)}
idx_to_char = {i + 1: ch for i, ch in enumerate(unique_chars)}
char_to_idx["<PAD>"] = 0
idx_to_char[0] = "<PAD>"

vocab_size = len(char_to_idx)
print(f"Vocabulary size: {vocab_size}")
print(f"Character Mapping Example: {list(char_to_idx.items())[:5]}...")

# Encode state labels
label_encoder = LabelEncoder()
df["label_idx"] = label_encoder.fit_transform(df["state_code"])
num_classes = len(label_encoder.classes_)
print(f"Number of classes: {num_classes}")

# Train/test split with stratification
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label_idx"]
)

Vocabulary size: 39
Character Mapping Example: [(' ', 1), ('-', 2), ('0', 3), ('1', 4), ('2', 5)]...
Number of classes: 51


In [9]:
import string
df = pd.read_csv("dataset.csv")
plates = df["plate_number"].astype(str)
labels = df["state_code"].astype(str)

digits = set(string.digits)
letters = set(string.ascii_letters)

def mask(s):
    """Convert plate to format pattern: D=digit, L=letter, S=space, O=other"""
    out = []
    for ch in s:
        if ch in digits:
            out.append("D")
        elif ch in letters:
            out.append("L")
        elif ch.isspace():
            out.append("S")
        else:
            out.append("O")
    return "".join(out)

masks = plates.map(mask)

# For each mask pattern, find the most common state
best_label_per_mask = pd.crosstab(masks, labels).idxmax(axis=1)

# Baseline accuracy: predict state based on format pattern only
acc = (labels == masks.map(best_label_per_mask)).mean()
print("Mask-only baseline accuracy:", acc)

Mask-only baseline accuracy: 0.4946843137254902


In [10]:

class PlateDataset(Dataset):
    def __init__(self, dataframe, char_to_idx):
        self.data = dataframe.reset_index(drop=True)
        self.char_to_idx = char_to_idx
        self.plates = self.data["plate_number"].astype(str).values
        self.labels = self.data["label_idx"].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        plate_str = self.plates[idx]
        label = int(self.labels[idx])

        # Convert characters to indices (unknown chars default to 0)
        seq = [self.char_to_idx.get(c, 0) for c in plate_str]
        seq = torch.tensor(seq, dtype=torch.long)
        length = len(seq)

        return seq, torch.tensor(label, dtype=torch.long), length


def collate_fn(batch):
    """Pad sequences to max length within each batch"""
    seqs, labels, lengths = zip(*batch)

    lengths = torch.tensor(lengths, dtype=torch.long)
    labels = torch.stack(labels)
    padded_seqs = pad_sequence(seqs, batch_first=True, padding_value=0)

    return padded_seqs, labels, lengths


# DataLoaders
batch_size = 256

train_dataset = PlateDataset(train_df, char_to_idx)
test_dataset = PlateDataset(test_df, char_to_idx)

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

In [11]:

"""
Model: Bidirectional LSTM with Attention
- Embedding layer for character-level input
- 2-layer BiLSTM for sequence encoding
- Attention mechanism to weight important characters
- Fully connected layer for state classification
"""


class LSTMAttentionClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=2, lstm_dropout=0.3, fc_dropout=0.2):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm_output_dim = hidden_dim * 2  # bidirectional

        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=True,
            dropout=(lstm_dropout if n_layers > 1 else 0.0)
        )

        # Attention layers
        self.attn_w1 = nn.Linear(self.lstm_output_dim, hidden_dim)
        self.attn_tanh = nn.Tanh()
        self.attn_w2 = nn.Linear(hidden_dim, 1)
        self.attn_softmax = nn.Softmax(dim=1)

        self.dropout = nn.Dropout(fc_dropout)
        self.fc = nn.Linear(self.lstm_output_dim, output_dim)

    def forward(self, x, lengths):
        # Embedding
        emb = self.embedding(x)

        # Pack and run LSTM
        packed = pack_padded_sequence(emb, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.lstm(packed)
        lstm_out, _ = pad_packed_sequence(packed_out, batch_first=True)

        # Attention
        attn_energy = self.attn_tanh(self.attn_w1(lstm_out))
        attn_weights = self.attn_softmax(self.attn_w2(attn_energy))
        context = torch.sum(attn_weights * lstm_out, dim=1)

        # Output
        logits = self.fc(self.dropout(context))
        return logits


# Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = LSTMAttentionClassifier(
    vocab_size=len(char_to_idx),
    embed_dim=64,
    hidden_dim=128,
    output_dim=num_classes,
    n_layers=2,
    lstm_dropout=0.4,
    fc_dropout=0.3
).to(device)

print(model)


Using device: cuda
LSTMAttentionClassifier(
  (embedding): Embedding(39, 64, padding_idx=0)
  (lstm): LSTM(64, 128, num_layers=2, batch_first=True, dropout=0.4, bidirectional=True)
  (attn_w1): Linear(in_features=256, out_features=128, bias=True)
  (attn_tanh): Tanh()
  (attn_w2): Linear(in_features=128, out_features=1, bias=True)
  (attn_softmax): Softmax(dim=1)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=51, bias=True)
)


In [12]:
def evaluate(loader, model, device):
    """Calculate accuracy on given data loader"""
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for x, y, lengths in loader:
            x, y, lengths = x.to(device), y.to(device), lengths.to(device)
            preds = model(x, lengths).argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)

    model.train()
    return 100.0 * correct / total


# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", patience=2, factor=0.5)

# Training config
num_epochs = 30
max_grad_norm = 1.0

# Early stopping
early_patience = 20
min_delta = 0.001
best_acc = 0
bad_epochs = 0

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels, lengths in train_loader:
        inputs, labels, lengths = inputs.to(device), labels.to(device), lengths.to(device)

        optimizer.zero_grad()
        outputs = model(inputs, lengths)
        loss = criterion(outputs, labels)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()

        running_loss += loss.item() * labels.size(0)
        correct += (outputs.argmax(1) == labels).sum().item()
        total += labels.size(0)

    train_loss = running_loss / total
    train_acc = 100.0 * correct / total
    test_acc = evaluate(test_loader, model, device)

    scheduler.step(test_acc)
    lr = optimizer.param_groups[0]["lr"]

    print(f"Epoch [{epoch+1}/{num_epochs}] Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | Test Acc: {test_acc:.2f}% | LR: {lr:.6f}")

    # Early stopping check
    if test_acc > best_acc + min_delta:
        best_acc = test_acc
        bad_epochs = 0
    else:
        bad_epochs += 1
        if bad_epochs >= early_patience:
            print(f"Early stopping at epoch {epoch+1}. Best Test Acc: {best_acc:.2f}%")
            break

print("Training complete!")

Epoch [1/30] Loss: 1.6406 | Train Acc: 44.72% | Test Acc: 49.45% | LR: 0.000500
Epoch [2/30] Loss: 1.4081 | Train Acc: 49.25% | Test Acc: 49.62% | LR: 0.000500
Epoch [3/30] Loss: 1.3986 | Train Acc: 49.54% | Test Acc: 49.91% | LR: 0.000500
Epoch [4/30] Loss: 1.3974 | Train Acc: 49.59% | Test Acc: 50.05% | LR: 0.000500
Epoch [5/30] Loss: 1.3950 | Train Acc: 49.65% | Test Acc: 50.19% | LR: 0.000500
Epoch [6/30] Loss: 1.3940 | Train Acc: 49.64% | Test Acc: 49.93% | LR: 0.000500
Epoch [7/30] Loss: 1.3934 | Train Acc: 49.61% | Test Acc: 50.01% | LR: 0.000500
Epoch [8/30] Loss: 1.3912 | Train Acc: 49.69% | Test Acc: 50.16% | LR: 0.000250
Epoch [9/30] Loss: 1.3833 | Train Acc: 49.81% | Test Acc: 50.23% | LR: 0.000250
Epoch [10/30] Loss: 1.3842 | Train Acc: 49.79% | Test Acc: 49.91% | LR: 0.000250
Epoch [11/30] Loss: 1.3837 | Train Acc: 49.86% | Test Acc: 49.97% | LR: 0.000250
Epoch [12/30] Loss: 1.3845 | Train Acc: 49.80% | Test Acc: 50.25% | LR: 0.000250
Epoch [13/30] Loss: 1.3841 | Train Ac