In [1]:
import json

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset

In [3]:
# Custom dataset class for our JSON data
class EmbeddingDataset(Dataset):
    def __init__(self, data, label_encoder=None, train=True):
        """
        Initialize the dataset with our JSON records.

        Args:
            data: List of dictionaries containing our records
            label_encoder: Optional pre-fit LabelEncoder for file_paths
            train: Whether this is training data (to fit encoder) or not
        """
        self.embeddings = []
        self.file_paths = []

        # Extract embeddings and file_paths from our data
        for item in data:
            self.embeddings.append(item["embedding"])
            self.file_paths.append(item["file_path"])

        # Convert to numpy arrays for easier processing
        self.embeddings = np.array(self.embeddings, dtype=np.float32)

        # Convert file_paths to numerical labels
        if label_encoder is None:
            self.label_encoder = LabelEncoder()
            self.labels = self.label_encoder.fit_transform(self.file_paths)
        else:
            self.label_encoder = label_encoder
            self.labels = self.label_encoder.transform(self.file_paths)

        # Convert to PyTorch tensors
        self.embeddings = torch.FloatTensor(self.embeddings)
        self.labels = torch.LongTensor(self.labels)

    def __len__(self):
        """Return the number of examples in the dataset"""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return a single example from the dataset"""
        return {"embedding": self.embeddings[idx], "label": self.labels[idx]}

    def num_classes(self):
        """Return the number of unique classes"""
        return len(self.label_encoder.classes_)

    def get_label_encoder(self):
        """Return the label encoder for decoding predictions"""
        return self.label_encoder

In [None]:
# Load data from JSON file
# json_file_path = "/home/blais/q/office/documents/.train.json"
json_file_path = "/tmp/train.json"
with open(json_file_path, "r") as f:
    data = json.load(f)

len(data)

In [11]:
train_dataset = EmbeddingDataset(data)
print(len(train_dataset))

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10,) + inhomogeneous part.

In [None]:
# Neural network for classification
class EmbeddingClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_classes, dropout_rate=0.3):
        """
        Initialize the classifier neural network.

        Args:
            embedding_dim: Dimension of the embedding vectors
            hidden_dim: Dimension of the hidden layer
            num_classes: Number of classes to predict
            dropout_rate: Dropout probability
        """
        super(EmbeddingClassifier, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim // 2, num_classes),
        )

    def forward(self, x):
        """Forward pass through the model"""
        return self.model(x)

In [None]:
def train_model(
    model, train_loader, val_loader, criterion, optimizer, device, num_epochs=10
):
    """
    Train the neural network model.

    Args:
        model: The neural network model
        train_loader: DataLoader for training data
        val_loader: DataLoader for validation data
        criterion: Loss function
        optimizer: Optimization algorithm
        device: Device to run training on (CPU/GPU)
        num_epochs: Number of training epochs

    Returns:
        Trained model
    """
    model = model.to(device)
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        train_correct = 0

        for batch in train_loader:
            embeddings = batch["embedding"].to(device)
            labels = batch["label"].to(device)

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(embeddings)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Statistics
            train_loss += loss.item() * embeddings.size(0)
            _, preds = torch.max(outputs, 1)
            train_correct += torch.sum(preds == labels).item()

        train_loss = train_loss / len(train_loader.dataset)
        train_acc = train_correct / len(train_loader.dataset)

        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0

        with torch.no_grad():
            for batch in val_loader:
                embeddings = batch["embedding"].to(device)
                labels = batch["label"].to(device)

                outputs = model(embeddings)
                loss = criterion(outputs, labels)

                val_loss += loss.item() * embeddings.size(0)
                _, preds = torch.max(outputs, 1)
                val_correct += torch.sum(preds == labels).item()

        val_loss = val_loss / len(val_loader.dataset)
        val_acc = val_correct / len(val_loader.dataset)

        print(f"Epoch {epoch + 1}/{num_epochs}:")
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    return model

In [None]:
def main(
    json_file_path, batch_size=32, hidden_dim=128, learning_rate=0.001, num_epochs=10
):
    """
    Main function to run the entire training pipeline.

    Args:
        json_file_path: Path to the JSON file with our data
        batch_size: Batch size for training
        hidden_dim: Hidden dimension for the neural network
        learning_rate: Learning rate for optimization
        num_epochs: Number of training epochs
    """
    # Load data from JSON file
    with open(json_file_path, "r") as f:
        data = json.load(f)

    # Split into train and validation sets
    train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

    # Create datasets
    train_dataset = EmbeddingDataset(train_data)
    label_encoder = train_dataset.get_label_encoder()
    val_dataset = EmbeddingDataset(val_data, label_encoder=label_encoder, train=False)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Determine embedding dimension from the data
    embedding_dim = train_dataset.embeddings.shape[1]
    num_classes = train_dataset.num_classes()

    print(f"Embedding dimension: {embedding_dim}")
    print(f"Number of classes: {num_classes}")

    # Initialize model, loss function, and optimizer
    model = EmbeddingClassifier(embedding_dim, hidden_dim, num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Determine device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Train the model
    trained_model = train_model(
        model, train_loader, val_loader, criterion, optimizer, device, num_epochs
    )

    # Save the model
    torch.save(
        {
            "model_state_dict": trained_model.state_dict(),
            "label_encoder": label_encoder,
            "embedding_dim": embedding_dim,
            "hidden_dim": hidden_dim,
            "num_classes": num_classes,
        },
        "embedding_classifier.pt",
    )

    print("Model training complete and saved to 'embedding_classifier.pt'")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train a classifier on embedding data")
    parser.add_argument(
        "json_file", type=str, help="Path to the JSON file with embedding data"
    )
    parser.add_argument(
        "--batch_size", type=int, default=32, help="Batch size for training"
    )
    parser.add_argument(
        "--hidden_dim", type=int, default=128, help="Hidden dimension size"
    )
    parser.add_argument("--lr", type=float, default=0.001, help="Learning rate")
    parser.add_argument(
        "--epochs", type=int, default=10, help="Number of training epochs"
    )

    args = parser.parse_args()

    main(
        args.json_file,
        batch_size=args.batch_size,
        hidden_dim=args.hidden_dim,
        learning_rate=args.lr,
        num_epochs=args.epochs,
    )

_StoreAction(option_strings=[], dest='json_file', nargs=None, const=None, default=None, type=<class 'str'>, choices=None, required=True, help='Path to the JSON file with embedding data', metavar=None)

_StoreAction(option_strings=['--batch_size'], dest='batch_size', nargs=None, const=None, default=32, type=<class 'int'>, choices=None, required=False, help='Batch size for training', metavar=None)

_StoreAction(option_strings=['--hidden_dim'], dest='hidden_dim', nargs=None, const=None, default=128, type=<class 'int'>, choices=None, required=False, help='Hidden dimension size', metavar=None)

_StoreAction(option_strings=['--lr'], dest='lr', nargs=None, const=None, default=0.001, type=<class 'float'>, choices=None, required=False, help='Learning rate', metavar=None)

_StoreAction(option_strings=['--epochs'], dest='epochs', nargs=None, const=None, default=10, type=<class 'int'>, choices=None, required=False, help='Number of training epochs', metavar=None)

usage: ipykernel_launcher.py [-h] [--batch_size BATCH_SIZE]
                             [--hidden_dim HIDDEN_DIM] [--lr LR]
                             [--epochs EPOCHS]
                             json_file
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
