In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import json
from sklearn.metrics import accuracy_score

In [7]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, num_classes, num_layers=1, num_heads=2, hidden_size=64, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(embedding_size, num_heads, hidden_size, dropout),
            num_layers
        )
        self.fc = nn.Linear(embedding_size, num_classes)
        
    def forward(self, x):
        embedded = self.embedding(x)
        embedded = embedded.permute(1, 0, 2)  # Change dimensions for transformer
        output = self.transformer(embedded)
        output = output.mean(dim=0)  # Average across time steps
        output = self.fc(output)
        return output

In [8]:
class KeyboardDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
    
    def _pad_data(self):
        max_keypresses = max([len])
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [9]:
vocab_size = 113  # Assuming ASCII characters
embedding_size = 64
num_classes = 5
num_layers = 2
num_heads = 2
hidden_size = 128
dropout = 0.1
learning_rate = 0.001
batch_size = 32
epochs = 10

In [18]:
# labels key:
# 0: Aidan
# 1: Srujan
# 2: Eric
# 3: Tony

file_prefix = '../'
datapoints_per_person = 1000

fh = open(f'{file_prefix}aidan_final_data_overlapping.json', 'r')
aidan_data = json.load(fh)[:datapoints_per_person]

fh = open(f'{file_prefix}srujan_final_data_overlapping.json', 'r')
srujan_data = json.load(fh)[:datapoints_per_person]

fh = open(f'{file_prefix}eric_final_data_overlapping.json', 'r')
eric_data = json.load(fh)[:datapoints_per_person]

fh = open(f'{file_prefix}tony_final_data_overlapping.json', 'r')
tony_data = json.load(fh)[:datapoints_per_person]

data = aidan_data + srujan_data + eric_data + tony_data
labels = ([0] * datapoints_per_person) + ([1] * datapoints_per_person) + ([2] * datapoints_per_person) + ([3] * datapoints_per_person)

print(f'There are {len(data)} 5-second intervals, {datapoints_per_person} intervals from each person')

# Split data into training and validation sets
train_data, val_data, train_labels, val_labels = train_test_split(data, labels, test_size=0.2, random_state=42)

# Define dataset and dataloaders
train_dataset = KeyboardDataset(train_data, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = KeyboardDataset(val_data, val_labels)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Initialize model, loss function, and optimizer
model = TransformerModel(vocab_size, embedding_size, num_classes, num_layers, num_heads, hidden_size, dropout)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


There are 4000 5-second intervals, 1000 intervals from each person




In [19]:
# Training loop
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        # inputs = torch.tensor([[ord(key[1]) for key in sequence] for sequence in inputs])  # Convert keys to ASCII values
        inputs = inputs.to(torch.int64)
        labels = torch.tensor(labels)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    
    # Validation
    model.eval()
    val_predictions = []
    val_targets = []
    with torch.no_grad():
        for inputs, labels in val_loader:
            # inputs = torch.tensor([[ord(key[1]) for key in sequence] for sequence in inputs])  # Convert keys to ASCII values
            inputs = inputs.to(torch.int64)
            labels = torch.tensor(labels)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            val_predictions.extend(predicted.cpu().numpy())
            val_targets.extend(labels.cpu().numpy())
    
    val_accuracy = accuracy_score(val_targets, val_predictions)
    print(f'Epoch [{epoch+1}/{epochs}], Validation Accuracy: {val_accuracy:.4f}')


RuntimeError: each element in list of batch should be of equal size

In [None]:
test_data, test_labels = generate_sample_data()
test_inputs = torch.tensor([[ord(key[1]) for key in sequence] for sequence in test_data])
test_inputs = test_inputs.to(torch.int64)
test_labels = torch.tensor(test_labels)
with torch.no_grad():
    model.eval()
    outputs = model(test_inputs)
    _, predicted = torch.max(outputs, 1)
    test_accuracy = accuracy_score(test_labels.numpy(), predicted.cpu().numpy())
    print(f'Test Accuracy: {test_accuracy:.4f}')