In [2]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoConfig, AutoTokenizer
from sklearn.model_selection import train_test_split
import pandas as pd

# Load your data
data = pd.read_csv('segmented_data_statistics.csv')

# Encode labels
data['Label'] = data['Filename'].apply(lambda x: 1 if 'H' in x else 0)

# Split the data
X = data.drop(columns=['Filename', 'Label'])
y = data['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to tensors
X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.long)
y_test = torch.tensor(y_test.values, dtype=torch.long)

# Define a simple Transformer model for classification
class TransformerClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(TransformerClassifier, self).__init__()
        
        # Define the embedding layer to project input features to a higher dimension
        self.embedding = nn.Linear(input_dim, 512)
        
        # Define the Transformer encoder layer
        encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=6)
        
        # Fully connected layer for classification
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.embedding(x)  # Project input features
        x = x.unsqueeze(1)  # Add a dimension for the sequence (batch_size, sequence_length, embedding_dim)
        x = self.transformer(x)
        x = x.mean(dim=1)  # Global average pooling across the sequence length
        x = self.fc(x)
        return x

# Instantiate the model
model = TransformerClassifier(input_dim=X_train.shape[1], num_classes=2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
def train_model(model, X_train, y_train, X_test, y_test, criterion, optimizer, epochs=10):
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        output = model(X_train)
        loss = criterion(output, y_train)
        loss.backward()
        optimizer.step()

        # Evaluate on the test set
        model.eval()
        with torch.no_grad():
            test_output = model(X_test)
            test_loss = criterion(test_output, y_test)
            _, predicted = torch.max(test_output, 1)
            accuracy = (predicted == y_test).sum().item() / len(y_test)
        
        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}, Test Loss: {test_loss.item()}, Test Accuracy: {accuracy*100:.2f}%')

train_model(model, X_train, y_train, X_test, y_test, criterion, optimizer, epochs=5)




RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 3422622848 bytes.