**BEST Statistics:**

**Classification Report:**

              precision    recall  f1-score   support

           0       0.58      0.58      0.58      6000
           1       0.66      0.80      0.73      6000
           2       0.75      0.78      0.77      6000
           3       0.55      0.49      0.52      6000
           4       0.82      0.87      0.84      6000
           5       0.88      0.83      0.85      6000
           6       0.67      0.45      0.54      6000
           7       0.69      0.68      0.68      6000
           8       0.63      0.83      0.72      6000
           9       0.81      0.69      0.74      6000
    
**total dataset size** = 60000

**accuracy** = 0.70    
 
**macro avg**  
**precision** = 0.70      
**recall** = 0.70      
**f1-score** = 0.70

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from transformers import GPT2Config
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import classification_report
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import torch
import torch.nn as nn
import math
import transformers

def get_model_size(model):
    """Return the storage taken by the model's parameters, in MiB.

    Every parameter contributes ``numel() * element_size()`` bytes, so the
    figure is unchanged for float32 models (4 bytes per value) and is also
    correct for half-precision or mixed-dtype models.

    Args:
        model: Object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).

    Returns:
        Parameter size in mebibytes as a float. Only what ``parameters()``
        yields is counted.
    """
    total_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
    return total_bytes / (1024 ** 2)  # bytes -> MiB


def count_parameters(model):
    """Count the parameters of ``model``.

    Args:
        model: Object exposing ``parameters()`` that yields tensors.

    Returns:
        A ``(total, trainable)`` tuple of element counts, where ``trainable``
        only includes parameters whose ``requires_grad`` flag is set.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count
    return total, trainable

class PretrainedGPTEmbedding(nn.Module):
    """Token + position embeddings from a pretrained GPT-2, projected to ``d_model``.

    Args:
        model_name: Checkpoint name handed to ``GPT2Model.from_pretrained``.
        d_model: Output width of the linear projection.
        freeze_embeddings: When true, the token (``wte``) and position
            (``wpe``) embedding weights are excluded from training.

    Attributes:
        d_model: Width of the *pretrained* embeddings (``config.n_embd``),
            not the ``d_model`` constructor argument.
        vocab_size: Vocabulary size reported by the pretrained config.
    """

    def __init__(self, model_name='gpt2', d_model=512, freeze_embeddings=True):
        super(PretrainedGPTEmbedding, self).__init__()

        # NOTE(review): the whole GPT-2 model is loaded and registered as a
        # sub-module, but forward() only reads its wte/wpe tables. The rest
        # still counts towards parameters() and the saved state_dict; it is
        # kept here so existing checkpoints keep loading.
        self.gpt_model = transformers.GPT2Model.from_pretrained(model_name)

        if freeze_embeddings:
            for param in self.gpt_model.wte.parameters():
                param.requires_grad = False
            for param in self.gpt_model.wpe.parameters():
                param.requires_grad = False

        self.d_model = self.gpt_model.config.n_embd
        self.vocab_size = self.gpt_model.config.vocab_size
        # Size the projection from the checkpoint's own embedding width rather
        # than a hard-coded 768, so checkpoints with another n_embd work too.
        self.linear = nn.Linear(self.d_model, d_model)

    def forward(self, input_ids, attention_mask=None):
        """Embed ``input_ids`` and project the result.

        Args:
            input_ids: Integer tensor indexed as (batch, sequence).
            attention_mask: Accepted for interface compatibility; it is not
                used by this module.

        Returns:
            Tensor of shape (batch, sequence, d_model-argument).
        """
        # One row of positions is enough: it broadcasts over the batch.
        positions = torch.arange(input_ids.size(1), device=input_ids.device).unsqueeze(0)
        embedded = self.gpt_model.wte(input_ids) + self.gpt_model.wpe(positions)
        return self.linear(embedded)
    
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a (batch, seq, d_model) tensor.

    Even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``. The table is registered as the buffer ``pe`` with
    shape (1, max_len, d_model).

    Args:
        d_model: Feature width of the inputs. Odd widths are supported.
        max_len: Number of positions precomputed; inputs longer than this
            cannot be encoded.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the angles to fit instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]

class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention split across ``num_heads`` heads.

    Args:
        d_model: Feature width of queries, keys, values and the output.
        num_heads: Number of heads; must divide ``d_model`` evenly.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()

        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        # Input projections followed by the output projection.
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, head_dim)."""
        per_head = projected.view(batch_size, -1, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` over ``key``/``value``.

        Args:
            query, key, value: Tensors of shape (batch, seq, d_model); the
                key/value sequence length may differ from the query's.
            mask: Optional tensor broadcastable to the
                (batch, heads, query_seq, key_seq) score tensor. Positions
                where it equals 0 are excluded from attention.

        Returns:
            Tensor of shape (batch, query_seq, d_model).
        """
        n = query.size(0)

        q_heads = self._split_heads(self.query(query), n)
        k_heads = self._split_heads(self.key(key), n)
        v_heads = self._split_heads(self.value(value), n)

        logits = torch.matmul(q_heads, k_heads.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            logits = logits.masked_fill(mask == 0, float('-inf'))

        weights = torch.softmax(logits, dim=-1)
        context = torch.matmul(weights, v_heads)

        # Put the heads back side by side before the final projection.
        merged = context.transpose(1, 2).contiguous().view(n, -1, self.d_model)
        return self.fc_out(merged)

class TransformerDecoderLayer(nn.Module):
    """One post-norm decoder layer: self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is followed by dropout, a residual add and
    its own LayerNorm.

    Args:
        d_model: Feature width of the layer's input and output.
        num_heads: Head count for both attention modules.
        d_ff: Hidden width of the feed-forward network.
        dropout: Dropout probability, used inside the feed-forward network
            and on every sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)

        ff_stages = [
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        ]
        self.feed_forward = nn.Sequential(*ff_stages)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, memory, src_mask=None, tgt_mask=None):
        """Run the layer.

        Args:
            x: Decoder-side input.
            memory: Tensor used as keys and values for cross-attention.
            src_mask: Optional mask for the cross-attention scores.
            tgt_mask: Optional mask for the self-attention scores.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # 1) attend over the sequence itself
        hidden = self.norm1(x + self.dropout(self.self_attn(x, x, x, mask=tgt_mask)))

        # 2) attend over the supplied memory
        attended = self.cross_attn(hidden, memory, memory, mask=src_mask)
        hidden = self.norm2(hidden + self.dropout(attended))

        # 3) position-wise feed-forward
        return self.norm3(hidden + self.dropout(self.feed_forward(hidden)))

class TransformerDecoder(nn.Module):
    """Sequence classifier: GPT-2 embeddings, decoder layers, mean pooling, linear head.

    Args:
        num_classes: Number of output classes.
        d_model: Model width. The default is a literal so the class no longer
            needs a module-level ``d_model`` to exist when it is defined.
        num_heads: Attention heads per layer.
        d_ff: Hidden width of each layer's feed-forward network.
        num_layers: Number of stacked decoder layers.
        dropout: Dropout probability.
        causal: When true, self-attention uses a lower-triangular mask so a
            position cannot attend to later ones. Defaults to False, which
            matches the earlier behaviour where the mask was built on every
            call but never handed to the layers.
    """

    def __init__(self, num_classes, d_model=512, num_heads=8, d_ff=2048, num_layers=2, dropout=0.1,
                 causal=False):
        super(TransformerDecoder, self).__init__()

        self.causal = causal

        self.embedding = PretrainedGPTEmbedding('gpt2', d_model)
        self.pos_encoder = PositionalEncoding(d_model)

        self.layers = nn.ModuleList([
            TransformerDecoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])

        self.fc_out = nn.Linear(d_model, num_classes)
        self.dropout = nn.Dropout(dropout)
        self.loss = nn.CrossEntropyLoss()

    def generate_mask(self, seq_len, device=None):
        """Return a (1, seq_len, seq_len) lower-triangular mask of ones."""
        return torch.tril(torch.ones(seq_len, seq_len, device=device)).unsqueeze(0)

    def forward(self, tgt, label=None, infer=False):
        """Classify a batch of token-id sequences.

        Args:
            tgt: Integer token ids indexed as (batch, sequence).
            label: Class indices for the batch; required unless ``infer``.
            infer: When true, return the class logits instead of the loss.

        Returns:
            Logits of shape (batch, num_classes) when ``infer`` is true,
            otherwise the scalar cross-entropy loss.

        Raises:
            ValueError: If ``infer`` is false and ``label`` is None.
        """
        if not infer and label is None:
            raise ValueError('label is required unless infer=True')

        x = self.dropout(self.pos_encoder(self.embedding(tgt)))

        # There is no encoder, so cross-attention gets an all-zero placeholder.
        # It is shaped from the actual batch rather than from module-level
        # batch_size/seq_len/d_model variables. With identical (zero) memory
        # rows every key and value is the same, so the attention output does
        # not depend on the memory length; one position is sufficient.
        memory = x.new_zeros(x.size(0), 1, x.size(-1))

        tgt_mask = self.generate_mask(tgt.size(1), device=tgt.device) if self.causal else None

        for layer in self.layers:
            x = layer(x, memory, tgt_mask=tgt_mask)

        # NOTE(review): the mean runs over every position, including any
        # padding tokens in tgt -- confirm that is intended.
        logits = self.fc_out(torch.mean(x, dim=1))
        if infer:
            return logits
        return self.loss(logits, label)

class YahooDataset(Dataset):
    """Dataset of tokenized question/answer texts with zero-based class labels.

    The text of a row is its ``question_title``, ``question_content`` and
    ``best_answer`` columns joined by the literal string ``' [SEP] '``, with
    missing values replaced by empty strings. Labels are the ``class`` column
    minus one.

    Args:
        dataframe: Frame with the four columns named above.
        tokenizer: Object providing ``encode_plus``.
        max_length: Length every example is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        title = dataframe['question_title'].fillna('')
        content = dataframe['question_content'].fillna('')
        answer = dataframe['best_answer'].fillna('')
        self.texts = title + ' [SEP] ' + content + ' [SEP] ' + answer

        # Shift the labels down by one so they start at 0.
        self.labels = dataframe['class'] - 1

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors.

        Returns:
            Dict with flattened ``input_ids`` and ``attention_mask`` tensors
            and a scalar int64 ``label`` tensor.
        """
        encoded = self.tokenizer.encode_plus(
            str(self.texts.iloc[idx]),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        target = torch.tensor(self.labels.iloc[idx], dtype=torch.long)

        item = {}
        item['input_ids'] = encoded['input_ids'].flatten()
        item['attention_mask'] = encoded['attention_mask'].flatten()
        item['label'] = target
        return item

def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Callable as ``model(input_ids, label=label)`` returning a
            scalar loss tensor.
        dataloader: Iterable of batches with ``'input_ids'`` and ``'label'``
            tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Mean loss over the batches seen, or 0.0 if the loader was empty.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    progress_bar = tqdm(dataloader, desc='Training')

    for batch in progress_bar:
        optimizer.zero_grad()

        # The model is not given an attention mask, so only the tensors it
        # actually consumes are copied to the device.
        input_ids = batch['input_ids'].to(device)
        label = batch['label'].to(device)

        loss = model(
            input_ids,
            label=label
        )

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Read the scalar once; it is reused for the total and the progress bar.
        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1

        progress_bar.set_postfix({'loss': f'{batch_loss:.4f}'})

    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

def evaluate(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and build a classification report.

    The model is switched to eval mode and left in it.

    Args:
        model: Callable as ``model(input_ids, infer=True)`` returning
            per-class scores of shape (batch, num_classes).
        dataloader: Iterable of batches with ``'input_ids'`` and ``'label'``
            tensors.
        device: Device the input ids are moved to.

    Returns:
        The value returned by ``classification_report`` for the collected
        labels and arg-max predictions (``zero_division=0``).
    """
    model.eval()
    true_labels = []
    predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)

            outputs = model(
                input_ids,
                infer=True
            )

            preds = torch.argmax(outputs, dim=1)
            # Labels are only needed for the report, so they are read straight
            # from the batch instead of being copied to the device and back.
            true_labels.extend(batch['label'].tolist())
            predictions.extend(preds.tolist())

    return classification_report(true_labels, predictions, zero_division=0)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Both CSV files are read with the same explicit column names.
print('Loading datasets...')
csv_columns = ['class', 'question_title', 'question_content', 'best_answer']
train_df = pd.read_csv('train.csv', names=csv_columns)


def _sample_class(frame, class_idx, limit):
    """Return up to ``limit`` rows of ``frame`` whose class is ``class_idx``."""
    rows = frame[frame['class'] == class_idx]
    return rows.sample(n=min(limit, len(rows)), random_state=42)


# Balance the training set: at most 10,000 rows from each of classes 1..10.
samples_per_class = 10000
sampled_train_df = [
    _sample_class(train_df, class_idx, samples_per_class)
    for class_idx in range(1, 11)
]
train_df = pd.concat(sampled_train_df, ignore_index=True)
print(f'Training with {len(train_df)} examples')

# The test split is used in full.
test_df = pd.read_csv('test.csv', names=csv_columns)

# One class name per line.
with open('classes.txt', 'r') as f:
    class_names = [line.strip() for line in f]

# GPT-2 ships without a padding token, so the end-of-sequence token is reused.
print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

train_dataset = YahooDataset(train_df, tokenizer)
test_dataset = YahooDataset(test_df, tokenizer)

# Training batches are drawn in random order; test batches keep file order.
batch_size = 32  # Can use larger batch size with smaller model
train_loader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size,
)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


Using device: cuda:0
Loading datasets...
Training with 100000 examples
Loading tokenizer...


: 

In [None]:

# Hyperparameters for this run. They are passed to the model explicitly below;
# seq_len and batch_size are kept as module-level names for code that reads them.
d_model = 512
num_heads = 8
d_ff = 2048
seq_len = 1024
batch_size = 32
num_layers=3

# Create decoder
model = TransformerDecoder(10, d_model = d_model, num_heads = num_heads, d_ff = d_ff, num_layers=num_layers).to(device)

total_params, trainable_params = count_parameters(model)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

size_in_mb = get_model_size(model)
print(f"Model size: {size_in_mb:.2f} MB")
model.train()

# Training settings
epochs = 30
# NOTE(review): AdamW here is the one imported from transformers; newer
# transformers releases are reported to drop it in favour of torch.optim.AdamW
# (whose defaults differ) -- confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=5e-5)  # Slightly higher learning rate
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


def _accuracy_from_report(report):
    """Return the overall accuracy printed in a text classification report.

    The value is read from the row labelled ``accuracy``. The last row of the
    report is the weighted average, whose second-to-last field is an F1 score,
    so picking the row by position reports the wrong metric.
    """
    for line in report.splitlines():
        fields = line.split()
        if fields and fields[0] == 'accuracy':
            return float(fields[1])
    raise ValueError('no accuracy row found in classification report')


# Training loop
print('Starting training...')
best_accuracy = 0

for epoch in range(epochs):
    print(f'\nEpoch {epoch + 1}/{epochs}')

    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    print(f'Average training loss: {train_loss:.4f}')

    print('\nEvaluating...')
    report = evaluate(model, test_loader, device)
    print('\nClassification Report:')
    print(report)

    # Save model if it improves
    # NOTE(review): the checkpoint name says "2layer" while num_layers is 3 here.
    accuracy = _accuracy_from_report(report)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), 'best_yahoo_tf_2layer.pt')
        print(f'Saved best model with accuracy: {accuracy:.4f}')


Total parameters: 137450762
Trainable parameters: 98066954
Model size: 524.33 MB
Starting training...

Epoch 1/30


Training: 100%|██████████| 3125/3125 [10:57<00:00,  4.75it/s, loss=1.0235]


Average training loss: 1.7952

Evaluating...


Evaluating: 100%|██████████| 1875/1875 [04:32<00:00,  6.89it/s]



Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.46      0.53      6000
           1       0.67      0.74      0.70      6000
           2       0.68      0.79      0.73      6000
           3       0.55      0.36      0.44      6000
           4       0.82      0.81      0.82      6000
           5       0.78      0.83      0.80      6000
           6       0.54      0.46      0.50      6000
           7       0.60      0.64      0.62      6000
           8       0.67      0.72      0.70      6000
           9       0.64      0.80      0.71      6000

    accuracy                           0.66     60000
   macro avg       0.66      0.66      0.65     60000
weighted avg       0.66      0.66      0.65     60000

Saved best model with accuracy: 0.6500

Epoch 2/30


Training: 100%|██████████| 3125/3125 [17:31<00:00,  2.97it/s, loss=0.9488]


Average training loss: 1.0409

Evaluating...


Evaluating: 100%|██████████| 1875/1875 [04:39<00:00,  6.72it/s]



Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.52      0.54      6000
           1       0.72      0.68      0.70      6000
           2       0.67      0.83      0.75      6000
           3       0.47      0.53      0.50      6000
           4       0.83      0.82      0.83      6000
           5       0.84      0.83      0.83      6000
           6       0.59      0.50      0.54      6000
           7       0.74      0.59      0.65      6000
           8       0.70      0.73      0.71      6000
           9       0.67      0.78      0.72      6000

    accuracy                           0.68     60000
   macro avg       0.68      0.68      0.68     60000
weighted avg       0.68      0.68      0.68     60000

Saved best model with accuracy: 0.6800

Epoch 3/30


Training: 100%|██████████| 3125/3125 [17:35<00:00,  2.96it/s, loss=1.2348]


Average training loss: 0.9740

Evaluating...


Evaluating:  16%|█▌        | 302/1875 [00:45<04:00,  6.53it/s]

In [None]:

def get_model_size(model):
    """Return the storage taken by the model's parameters, in MiB.

    Every parameter contributes ``numel() * element_size()`` bytes, so the
    figure is unchanged for float32 models (4 bytes per value) and is also
    correct for half-precision or mixed-dtype models.

    Args:
        model: Object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).

    Returns:
        Parameter size in mebibytes as a float. Only what ``parameters()``
        yields is counted.
    """
    total_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
    return total_bytes / (1024 ** 2)  # bytes -> MiB


d_model = 512
num_heads = 8
d_ff = 128
seq_len = 1024
batch_size = 32

# Create decoder
# Keyword defaults of TransformerDecoder are fixed when the class is defined,
# so reassigning the variables above does not change them. Pass the values
# explicitly so the measured model really uses this cell's settings.
model = TransformerDecoder(10, d_model=d_model, num_heads=num_heads, d_ff=d_ff).to(device)


size_in_mb = get_model_size(model)
print(f"Model size: {size_in_mb:.2f} MB")

In [None]:
d_model = 512
num_heads = 8
d_ff = 512
seq_len = 1024
batch_size = 32

# Create decoder
# Keyword defaults of TransformerDecoder are fixed when the class is defined,
# so reassigning the variables above does not change them. Pass the values
# explicitly so the architecture is the one stated in this cell.
# NOTE(review): these must match the settings the checkpoint was trained with;
# load_state_dict below fails on a mismatch -- confirm d_ff and layer count.
model = TransformerDecoder(10, d_model=d_model, num_heads=num_heads, d_ff=d_ff).to(device)

model_path = 'best_yahoo_gpt2_2layer.pt'

# Load the state dict
state_dict = torch.load(model_path, map_location='cpu')

# Load the state dictionary into the model
model.load_state_dict(state_dict)

# Set the model to evaluation mode
model.eval()

report = evaluate(model, test_loader, device)
print('\nClassification Report:')
print(report)

Evaluating: 100%|██████████| 1875/1875 [01:57<00:00, 15.90it/s]


Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.58      0.58      6000
           1       0.66      0.80      0.73      6000
           2       0.75      0.78      0.77      6000
           3       0.55      0.49      0.52      6000
           4       0.82      0.87      0.84      6000
           5       0.88      0.83      0.85      6000
           6       0.67      0.45      0.54      6000
           7       0.69      0.68      0.68      6000
           8       0.63      0.83      0.72      6000
           9       0.81      0.69      0.74      6000

    accuracy                           0.70     60000
   macro avg       0.70      0.70      0.70     60000
weighted avg       0.70      0.70      0.70     60000




