<h4> AA, SS --> 1-9</h4>

In [1]:
# Read the three input files line by line, stripping surrounding whitespace
# from every line. Presumably sekw.txt holds the primary sequences and
# drugo.txt the matching secondary-structure strings -- confirm against the data.
with open("sekw.txt", "r") as fh:
    sekwencje = list(map(str.strip, fh))

with open("drugo.txt", "r") as fh:
    drugorzedowe = list(map(str.strip, fh))

# Each character of a stripped line in wart.txt is parsed as one integer,
# so every line becomes a list of single-digit values.
with open("wart.txt", "r") as fh:
    wartosci = [list(map(int, wiersz.strip())) for wiersz in fh]

In [2]:
# Number of leading records taken from the input files (the beginning of the data).
N_RECORDS = 557

# Fail early with an explicit message instead of a bare IndexError when one of
# the input lists is shorter than the requested number of records.
_available = min(len(sekwencje), len(drugorzedowe), len(wartosci))
if _available < N_RECORDS:
    raise ValueError(
        f"Expected at least {N_RECORDS} records in every input file, "
        f"but only {_available} are available."
    )

# One (sequence, secondary structure, labels) triple per record.
raw_data = list(zip(
    sekwencje[:N_RECORDS],
    drugorzedowe[:N_RECORDS],
    wartosci[:N_RECORDS],
))

# Downstream code pairs the two strings and the labels position by position,
# so a record whose three fields differ in length would be silently misaligned
# by batch padding. Reject such records up front.
_misaligned = [
    idx for idx, (seq, ss, labels) in enumerate(raw_data)
    if not (len(seq) == len(ss) == len(labels))
]
if _misaligned:
    raise ValueError(
        f"{len(_misaligned)} record(s) have mismatched sequence/structure/label "
        f"lengths; first offending indices: {_misaligned[:10]}"
    )

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader, random_split



# =========================
# 1. Dane
# =========================

# Index 0 is reserved for padding. Previously a real character also received
# index 0, so it shared the zeroed, non-trainable padding embedding row.
PAD_IDX = 0
PAD_TOKEN = "<pad>"

# One alphabet shared by both input strings. Sorting the union makes the
# character -> index mapping identical across kernel restarts (plain set
# iteration order of strings varies between Python processes), which is
# required for a saved state dict to be usable later.
# NOTE(review): a letter occurring in both inputs maps to a single index, as it
# did before (duplicate dict keys collapsed); checkpoints saved with the old,
# unordered vocabulary are not compatible with this mapping.
vocab = [PAD_TOKEN] + sorted(set("".join(sekwencje)) | set("".join(drugorzedowe)))
char2idx = {c: i for i, c in enumerate(vocab)}
vocab_size = len(vocab)  # includes the padding entry
num_classes = 10  # labels 0-9
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def encode(seq):
    """Translate a string into a 1-D int64 tensor of vocabulary indices.

    Every character of ``seq`` is looked up in the module-level ``char2idx``
    mapping; a character missing from the mapping raises ``KeyError``.
    """
    indices = list(map(char2idx.__getitem__, seq))
    return torch.tensor(indices, dtype=torch.long)

class SequenceDataset(Dataset):
    """In-memory dataset of pre-encoded (seq1, seq2, labels) tensor triples.

    Args:
        data: iterable of ``(s1, s2, labels)`` records, where ``s1`` and ``s2``
            are strings accepted by ``encode`` and ``labels`` is a sequence of
            integer class ids.

    Each record is converted once, at construction time, into three tensors.
    """

    def __init__(self, data):
        # Labels are pinned to int64, matching ``encode``. Without an explicit
        # dtype an empty label list would be inferred as float32, which is not
        # a valid class-index target and cannot be padded with int64 labels.
        self.samples = [
            (encode(s1), encode(s2), torch.tensor(labels, dtype=torch.long))
            for s1, s2, labels in data
        ]

    def __len__(self):
        """Return the number of stored records."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the ``(seq1, seq2, labels)`` tensor triple at position ``idx``."""
        return self.samples[idx]

dataset = SequenceDataset(raw_data)

# =========================
# 2. Train/val/test split
# =========================

total_size = len(dataset)
train_size, val_size = int(0.7 * total_size), int(0.15 * total_size)
# The test split takes whatever is left, so the three sizes always add up
# to the full dataset despite the rounding above.
test_size = total_size - (train_size + val_size)

# Dedicated, fixed-seed generator so the split is the same on every run.
split_generator = torch.Generator()
split_generator.manual_seed(42)

train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

# =========================
# 3. Collate z paddingiem
# =========================

def collate_fn(batch):
    """Assemble a list of (seq1, seq2, labels) samples into padded batch tensors.

    Returns:
        A 4-tuple ``(seq1, seq2, labels, lengths)``: both index tensors are
        right-padded with ``PAD_IDX``, the labels are right-padded with -100,
        and ``lengths`` holds the unpadded length of every first sequence.
    """
    first_seqs, second_seqs, targets = zip(*batch)

    def _pad(tensors, fill):
        # Right-pad to the longest item; batch dimension comes first.
        return pad_sequence(tensors, batch_first=True, padding_value=fill)

    true_lengths = torch.tensor(list(map(len, first_seqs)))

    return (
        _pad(first_seqs, PAD_IDX),
        _pad(second_seqs, PAD_IDX),
        _pad(targets, -100),
        true_lengths,
    )

# All three loaders share batch size and collate function; only the training
# loader reshuffles its samples.
loader_options = {"batch_size": 16, "collate_fn": collate_fn}

train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

# =========================
# 4. Model
# =========================

class AdvancedDualRNN(nn.Module):
    """Per-position classifier over two aligned index sequences.

    Both inputs go through one shared embedding table, the two embeddings are
    concatenated per position, and a 2-layer bidirectional GRU followed by
    LayerNorm, dropout and a linear layer produces one logit vector per position.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding size per input sequence (the GRU sees ``2 * embed_dim``).
        hidden_dim: GRU hidden size per direction.
        num_classes: number of output classes per position.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_IDX)

        self.rnn = nn.GRU(
            embed_dim*2,
            hidden_dim,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=0.3
        )

        # Bidirectional output concatenates both directions -> hidden_dim * 2.
        self.layer_norm = nn.LayerNorm(hidden_dim*2)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim*2, num_classes)

    def forward(self, seq1, seq2, lengths):
        """Compute per-position class logits.

        Args:
            seq1, seq2: index tensors; assumed to be (batch, time) and of equal
                shape, since their embeddings are concatenated on the last axis.
            lengths: unpadded length of every sequence in the batch.

        Returns:
            Logits of shape (batch, seq1.size(1), num_classes).
        """
        emb1 = self.embedding(seq1)
        emb2 = self.embedding(seq2)
        x = torch.cat([emb1, emb2], dim=-1)

        # Packing lets the GRU skip padded positions; lengths must live on CPU.
        packed = pack_padded_sequence(
            x, lengths.cpu(),
            batch_first=True,
            enforce_sorted=False
        )

        packed_out, _ = self.rnn(packed)
        # total_length keeps the time axis equal to the input's padded length.
        # Without it the output is cut to max(lengths), which no longer lines up
        # with the labels when the batch is padded beyond its longest sequence.
        out, _ = pad_packed_sequence(
            packed_out,
            batch_first=True,
            total_length=seq1.size(1)
        )

        out = self.layer_norm(out)
        out = self.dropout(out)

        return self.fc(out)

# Instantiate the network with 32-dimensional embeddings and 64 hidden units
# per GRU direction, then move it to the selected device.
model = AdvancedDualRNN(
    vocab_size=vocab_size,
    embed_dim=32,
    hidden_dim=64,
    num_classes=num_classes,
)
model = model.to(device)

# Targets equal to -100 (the label padding value) are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = optim.Adam(params=model.parameters(), lr=0.01)

# =========================
# 5. Funkcja ewaluacji
# =========================

def evaluate(loader):
    """Evaluate the module-level ``model`` on every batch of ``loader``.

    Args:
        loader: iterable yielding ``(seq1, seq2, labels, lengths)`` batches.

    Returns:
        ``(mean_loss, accuracy)``, both averaged over labelled tokens, i.e.
        positions whose label differs from the criterion's ignore index.

    Raises:
        ZeroDivisionError: if the loader yields no labelled token at all.

    Both metrics are weighted per token. Averaging the per-batch mean losses
    instead would make the loss depend on how samples are grouped into batches
    and would be inconsistent with the token-level accuracy.
    Assumes ``criterion`` returns the mean loss over non-ignored tokens.
    """
    model.eval()
    ignore_index = getattr(criterion, "ignore_index", -100)
    loss_sum = 0.0
    correct = 0
    total_tokens = 0

    with torch.no_grad():
        for seq1, seq2, labels, lengths in loader:

            seq1 = seq1.to(device)
            seq2 = seq2.to(device)
            labels = labels.to(device)
            lengths = lengths.to(device)

            mask = labels != ignore_index
            n_tokens = mask.sum().item()
            if n_tokens == 0:
                # A mean over zero tokens is NaN and would poison the total.
                continue

            outputs = model(seq1, seq2, lengths)

            loss = criterion(
                outputs.view(-1, num_classes),
                labels.view(-1)
            )

            # Undo the per-batch mean so every token carries equal weight.
            loss_sum += loss.item() * n_tokens

            preds = torch.argmax(outputs, dim=-1)

            correct += (preds[mask] == labels[mask]).sum().item()
            total_tokens += n_tokens

    return loss_sum / total_tokens, correct / total_tokens

In [4]:
# Test-set metrics at this point of the notebook; in cell order this runs
# before the training cell, so it reflects the model as initialised.
test_loss, test_acc = evaluate(loader=test_loader)

report_lines = [
    "\n==== TEST RESULT ====",
    f"Test Loss: {test_loss:.4f}",
    f"Test Accuracy: {test_acc:.4f}",
]
print("\n".join(report_lines))


==== TEST RESULT ====
Test Loss: 2.6932
Test Accuracy: 0.0209


In [5]:
# =========================
# 6. Trening + walidacja
# =========================

epochs = 20

for epoch_no in range(1, epochs + 1):

    model.train()
    batch_losses = []

    for batch in train_loader:
        # Move the whole batch (inputs, labels, lengths) to the compute device.
        seq1, seq2, labels, lengths = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()

        logits = model(seq1, seq2, lengths)

        # Flatten batch and time so each position is one classification sample.
        loss = criterion(
            logits.view(-1, num_classes),
            labels.view(-1)
        )

        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Mean of the per-batch training losses for this epoch.
    train_loss = sum(batch_losses) / len(train_loader)

    val_loss, val_acc = evaluate(val_loader)

    print(f"Epoch {epoch_no}/{epochs} | "
          f"Train Loss: {train_loss:.4f} | "
          f"Val Loss: {val_loss:.4f} | "
          f"Val Acc: {val_acc:.4f}")

# =========================
# 7. Final test
# =========================

test_loss, test_acc = evaluate(test_loader)

final_report = [
    "\n==== TEST RESULT ====",
    f"Test Loss: {test_loss:.4f}",
    f"Test Accuracy: {test_acc:.4f}",
]
print("\n".join(final_report))

Epoch 1/20 | Train Loss: 1.3888 | Val Loss: 0.8941 | Val Acc: 0.5443
Epoch 2/20 | Train Loss: 0.9569 | Val Loss: 0.8625 | Val Acc: 0.5492
Epoch 3/20 | Train Loss: 0.9264 | Val Loss: 0.8354 | Val Acc: 0.5971
Epoch 4/20 | Train Loss: 0.8948 | Val Loss: 0.8696 | Val Acc: 0.5574
Epoch 5/20 | Train Loss: 0.8762 | Val Loss: 0.8155 | Val Acc: 0.6118
Epoch 6/20 | Train Loss: 0.8393 | Val Loss: 0.8272 | Val Acc: 0.6040
Epoch 7/20 | Train Loss: 0.8142 | Val Loss: 0.7956 | Val Acc: 0.6281
Epoch 8/20 | Train Loss: 0.7729 | Val Loss: 0.7733 | Val Acc: 0.6401
Epoch 9/20 | Train Loss: 0.7508 | Val Loss: 0.7940 | Val Acc: 0.6335
Epoch 10/20 | Train Loss: 0.7138 | Val Loss: 0.7576 | Val Acc: 0.6586
Epoch 11/20 | Train Loss: 0.6744 | Val Loss: 0.7615 | Val Acc: 0.6600
Epoch 12/20 | Train Loss: 0.6617 | Val Loss: 0.7648 | Val Acc: 0.6605
Epoch 13/20 | Train Loss: 0.6326 | Val Loss: 0.7985 | Val Acc: 0.6624
Epoch 14/20 | Train Loss: 0.6115 | Val Loss: 0.7578 | Val Acc: 0.6721
Epoch 15/20 | Train Loss: 0.5

In [6]:
# Inverse lookup: vocabulary index -> character.
idx2char = {index: symbol for symbol, index in char2idx.items()}

def letter_accuracy(loader):
    """Compute prediction accuracy separately for every character of seq1.

    Runs the module-level ``model`` over ``loader`` and, for each unpadded
    position, attributes the hit or miss to the character found in the first
    input sequence at that position.

    Returns:
        Dict mapping each encountered character to ``correct / total``.
    """
    model.eval()

    seen = defaultdict(int)   # character -> number of positions observed
    hits = defaultdict(int)   # character -> number of correct predictions

    with torch.no_grad():
        for seq1, seq2, labels, lengths in loader:

            seq1 = seq1.to(device)
            seq2 = seq2.to(device)
            labels = labels.to(device)
            lengths = lengths.to(device)

            outputs = model(seq1, seq2, lengths)
            preds = torch.argmax(outputs, dim=-1)

            # Convert once per batch to plain Python lists.
            token_rows = seq1.tolist()
            pred_rows = preds.tolist()
            label_rows = labels.tolist()
            valid_counts = lengths.tolist()

            for row in range(len(token_rows)):        # over the batch
                tokens = token_rows[row]
                guessed = pred_rows[row]
                expected = label_rows[row]

                for pos in range(valid_counts[row]):  # unpadded positions only
                    letter = idx2char[tokens[pos]]

                    seen[letter] += 1
                    if guessed[pos] == expected[pos]:
                        hits[letter] += 1

    # Accuracy per character, in order of first appearance.
    return {letter: hits[letter] / count for letter, count in seen.items()}

# Per-character accuracy of the model on the test split.
test_letter_acc = letter_accuracy(loader=test_loader)
print(test_letter_acc)

{'G': 0.6606822262118492, 'A': 0.5659340659340659, 'S': 0.7649484536082474, 'R': 0.8248847926267281, 'L': 0.5804347826086956, 'I': 0.6245847176079734, 'M': 1.0, 'P': 0.6995192307692307, 'K': 0.5384615384615384, 'T': 0.6966824644549763, 'V': 0.5940860215053764, 'H': 0.6877828054298643, 'F': 0.7612456747404844, 'E': 0.5882352941176471, 'D': 0.7645429362880887, 'N': 0.6988636363636364, 'Q': 0.7947019867549668, 'Y': 0.6883116883116883, 'W': 1.0, 'C': 0.692063492063492}


In [7]:
# Persist only the model's parameters (its state dict), not the module object.
trained_weights = model.state_dict()
torch.save(trained_weights, "GRU_aa_SS.pth")