In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import re
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load('en_core_web_sm')
tokenizer = nlp.tokenizer

In [2]:
# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, source_texts, summary_texts, vocab):
        self.source_texts = source_texts
        self.summary_texts = summary_texts
        self.vocab = vocab

    def __len__(self):
        return len(self.source_texts)

    def __getitem__(self, idx):
        source_text = self.source_texts[idx]
        summary_text = self.summary_texts[idx]
        source_tensor = torch.tensor([self.vocab.get(token, 0) for token in source_text], dtype=torch.long)
        summary_tensor = torch.tensor([self.vocab.get(token, 0) for token in summary_text], dtype=torch.long)
        return source_tensor, summary_tensor



# This part was taken from another notebook uses the same dataset
## Just readding the data

In [3]:
def create_dataframe(source_text_path, target_text_path):
    txt_files_source = [file for file in os.listdir(source_text_path) if file.endswith('.txt')]
    txt_files_target = [file for file in os.listdir(target_text_path) if file.endswith('.txt')]
    df = pd.DataFrame(columns=['headlines', 'text'])
    for source, target in zip(txt_files_source, txt_files_target):
        assert source == target
        source_file_path = os.path.join(source_text_path, source)
        target_file_path = os.path.join(target_text_path, target)
        with open(source_file_path, 'r', encoding='latin-1') as file:
            source_text = file.read()
        with open(target_file_path, 'r', encoding='latin-1') as file:
            target_text = file.read()
        df.loc[len(df.index)] = [source_text, target_text]
    return df



# Trying to solve Craching problem!
# and protect my self from out of memory problems

In [4]:
def save_checkpoint(state, filename='weights.pth.tar'):
    print('Saving weights-->')
    torch.save(state, filename)

def load_checkpoint(checkpoint, model, optim):
    print('Loading weights-->')
    model.load_state_dict(checkpoint['state_dict'])
    optim.load_state_dict(checkpoint['optimizer'])

# Just start with one dataframe for now
## it is not big but it will work fine

In [5]:
# Load datasets
df1 = create_dataframe("/kaggle/input/bbc-news-summary/BBC News Summary/News Articles/business", 
                       "/kaggle/input/bbc-news-summary/BBC News Summary/Summaries/business")
df2 = create_dataframe("/kaggle/input/bbc-news-summary/BBC News Summary/News Articles/entertainment", 
                       "/kaggle/input/bbc-news-summary/BBC News Summary/Summaries/entertainment")
df3 = create_dataframe("/kaggle/input/bbc-news-summary/BBC News Summary/News Articles/politics", 
                       "/kaggle/input/bbc-news-summary/BBC News Summary/Summaries/politics")
df4 = create_dataframe("/kaggle/input/bbc-news-summary/BBC News Summary/News Articles/sport", 
                      "/kaggle/input/bbc-news-summary/BBC News Summary/Summaries/sport")
df5 = create_dataframe("/kaggle/input/bbc-news-summary/BBC News Summary/News Articles/tech", 
                      "/kaggle/input/bbc-news-summary/BBC News Summary/Summaries/tech")

# Combine data into a single DataFrame
df = pd.concat([df1,df2,df3,df4],ignore_index=True)



# Data Cleanning
### Just copied these words from teh internet

In [6]:
contraction_mapping = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot",
    "'cause": "because", "could've": "could have", "couldn't": "could not",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is",
    "how'd": "how did", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'll": "I will", "I'm": "I am",
    "isn't": "is not", "it'd": "it would", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mightn't": "might not",
    "mustn't": "must not", "needn't": "need not", "shan't": "shall not",
    "she'd": "she would", "she's": "she is", "shouldn't": "should not",
    "that's": "that is", "there's": "there is", "they're": "they are",
    "we're": "we are", "won't": "will not", "wouldn't": "would not",
    "y'all": "you all", "you're": "you are"
}


stop_words = set([
    "a", "about", "above", "after", "again", "against", "all", "am", "an",
    "and", "any", "are", "aren't", "as", "at", "be", "because", "been", 
    "before", "being", "below", "between", "both", "but", "by", "can't", 
    "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", 
    "doing", "don't", "down", "during", "each", "few", "for", "from", "further", 
    "had", "hadn't", "has", "hasn't", "have", "haven't", "he", "he'd", 
    "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", 
    "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", 
    "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", 
    "just", "ll", "m", "ma", "me", "might", "mightn't", "more", "most", 
    "must", "mustn't", "my", "myself", "need", "needn't", "no", "nor", 
    "not", "now", "o", "of", "off", "on", "once", "only", "or", "other", 
    "our", "ours", "ourselves", "out", "over", "own", "re", "s", "same", 
    "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", 
    "so", "some", "such", "t", "than", "that", "that's", "the", "their", 
    "theirs", "them", "themselves", "then", "there", "there's", "these", 
    "they", "they'd", "they'll", "they're", "they've", "this", "those", 
    "through", "to", "too", "under", "until", "up", "ve", "very", "was", 
    "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", 
    "what", "what's", "when", "where", "where's", "which", "while", "who", 
    "who's", "whom", "why", "why's", "will", "with", "won't", "would", 
    "wouldn't", "y", "you", "you'd", "you'll", "you're", "you've", "your", 
    "yours", "yourself", "yourselves"
])


# the same Cleanner I used in the privious projects

In [7]:
def text_cleaner(text):
    newString = text.lower()
    newString = newString.replace('"', "'")
    newString = re.sub(r'\([^)]*\)', '', newString)
    newString = re.sub('"', '', newString)
    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])    
    newString = re.sub(r"'s\b", "", newString)
    newString = re.sub("[^a-zA-Z]", " ", newString) 
    tokens = [w for w in newString.split() if not w in stop_words]
    return " ".join(tokens)



# Train-Test Split


In [8]:
# Split into train and test sets
df = df.rename(columns={"headlines": "source_text", "text": "summary_text"})
X, Y = df["source_text"], df["summary_text"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
train_df = pd.DataFrame({'source_text': X_train, 'summary_text': Y_train})
test_df = pd.DataFrame({'source_text': X_test, 'summary_text': Y_test})

# Preprocess the DataFrame
train_df['source_text'] = train_df['source_text'].apply(lambda x: [token.text.lower() for token in tokenizer(text_cleaner(x))])
train_df['summary_text'] = train_df['summary_text'].apply(lambda x: [token.text.lower() for token in tokenizer(text_cleaner(x))])
test_df['source_text'] = test_df['source_text'].apply(lambda x: [token.text.lower() for token in tokenizer(text_cleaner(x))])
test_df['summary_text'] = test_df['summary_text'].apply(lambda x: [token.text.lower() for token in tokenizer(text_cleaner(x))])


### Create my vocabulary from the training dataset that maps each token to a unique integer index.

In [9]:
# Build vocabularies
all_tokens = train_df['source_text'].tolist() + train_df['summary_text'].tolist()
vocab = {word: idx for idx, word in enumerate(set(token for tokens in all_tokens for token in tokens))}

# Multi-Head Attention Layer
### this is the multi-head attention mechanism used in the Transformer architecture. Nothing new in this code

In [10]:
# Define the Multi-Head Attention layer
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.depth = d_model // n_heads
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)
        self.dense = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        x = x.view(batch_size, -1, self.n_heads, self.depth)
        return x.permute(0, 2, 1, 3)  # (batch_size, n_heads, seq_length, depth)

    def forward(self, query, key, value):
        batch_size = query.size(0)
        query = self.split_heads(self.wq(query), batch_size)
        key = self.split_heads(self.wk(key), batch_size)
        value = self.split_heads(self.wv(value), batch_size)

        # Scaled Dot-Product Attention
        matmul_qk = torch.matmul(query, key.transpose(-1, -2))
        dk = query.size(-1)
        scaled_attention_logits = matmul_qk / (dk ** 0.5)
        attention_weights = F.softmax(scaled_attention_logits, dim=-1)

        output = torch.matmul(attention_weights, value)
        output = output.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)
        return self.dense(output)



# I will use Transformer including the encoder and decoder layers.


In [11]:
# Define the Transformer model
class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, n_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder = nn.Embedding(vocab_size, d_model)
        self.decoder = nn.Embedding(vocab_size, d_model)
        self.attention = MultiHeadAttention(d_model, n_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.dropout = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.n_layers = n_layers
        
        # Add a linear layer to project to vocab size
        self.output_layer = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        # Encoder
        enc_output = self.encoder(src)
        enc_output = self.dropout(enc_output)

        # Decoder
        dec_output = self.decoder(tgt)
        dec_output = self.dropout(dec_output)

        for _ in range(self.n_layers):
            attn_output = self.attention(dec_output, enc_output, enc_output)
            dec_output = self.norm1(dec_output + attn_output)
            ffn_output = self.ffn(dec_output)
            dec_output = self.norm2(dec_output + ffn_output)

        # Project the output to vocab size
        output = self.output_layer(dec_output)  # Now this has shape [batch_size, seq_len, vocab_size]
        
        return output


## Just tring to solve the length problem

In [12]:
# Pad source and target batches
def collate_fn(batch):
    source_batch, target_batch = zip(*batch)
    source_batch = torch.nn.utils.rnn.pad_sequence(source_batch, batch_first=True, padding_value=0)
    target_batch = torch.nn.utils.rnn.pad_sequence(target_batch, batch_first=True, padding_value=0)
    return source_batch, target_batch


# Note do not change the Hyperparameters espicialy the bach size as this chan exceed the memory and may crash

In [13]:
# This is my Hyperparameters
batch_size = 16
d_model = 256
num_heads = 8
num_layers = 6
d_ff = 512
max_seq_length = 100
dropout = 0.1
learning_rate = 0.001
num_epochs = 500

### Just to make sure Kaggel works on the GPU

In [14]:
# Define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using", device)

Using cuda


## Creat the dataset and Initialize my transformer

In [15]:
# Create Dataset and DataLoader
train_dataset = CustomDataset(train_df['source_text'].tolist(), train_df['summary_text'].tolist(), vocab)
test_dataset = CustomDataset(test_df['source_text'].tolist(), test_df['summary_text'].tolist(), vocab)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Initialize the model
model = Transformer(len(vocab), d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

accumulation_steps = 2

In [16]:
print(model)

Transformer(
  (encoder): Embedding(21936, 256)
  (decoder): Embedding(21936, 256)
  (attention): MultiHeadAttention(
    (wq): Linear(in_features=256, out_features=256, bias=True)
    (wk): Linear(in_features=256, out_features=256, bias=True)
    (wv): Linear(in_features=256, out_features=256, bias=True)
    (dense): Linear(in_features=256, out_features=256, bias=True)
  )
  (ffn): Sequential(
    (0): Linear(in_features=256, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=256, bias=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (output_layer): Linear(in_features=256, out_features=21936, bias=True)
)


## Training Loop


In [17]:
def check_accuracy(output, labels):
    _, predpos = output.max(1)
    num_samples = len(labels)
    num_correct = (predpos == labels).sum()
    return (num_correct / num_samples) * 100

In [18]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_accuracy = 0
    num_batches = 0

    for i, (src, tgt) in enumerate(train_loader):
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()

        output = model(src, tgt[:, :-1])
        loss = F.cross_entropy(output.view(-1, len(vocab)), tgt[:, 1:].reshape(-1))
        loss = loss / accumulation_steps  # Scale the loss
        loss.backward()

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        # Calc. accuracy
        accuracy = check_accuracy(output.view(-1, len(vocab)), tgt[:, 1:].reshape(-1))
        total_accuracy += accuracy

        # Print progress every 10 batches
        if (i + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}, Accuracy: {accuracy:.2f}%")

    avg_loss = total_loss / num_batches
    avg_accuracy = total_accuracy / num_batches
    print(f"Epoch [{epoch + 1}/{num_epochs}] Average Loss: {avg_loss:.4f}, Average Accuracy: {avg_accuracy:.2f}%")

    # Save model checkpoint at the end of each epoch
    # Just to avoid the close of the terminal in kaggel
    save_checkpoint({'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()})


Epoch [1/500], Step [10/92], Loss: 2.1850, Accuracy: 63.29%
Epoch [1/500], Step [20/92], Loss: 2.8270, Accuracy: 44.47%
Epoch [1/500], Step [30/92], Loss: 2.4604, Accuracy: 46.42%
Epoch [1/500], Step [40/92], Loss: 2.2616, Accuracy: 49.17%
Epoch [1/500], Step [50/92], Loss: 2.0914, Accuracy: 51.28%
Epoch [1/500], Step [60/92], Loss: 2.2665, Accuracy: 49.05%
Epoch [1/500], Step [70/92], Loss: 2.7724, Accuracy: 37.62%
Epoch [1/500], Step [80/92], Loss: 2.6285, Accuracy: 39.11%
Epoch [1/500], Step [90/92], Loss: 2.8932, Accuracy: 34.28%
Epoch [1/500] Average Loss: 2.3905, Average Accuracy: 49.06%
Saving weights-->
Epoch [2/500], Step [10/92], Loss: 1.9835, Accuracy: 53.70%
Epoch [2/500], Step [20/92], Loss: 1.1010, Accuracy: 74.60%
Epoch [2/500], Step [30/92], Loss: 1.9590, Accuracy: 53.93%
Epoch [2/500], Step [40/92], Loss: 2.4101, Accuracy: 44.36%
Epoch [2/500], Step [50/92], Loss: 2.3423, Accuracy: 46.27%
Epoch [2/500], Step [60/92], Loss: 2.2965, Accuracy: 46.79%
Epoch [2/500], Step [

In [19]:
save_checkpoint({'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()})

Saving weights-->


## NOW lets see the performance on the data loader

In [20]:
def evaluate_model(model, data_loader, vocab):
    model.eval()
    total_accuracy = 0
    num_batches = 0
    
    #close the grad to avoid crach and accumilation of memory
    with torch.no_grad():
        for src, tgt in data_loader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt[:, :-1])
            
            accuracy = check_accuracy(output.view(-1, len(vocab)), tgt[:, 1:].reshape(-1))
            total_accuracy += accuracy
            num_batches += 1
    
    # Avg. accuracy
    return total_accuracy / num_batches



## And of course this is not the best test accuarcy but it is fine for my epochs and dataset size used and of couse my architecture

In [21]:
# Evaluate on the test set
test_accuracy = evaluate_model(model, test_loader, vocab)
print(f"Test Accuracy: {test_accuracy:.2f}%")


Test Accuracy: 57.11%
