### DEEP LEARNING

## **TEXT SUMMARIZATION MODEL**
## **BASIC SEQUENCE TO SEQUENCE MODEL**

**IMPORTS**

In [5]:
!pip install transformers rouge_score
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import torch.nn.functional as F
import ast
import re
import torch.optim as optim
from tqdm import tqdm
import copy
from rouge_score import rouge_scorer

torch.manual_seed(42)
np.random.seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Collecting rouge_score
  Using cached rouge_score-0.1.2-py3-none-any.whl
Collecting absl-py (from rouge_score)
  Using cached absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting nltk (from rouge_score)
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk->rouge_score)
  Downloading click-8.2.0-py3-none-any.whl.metadata (2.5 kB)
Using cached absl_py-2.2.2-py3-none-any.whl (135 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Downloading click-8.2.0-py3-none-any.whl (102 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: click, absl-py, nltk, rouge_score
Successfully installed absl-py-2.2.2 click-8.2.0 nltk-3.9.1 rouge_score-0.1.2
Using device: cuda


**DATA LOADING AND PREPROCESSING**

In [6]:
train_df = pd.read_csv('../data/train.csv', header=None)
test_df = pd.read_csv('../data/test.csv', header=None)
val_df = pd.read_csv('../data/validation.csv', header=None)

columns = ['id', 'article', 'summary']
train_df.columns = columns
test_df.columns = columns
val_df.columns = columns

def preprocess_text(text):
    text = text.replace('\n', ' ')
    text = ' '.join(text.split())
    return text

def clean_article_heading(article):
    pattern = r'By\s*\.\s*.*?\s*\.\s*PUBLISHED:\s*\.\s*\d+:\d+\s*EST,\s*\d+\s*[A-Za-z]+\s*\d+\s*\.\s*\|\s*\.\s*UPDATED:\s*\.\s*\d+:\d+\s*EST,\s*\d+\s*[A-Za-z]+\s*\d+\s*\.'
    cleaned_text = re.sub(pattern, '', article)
    return cleaned_text.strip()

train_df['article'] = train_df['article'].apply(preprocess_text)
train_df['article'] = train_df['article'].apply(clean_article_heading)
train_df['summary'] = train_df['summary'].apply(preprocess_text)
test_df['article'] = test_df['article'].apply(preprocess_text)
test_df['article'] = test_df['article'].apply(clean_article_heading)
test_df['summary'] = test_df['summary'].apply(preprocess_text)
val_df['article'] = val_df['article'].apply(preprocess_text)
val_df['article'] = val_df['article'].apply(clean_article_heading)
val_df['summary'] = val_df['summary'].apply(preprocess_text)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

print(f"Training dataframe shape: {train_df.shape}")
print(f"Test dataframe shape: {test_df.shape}")
print(f"Validation dataframe shape: {val_df.shape}")

Training dataframe shape: (287114, 3)
Test dataframe shape: (11491, 3)
Validation dataframe shape: (13369, 3)


**Amount reduction for training time optimization**

In [7]:
train_df = train_df.sample(n=20000, random_state=42)
test_df = test_df.sample(n=2000, random_state=42)
val_df = val_df.sample(n=2000, random_state=42)

In [8]:
print(train_df['id'].iloc[1])
print(train_df['article'].iloc[1])
print(train_df['summary'].iloc[1])
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

023b9cf4dfeca75635957a73408fde39705c9d4f
By . James Rush . A father-of-two left his wedding ring and watch to give to his children in case something happened to him before boarding the missing Malaysia Airlines flight as he flew out to start a dream job in Mongolia. Mechanical engineer Paul Weeks, 39, of Perth, Australia, was on the flight as he made his way to his first shift in a fly-in fly-out job. His wife Danica has revealed how he left the objects with her to give to their two boys if something was to happen to him. Father-of-two Paul Weeks was on board the missing flight MH370 as he flew out to Mongolia to start a dream job . Mrs Weeks however said on Sunday that she was praying for a miracle as she waited for news of him. She told 9News National in Perth: '(He said) "If something should happen to me then the wedding ring should go to the first son that gets married and then the watch to the second".' The former soldier, who was born in New Zealand, moved his young family to Per

**CUSTOM DATASET**

In [9]:
class NewsDataset(Dataset):
    def __init__(self, articles, summaries, tokenizer, max_length=512):
        self.articles = articles
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.articles)
    
    def __getitem__(self, idx):
        article = str(self.articles[idx])
        summary = str(self.summaries[idx])
        
        article_encoding = self.tokenizer(
            article,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        summary_encoding = self.tokenizer(
            summary,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'article_input_ids': article_encoding['input_ids'].flatten(),
            'article_attention_mask': article_encoding['attention_mask'].flatten(),
            'summary_input_ids': summary_encoding['input_ids'].flatten(),
            'summary_attention_mask': summary_encoding['attention_mask'].flatten()
        }

train_dataset = NewsDataset(
    train_df['article'].values,
    train_df['summary'].values,
    tokenizer
)

val_dataset = NewsDataset(
    val_df['article'].values,
    val_df['summary'].values,
    tokenizer
)

test_dataset = NewsDataset(
    test_df['article'].values,
    test_df['summary'].values,
    tokenizer
)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Testing samples: {len(test_dataset)}")

Training samples: 20000
Validation samples: 2000
Testing samples: 2000


### **BASIC MODEL USING LSTMs**

**MODEL DEFINITION**

In [10]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, 
                           dropout=dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_mask):
        # src: [batch_size, src_len]
        embedded = self.dropout(self.embedding(src))
        # Pack padded sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, src_mask.sum(1).cpu(), batch_first=True, enforce_sorted=False
        )
        outputs, (hidden, cell) = self.lstm(packed_embedded)
        # Unpack sequence
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
        return outputs, hidden, cell

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 3, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)
        
    def forward(self, hidden, encoder_outputs, mask):
        # hidden: [batch_size, hidden_dim]
        # encoder_outputs: [batch_size, src_len, hidden_dim * 2]
        batch_size, src_len, hidden_dim = encoder_outputs.shape
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        
        attention = attention.masked_fill(mask == 0, float('-inf'))
        return F.softmax(attention, dim=1)

class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.attention = Attention(hidden_dim)
        # Modify input size to account for bidirectional encoder
        self.lstm = nn.LSTM(hidden_dim * 2 + embed_dim, hidden_dim, n_layers, 
                           dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim * 3 + embed_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell, encoder_outputs, mask):
        # input: [batch_size, 1]
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        
        # Modify hidden state handling for bidirectional encoder
        # Take only the last n_layers entries from hidden state
        hidden = hidden[-self.lstm.num_layers:]
        cell = cell[-self.lstm.num_layers:]
        
        a = self.attention(hidden[-1], encoder_outputs, mask)
        a = a.unsqueeze(1)
        
        weighted = torch.bmm(a, encoder_outputs)
        
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, (hidden, cell) = self.lstm(rnn_input, (hidden, cell))
        
        embedded = embedded.squeeze(1)
        output = output.squeeze(1)
        weighted = weighted.squeeze(1)
        
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        return prediction, hidden, cell

class Seq2SeqSummarizer(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.encoder = Encoder(vocab_size, embed_dim, hidden_dim, n_layers, dropout)
        self.decoder = Decoder(vocab_size, embed_dim, hidden_dim, n_layers, dropout)
        
    def forward(self, src, src_mask, trg, teacher_forcing_ratio=0.5):
        # src: [batch_size, src_len]
        # trg: [batch_size, trg_len]
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        vocab_size = self.decoder.fc_out.out_features
        
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(src.device)
        
        encoder_outputs, hidden, cell = self.encoder(src, src_mask)
        
        # First input to the decoder is the <sos> token
        input = trg[:, 0]
        
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell, 
                                              encoder_outputs, src_mask)
            outputs[:, t] = output
            
            # Teacher forcing
            teacher_force = torch.rand(1) < teacher_forcing_ratio
            top1 = output.max(1)[1]
            input = trg[:, t] if teacher_force else top1
            
        return outputs

MODEL INITIALIZATION

In [11]:

VOCAB_SIZE = len(tokenizer.vocab)
EMBED_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5

model = Seq2SeqSummarizer(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT)

**EPOCH AND MODEL TRAINING FUNCTION DEFINITION**

In [13]:
def train_epoch(model, dataloader, criterion, optimizer, device, clip=1.0):
    model.train()
    total_loss = 0
    
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        
        article_input_ids = batch['article_input_ids'].to(device)
        article_attention_mask = batch['article_attention_mask'].to(device)
        summary_input_ids = batch['summary_input_ids'].to(device)
        
        outputs = model(article_input_ids, article_attention_mask, summary_input_ids)
        
        # Reshape outputs and target for loss calculation
        outputs = outputs[:, 1:].contiguous().view(-1, outputs.shape[-1])
        target = summary_input_ids[:, 1:].contiguous().view(-1)
        
        loss = criterion(outputs, target)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        total_loss += loss.item()
    
    return total_loss / len(dataloader)

def train_model(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=10, early_stopping_patience=3):
    model.to(device)
    best_val_loss = float('inf')
    early_stop_counter = 0
    training_stats = []
    best_model_state = None

    print("Starting training...")
    
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, rouge_scores = evaluate(model, val_loader, criterion, device)
        
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Val Loss: {val_loss:.4f}")
        print("ROUGE Scores:", rouge_scores)
        
        training_stats.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'val_loss': val_loss,
            'rouge_scores': rouge_scores
        })
        
        # Early stopping check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stop_counter = 0
            best_model_state = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), 'best_summarizer_model.pth')
            print(f"Saved new best model with validation loss: {val_loss:.4f}")
        else:
            early_stop_counter += 1
            print(f"Early stopping counter: {early_stop_counter}/{early_stopping_patience}")
            
        if early_stop_counter >= early_stopping_patience:
            print("Early stopping triggered!")
            break
    
    # Load the best model before returning
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print("Restored best model state before returning")
    
    return model, training_stats


**MODEL EVALUATION FUNCTION DEFINITION**

In [14]:
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            article_input_ids = batch['article_input_ids'].to(device)
            article_attention_mask = batch['article_attention_mask'].to(device)
            summary_input_ids = batch['summary_input_ids'].to(device)
            
            outputs = model(article_input_ids, article_attention_mask, summary_input_ids, 
                          teacher_forcing_ratio=0.0)
            
            # Calculate loss
            outputs_flat = outputs[:, 1:].contiguous().view(-1, outputs.shape[-1])
            target_flat = summary_input_ids[:, 1:].contiguous().view(-1)
            loss = criterion(outputs_flat, target_flat)
            total_loss += loss.item()
            
            # Calculate ROUGE scores
            predictions = torch.argmax(outputs, dim=-1)
            for pred, target in zip(predictions, summary_input_ids):
                pred_text = tokenizer.decode(pred, skip_special_tokens=True)
                target_text = tokenizer.decode(target, skip_special_tokens=True)
                scores = scorer.score(target_text, pred_text)
                
                for metric in rouge_scores:
                    rouge_scores[metric].append(scores[metric].fmeasure)
    
    avg_loss = total_loss / len(dataloader)
    avg_rouge_scores = {k: sum(v)/len(v) for k, v in rouge_scores.items()}
    
    return avg_loss, avg_rouge_scores

**TRAINING PROCESS**

In [15]:
NUM_EPOCHS = 3
LEARNING_RATE = 0.001
EARLY_STOPPING_PATIENCE = 2

# Initialize optimizer and criterion
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Train the model with early stopping
model, training_stats = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    num_epochs=NUM_EPOCHS,
    early_stopping_patience=EARLY_STOPPING_PATIENCE
)

print("Training completed!")

Starting training...

Epoch 1/3


Training: 100%|██████████| 2500/2500 [20:49<00:00,  2.00it/s]
Evaluating: 100%|██████████| 250/250 [00:38<00:00,  6.42it/s]


Train Loss: 7.2370
Val Loss: 7.3687
ROUGE Scores: {'rouge1': 0.08583217072380325, 'rouge2': 0.005423276608806084, 'rougeL': 0.0809918850896766}
Saved new best model with validation loss: 7.3687

Epoch 2/3


Training: 100%|██████████| 2500/2500 [20:49<00:00,  2.00it/s]
Evaluating: 100%|██████████| 250/250 [00:39<00:00,  6.38it/s]


Train Loss: 6.8251
Val Loss: 7.2356
ROUGE Scores: {'rouge1': 0.09562297230630697, 'rouge2': 0.005657155349005023, 'rougeL': 0.08491355812301712}
Saved new best model with validation loss: 7.2356

Epoch 3/3


Training: 100%|██████████| 2500/2500 [20:51<00:00,  2.00it/s]
Evaluating: 100%|██████████| 250/250 [00:39<00:00,  6.32it/s]


Train Loss: 6.5690
Val Loss: 7.2119
ROUGE Scores: {'rouge1': 0.1242642444707073, 'rouge2': 0.009971465347044135, 'rougeL': 0.10986501142064836}
Saved new best model with validation loss: 7.2119
Restored best model state before returning
Training completed!


**FINAL EVALUATION**

In [23]:
print("Final evaluation using test data")
test_loss, test_rouge_scores = evaluate(model, test_loader, criterion, device)
print("Final test results")
for metric, score in test_rouge_scores.items():
    print(f"{metric}: {score:.6f}")

Final evaluation using test data


Evaluating: 100%|██████████| 250/250 [00:39<00:00,  6.34it/s]

Final test results
rouge1: 0.126447
rouge2: 0.010083
rougeL: 0.111677





**DATA AND BATCH SIZE AUGMENTATION**

In [16]:
train_df = pd.read_csv('../data/train.csv', header=None)
test_df = pd.read_csv('../data/test.csv', header=None)
val_df = pd.read_csv('../data/validation.csv', header=None)

columns = ['id', 'article', 'summary']
train_df.columns = columns
test_df.columns = columns
val_df.columns = columns

def preprocess_text(text):
    text = text.replace('\n', ' ')
    text = ' '.join(text.split())
    return text

def clean_article_heading(article):
    pattern = r'By\s*\.\s*.*?\s*\.\s*PUBLISHED:\s*\.\s*\d+:\d+\s*EST,\s*\d+\s*[A-Za-z]+\s*\d+\s*\.\s*\|\s*\.\s*UPDATED:\s*\.\s*\d+:\d+\s*EST,\s*\d+\s*[A-Za-z]+\s*\d+\s*\.'
    cleaned_text = re.sub(pattern, '', article)
    return cleaned_text.strip()

train_df['article'] = train_df['article'].apply(preprocess_text)
train_df['article'] = train_df['article'].apply(clean_article_heading)
train_df['summary'] = train_df['summary'].apply(preprocess_text)
test_df['article'] = test_df['article'].apply(preprocess_text)
test_df['article'] = test_df['article'].apply(clean_article_heading)
test_df['summary'] = test_df['summary'].apply(preprocess_text)
val_df['article'] = val_df['article'].apply(preprocess_text)
val_df['article'] = val_df['article'].apply(clean_article_heading)
val_df['summary'] = val_df['summary'].apply(preprocess_text)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

print(f"Training dataframe shape: {train_df.shape}")
print(f"Test dataframe shape: {test_df.shape}")
print(f"Validation dataframe shape: {val_df.shape}")

Training dataframe shape: (287114, 3)
Test dataframe shape: (11491, 3)
Validation dataframe shape: (13369, 3)


In [None]:
train_df_40K = train_df.sample(n=40000, random_state=42)
test_df_4K = test_df.sample(n=4000, random_state=42)
val_df_4K = val_df.sample(n=4000, random_state=42)

In [20]:
train_dataset_40K = NewsDataset(
    train_df_40K['article'].values,
    train_df_40K['summary'].values,
    tokenizer
)

val_dataset_40K = NewsDataset(
    val_df_4K['article'].values,
    val_df_4K['summary'].values,
    tokenizer
)

test_dataset_40K = NewsDataset(
    test_df_4K['article'].values,
    test_df_4K['summary'].values,
    tokenizer
)

# Create data loaders
train_loader_40K = DataLoader(train_dataset_40K, batch_size=32, shuffle=True)
val_loader_40K = DataLoader(val_dataset_40K, batch_size=32, shuffle=False)
test_loader_40K = DataLoader(test_dataset_40K, batch_size=32, shuffle=False)

print(f"Training samples: {len(train_dataset_40K)}")
print(f"Validation samples: {len(val_dataset_40K)}")
print(f"Testing samples: {len(test_dataset_40K)}")

Training samples: 40000
Validation samples: 4000
Testing samples: 4000


In [24]:
VOCAB_SIZE = len(tokenizer.vocab)
EMBED_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5

model_40K = Seq2SeqSummarizer(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT)

NUM_EPOCHS = 3
LEARNING_RATE = 0.001
EARLY_STOPPING_PATIENCE = 2

# Initialize optimizer and criterion
optimizer = optim.Adam(model_40K.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Train the model with early stopping
model_40K, training_stats = train_model(
    model=model_40K,
    train_loader=train_loader_40K,
    val_loader=val_loader_40K,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    num_epochs=NUM_EPOCHS,
    early_stopping_patience=EARLY_STOPPING_PATIENCE
)

Starting training...

Epoch 1/3


Training: 100%|██████████| 1250/1250 [22:08<00:00,  1.06s/it]
Evaluating: 100%|██████████| 125/125 [00:56<00:00,  2.23it/s]


Train Loss: 7.0724
Val Loss: 7.1676
ROUGE Scores: {'rouge1': 0.1023710154628591, 'rouge2': 0.007884151610780437, 'rougeL': 0.09033332421922521}
Saved new best model with validation loss: 7.1676

Epoch 2/3


Training: 100%|██████████| 1250/1250 [22:06<00:00,  1.06s/it]
Evaluating: 100%|██████████| 125/125 [00:56<00:00,  2.23it/s]


Train Loss: 6.5658
Val Loss: 7.0702
ROUGE Scores: {'rouge1': 0.12753616370214887, 'rouge2': 0.010722532616191188, 'rougeL': 0.10685668863558208}
Saved new best model with validation loss: 7.0702

Epoch 3/3


Training: 100%|██████████| 1250/1250 [22:06<00:00,  1.06s/it]
Evaluating: 100%|██████████| 125/125 [00:56<00:00,  2.22it/s]


Train Loss: 6.2682
Val Loss: 7.0101
ROUGE Scores: {'rouge1': 0.12279514189694105, 'rouge2': 0.012776049571780575, 'rougeL': 0.10488697211843771}
Saved new best model with validation loss: 7.0101
Restored best model state before returning


In [25]:
print("Final evaluation using test data")
test_loss, test_rouge_scores = evaluate(model_40K, test_loader_40K, criterion, device)
print("Final test results")
for metric, score in test_rouge_scores.items():
    print(f"{metric}: {score:.6f}")

Final evaluation using test data


Evaluating: 100%|██████████| 125/125 [00:56<00:00,  2.23it/s]

Final test results
rouge1: 0.123692
rouge2: 0.012524
rougeL: 0.106011





### **BASIC MODEL USING GRUs**

In [None]:
# First, let's add the necessary imports
from gensim.models import KeyedVectors
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load pre-trained GloVe embeddings
def load_glove_embeddings(path='../data/glove.6B.300d.txt'):
    word2vec = {}
    embedding_dim = 300
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            word2vec[word] = vector
    return word2vec, embedding_dim

# Text preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    return ' '.join(tokens)

# Modified Dataset class to use GloVe embeddings
class NewsDatasetGloVe(Dataset):
    def __init__(self, articles, summaries, word2vec, max_length=512):
        self.articles = articles
        self.summaries = summaries
        self.word2vec = word2vec
        self.max_length = max_length
        self.embedding_dim = len(next(iter(word2vec.values())))
        
    def __len__(self):
        return len(self.articles)
    
    def text_to_vector(self, text, max_len):
        words = text.split()[:max_len]
        vectors = []
        for word in words:
            if word in self.word2vec:
                vectors.append(self.word2vec[word])
            else:
                vectors.append(np.zeros(self.embedding_dim))
        
        # Pad if necessary
        while len(vectors) < max_len:
            vectors.append(np.zeros(self.embedding_dim))
        
        return torch.tensor(vectors, dtype=torch.float32)
    
    def __getitem__(self, idx):
        article = preprocess_text(str(self.articles[idx]))
        summary = preprocess_text(str(self.summaries[idx]))
        
        article_vector = self.text_to_vector(article, self.max_length)
        summary_vector = self.text_to_vector(summary, 128)
        
        return {
            'article_vector': article_vector,
            'summary_vector': summary_vector
        }

# GRU-based Encoder
class GRUEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, n_layers,
                         dropout=dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        # src: [batch_size, src_len, embedding_dim]
        outputs, hidden = self.gru(self.dropout(src))
        return outputs, hidden

# GRU-based Decoder with Attention
class GRUDecoder(nn.Module):
    def __init__(self, output_dim, embed_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.attention = Attention(hidden_dim)
        self.embedding = nn.Embedding(output_dim, embed_dim)
        self.gru = nn.GRU(hidden_dim * 2 + embed_dim, hidden_dim, n_layers,
                         dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim * 3 + embed_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs, mask):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        
        hidden = hidden[-self.gru.num_layers:]
        
        a = self.attention(hidden[-1], encoder_outputs, mask)
        a = a.unsqueeze(1)
        
        weighted = torch.bmm(a, encoder_outputs)
        
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.gru(rnn_input, hidden)
        
        embedded = embedded.squeeze(1)
        output = output.squeeze(1)
        weighted = weighted.squeeze(1)
        
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        return prediction, hidden

# Seq2Seq model with GRU
class Seq2SeqGRU(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        
        encoder_outputs, hidden = self.encoder(src)
        
        input = trg[:, 0]
        
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs,
                                        torch.ones_like(src[:,:,0]))
            outputs[:, t] = output
            teacher_force = torch.rand(1) < teacher_forcing_ratio
            top1 = output.max(1)[1]
            input = trg[:, t] if teacher_force else top1
            
        return outputs

# Initialize model with GloVe embeddings
print("Loading GloVe embeddings...")
word2vec, EMBEDDING_DIM = load_glove_embeddings()

# Create datasets with GloVe embeddings
train_dataset_gru = NewsDatasetGloVe(
    train_df['article'].values,
    train_df['summary'].values,
    word2vec
)

val_dataset_gru = NewsDatasetGloVe(
    val_df['article'].values,
    val_df['summary'].values,
    word2vec
)

test_dataset_gru = NewsDatasetGloVe(
    test_df['article'].values,
    test_df['summary'].values,
    word2vec
)

# Create dataloaders
train_loader_gru = DataLoader(train_dataset_gru, batch_size=8, shuffle=True)
val_loader_gru = DataLoader(val_dataset_gru, batch_size=8, shuffle=False)
test_loader_gru = DataLoader(test_dataset_gru, batch_size=8, shuffle=False)

# Model hyperparameters
INPUT_DIM = EMBEDDING_DIM
HIDDEN_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Initialize the model
encoder = GRUEncoder(INPUT_DIM, HIDDEN_DIM, N_LAYERS, ENC_DROPOUT)
decoder = GRUDecoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, DEC_DROPOUT)
model_gru = Seq2SeqGRU(encoder, decoder, device).to(device)

# Training parameters
optimizer_gru = optim.Adam(model_gru.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Train the model
print("Training GRU model...")
model_gru, training_stats_gru = train_model(
    model=model_gru,
    train_loader=train_loader_gru,
    val_loader=val_loader_gru,
    optimizer=optimizer_gru,
    criterion=criterion,
    device=device,
    num_epochs=NUM_EPOCHS,
    early_stopping_patience=EARLY_STOPPING_PATIENCE
)

# Evaluate on test set
print("\nFinal evaluation using test data")
test_loss, test_rouge_scores = evaluate(model_gru, test_loader_gru, criterion, device)
print("\nFinal test results for GRU model:")
for metric, score in test_rouge_scores.items():
    print(f"{metric}: {score:.6f}")