In [14]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

# Download NLTK tokenizer data
import nltk
# NLTK >= 3.9 makes word_tokenize load the Punkt models from the 'punkt_tab'
# package rather than the pickled 'punkt' one, so fetch both to keep
# tokenization working regardless of the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/alsu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
def preprocess_text_pair(row, length_threshold=50):
    """Tokenize one reference/translation pair and force both to a fixed length.

    Args:
        row: Mapping-like record providing the keys 'reference',
            'translation', 'lenght_diff' and 'similarity'.
        length_threshold: Number of tokens each returned token list has;
            longer sentences are truncated, shorter ones are filled with
            the '<pad>' token.

    Returns:
        Tuple ``(reference_tokens, translation_tokens, length_diff, similarity)``
        where the first two are token lists and the last two are
        one-element tensors wrapping the row's numeric values.
    """
    source_text, target_text = row['reference'], row['translation']

    # Numeric features are wrapped in single-element tensors.
    length_diff = torch.tensor([row['lenght_diff']])
    similarity = torch.tensor([row['similarity']])

    def _fit_to_length(tokens):
        # Clip first, then top up with '<pad>'; a non-positive repeat count
        # yields an empty list, so sentences that are already long enough
        # receive no padding.
        clipped = tokens[:length_threshold]
        return clipped + ['<pad>'] * (length_threshold - len(clipped))

    source_tokens = word_tokenize(source_text)
    target_tokens = word_tokenize(target_text)

    return (
        _fit_to_length(source_tokens),
        _fit_to_length(target_tokens),
        length_diff,
        similarity,
    )

def preprocess_dataset(dataset, length_threshold=50):
    """Apply ``preprocess_text_pair`` to every row and collect the results.

    Args:
        dataset: DataFrame with the columns 'reference', 'translation',
            'lenght_diff' and 'similarity'.
        length_threshold: Forwarded to ``preprocess_text_pair`` for every row.

    Returns:
        A new DataFrame with the columns 'reference', 'translation',
        'lenght_diff' and 'similarity', one row per input row in the same
        order. The input's index is not carried over. An empty input yields
        an empty frame with those four columns.
    """
    column_names = ['reference', 'translation', 'lenght_diff', 'similarity']

    # zip(*...) over zero rows produces nothing to unpack, so an empty input
    # has to be answered before the per-row results are transposed.
    if len(dataset) == 0:
        return pd.DataFrame(columns=column_names)

    per_row_results = dataset.apply(
        lambda row: preprocess_text_pair(row, length_threshold), axis=1
    )

    preprocessed_data = pd.DataFrame()
    # Transpose the per-row 4-tuples into four per-column tuples.
    for name, values in zip(column_names, zip(*per_row_results)):
        preprocessed_data[name] = values

    return preprocessed_data

In [16]:
# Custom Dataset Class
class ParaphraseDataset(Dataset):
    """Map-style dataset over a DataFrame of paraphrase pairs.

    Args:
        data: DataFrame with 'reference' and 'translation' columns. When
            ``test`` is False it must also provide 'lenght_diff' and
            'similarity'.
        test: When True, items contain only the two text fields.
    """

    def __init__(self, data, test=False):
        self.data = data
        self.test = test

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at positional index ``idx`` as a dict.

        In test mode the dict has the keys 'reference' and 'translation';
        otherwise it additionally has 'lenght_diff' and 'similarity'.
        """
        # Look the row up once instead of once per field.
        row = self.data.iloc[idx]
        sample = {'reference': row['reference'], 'translation': row['translation']}

        # Only touch the extra feature columns when they are returned, so
        # test-mode frames without those columns do not raise KeyError.
        if not self.test:
            sample['lenght_diff'] = row['lenght_diff']
            sample['similarity'] = row['similarity']

        return sample



In [17]:
# Prepare Data
from torch.nn.utils.rnn import pad_sequence
# Read the tab-separated raw corpus, then tokenize and pad every sentence pair.
file_path = "../data/raw/filtered.tsv"
raw_df = pd.read_csv(file_path, sep="\t")
preprocessed_df = preprocess_dataset(dataset=raw_df)

In [18]:
# Persist the preprocessed frame as an interim artifact, without the row index.
interim_csv_path = '../data/interim/intermediate_dataset_1.csv'
preprocessed_df.to_csv(interim_csv_path, index=False)

In [19]:
# Split the dataset into train and validation
# 80/20 split with a fixed seed, then keep only small leading subsets
# (1000 training rows, 500 validation rows).
train_df, val_df = train_test_split(preprocessed_df, random_state=42, test_size=0.2)
train_df_mini = train_df.head(1000)
val_df_mini = val_df.head(500)

# The validation subset is wrapped in test mode, so its items carry only
# the 'reference' and 'translation' fields.
train_dataset = ParaphraseDataset(train_df_mini)
val_dataset = ParaphraseDataset(val_df_mini, test=True)

In [20]:
# Collect every token in corpus order: for each row, the reference tokens
# followed by the translation tokens. Iterating the two columns directly
# avoids building a Series per row the way iterrows() does.
all_words = [
    token
    for reference_tokens, translation_tokens in zip(
        preprocessed_df['reference'], preprocessed_df['translation']
    )
    for tokens in (reference_tokens, translation_tokens)
    for token in tokens
]

# Number the unique tokens in sorted order. Iterating a bare set of strings
# gives an order that can change between interpreter runs (string hash
# randomization), which would make token ids - and any weights trained on
# them - irreproducible.
vocab = {word: idx for idx, word in enumerate(sorted(set(all_words)))}
vocab_size = len(vocab)

In [21]:
# Hyperparameters
embedding_dim, hidden_dim = 100, 128  # embedding width, hidden-layer width
# Tokens per sentence; presumably meant to mirror the default
# length_threshold of the preprocessing functions - TODO confirm.
length_threshold = 50

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleParaphraseModel(nn.Module):
    """Token-wise feed-forward model: embedding -> linear + ReLU -> linear.

    Each input token id is mapped independently to a log-probability
    distribution over the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and size of the
            output distribution.
        embedding_dim: Width of the token embeddings.
        hidden_dim: Width of the hidden linear layer.
        sequence_length: Accepted for interface compatibility; not used by
            the layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, sequence_length):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Honour the hidden_dim argument instead of a hard-coded width.
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, reference, length_diff=None, similarity=None):
        """Return per-token log-probabilities over the vocabulary.

        Args:
            reference: Integer tensor of token ids.
            length_diff: Unused; kept (now optional) for call compatibility.
            similarity: Unused; kept (now optional) for call compatibility.

        Returns:
            Tensor shaped like ``reference`` with one extra trailing
            dimension of size ``vocab_size``, normalised over that
            trailing dimension.
        """
        embeds = self.embeddings(reference)
        hidden = F.relu(self.linear1(embeds))
        logits = self.linear2(hidden)
        # Normalise over the vocabulary axis (the last one); dim=1 would
        # normalise across sequence positions for batched input.
        return F.log_softmax(logits, dim=-1)


In [23]:
# Instantiate the paraphrase model on the GPU when one is available and
# fall back to the CPU otherwise, instead of failing on machines without CUDA.
model_device = 'cuda' if torch.cuda.is_available() else 'cpu'
simple_paraphrase_model = SimpleParaphraseModel(
    vocab_size, embedding_dim, hidden_dim, length_threshold
).to(model_device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(simple_paraphrase_model.parameters(), lr=0.001)

In [24]:
def collate_fn(batch, token_to_index=None, max_length=50):
    """Collate training samples into index tensors using one shared vocabulary.

    Args:
        batch: List of sample dicts with the keys 'reference',
            'translation' (token lists), 'lenght_diff' and 'similarity'.
        token_to_index: Mapping from token to integer id. Defaults to the
            notebook-level ``vocab`` so that ids are identical in every
            batch and line up with the model's embedding table.
        max_length: Token lists are truncated to this many tokens.

    Returns:
        Dict with 'reference' and 'translation' as ``(batch, length)`` long
        tensors, plus 'lenght_diff' and 'similarity' tensors built from the
        per-sample values.
    """
    if token_to_index is None:
        # A vocabulary rebuilt per batch would give the same word a different
        # id in every batch, so fall back to the single notebook-level mapping.
        token_to_index = vocab

    # '<pad>' is expected to be in the mapping because preprocessing pads with
    # it; 0 is only a last-resort fallback. There is no dedicated unknown-token
    # entry unless the mapping defines '<unk>', so unknown tokens map to padding.
    pad_index = token_to_index.get('<pad>', 0)
    unk_index = token_to_index.get('<unk>', pad_index)

    def encode(token_lists):
        # dtype is forced to long so an empty token list still yields an
        # integer tensor usable as embedding indices.
        return pad_sequence(
            [
                torch.tensor(
                    [token_to_index.get(token, unk_index) for token in tokens[:max_length]],
                    dtype=torch.long,
                )
                for tokens in token_lists
            ],
            batch_first=True,
            padding_value=pad_index,
        )

    return {
        'reference': encode([sample['reference'] for sample in batch]),
        'translation': encode([sample['translation'] for sample in batch]),
        'lenght_diff': torch.tensor([sample['lenght_diff'] for sample in batch]),
        'similarity': torch.tensor([sample['similarity'] for sample in batch])
    }

In [25]:
def collate_fn_test(batch, token_to_index=None, max_length=50):
    """Collate evaluation samples into index tensors using one shared vocabulary.

    Args:
        batch: List of sample dicts with the keys 'reference' and
            'translation', each a list of tokens.
        token_to_index: Mapping from token to integer id. Defaults to the
            notebook-level ``vocab`` so that ids are identical in every
            batch and line up with the model's embedding table.
        max_length: Token lists are truncated to this many tokens.

    Returns:
        Dict with 'reference' and 'translation' as ``(batch, length)`` long
        tensors.
    """
    if token_to_index is None:
        # A vocabulary rebuilt per batch would give the same word a different
        # id in every batch, so fall back to the single notebook-level mapping.
        token_to_index = vocab

    # '<pad>' is expected to be in the mapping because preprocessing pads with
    # it; 0 is only a last-resort fallback. There is no dedicated unknown-token
    # entry unless the mapping defines '<unk>', so unknown tokens map to padding.
    pad_index = token_to_index.get('<pad>', 0)
    unk_index = token_to_index.get('<unk>', pad_index)

    def encode(token_lists):
        # dtype is forced to long so an empty token list still yields an
        # integer tensor usable as embedding indices.
        return pad_sequence(
            [
                torch.tensor(
                    [token_to_index.get(token, unk_index) for token in tokens[:max_length]],
                    dtype=torch.long,
                )
                for tokens in token_lists
            ],
            batch_first=True,
            padding_value=pad_index,
        )

    return {
        'reference': encode([sample['reference'] for sample in batch]),
        'translation': encode([sample['translation'] for sample in batch])
    }

In [26]:
# Batched loaders: shuffled batches of 64 for training, fixed order for evaluation.
train_dataloader = DataLoader(
    dataset=train_dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=64,
)
test_dataloader = DataLoader(
    dataset=val_dataset,
    collate_fn=collate_fn_test,
    shuffle=False,
    batch_size=64,
)

In [27]:
import torch.nn.functional as F

# Training loop
num_epochs = 10

# Move batches to whatever device the model's parameters live on, rather
# than assuming a CUDA device is present.
train_device = next(simple_paraphrase_model.parameters()).device

for epoch in range(num_epochs):
    # Re-enter training mode every epoch so that re-running this cell after
    # the evaluation cell (which calls .eval()) still trains in train mode.
    simple_paraphrase_model.train()
    total_loss = 0
    total_perplexity = 0  # Track cumulative perplexity

    for batch in train_dataloader:
        reference = batch['reference'].to(train_device)
        translation = batch['translation'].to(train_device)
        length_diff = batch['lenght_diff'].to(train_device)
        similarity = batch['similarity'].to(train_device)

        # Model forward pass
        output = simple_paraphrase_model(reference, length_diff, similarity)
        # Flatten to (tokens, classes) using the output's own last dimension
        # so the loss does not depend on a separately tracked vocab_size.
        loss = criterion(output.reshape(-1, output.size(-1)), translation.reshape(-1))

        # Per-batch perplexity is exp(loss); detach so the metric does not
        # extend the autograd graph.
        perplexity = torch.exp(loss.detach())
        total_loss += loss.item()
        total_perplexity += perplexity.item()

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    average_perplexity = total_perplexity / len(train_dataloader)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}, Perplexity: {average_perplexity:.4f}')


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.79 GiB (GPU 0; 5.80 GiB total capacity; 5.51 GiB already allocated; 135.44 MiB free; 5.52 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
import torch
import torch.nn as nn

# Testing loop
simple_paraphrase_model.eval()  # Set the model to evaluation mode
total_loss = 0

# Move batches to whatever device the model's parameters live on, rather
# than assuming a CUDA device is present.
eval_device = next(simple_paraphrase_model.parameters()).device

with torch.no_grad():  # Disable gradient calculation during testing
    for batch in test_dataloader:
        reference = batch['reference'].to(eval_device)
        translation = batch['translation'].to(eval_device)

        # Test batches carry no length_diff/similarity, and the model's
        # forward does not read them, so pass None explicitly instead of
        # relying on variables left over from the training cell.
        output = simple_paraphrase_model(reference, None, None)

        # Compute loss over flattened (tokens, classes) predictions
        loss = criterion(output.reshape(-1, output.size(-1)), translation.reshape(-1))

        total_loss += loss.item()

average_loss = total_loss / len(test_dataloader)
print(f'Test Loss: {average_loss:.4f}')

In [None]:
checkpoint = {
    # NOTE(review): this stores the class object itself, which pickle records
    # by reference; loading the file therefore needs SimpleParaphraseModel to
    # be importable under the same name. Kept for compatibility with any
    # existing loader - prefer rebuilding from 'hyperparameters' + 'state_dict'.
    'model': SimpleParaphraseModel,
    'state_dict': simple_paraphrase_model.state_dict(),
    'optimizer': optimizer.state_dict(),
    # Constructor arguments needed to rebuild a model with matching shapes.
    'hyperparameters': {
        'vocab_size': vocab_size,
        'embedding_dim': embedding_dim,
        'hidden_dim': hidden_dim,
        'sequence_length': length_threshold,
    },
    # Notebook-level token -> id mapping, saved so token ids can be
    # reproduced when the checkpoint is loaded elsewhere.
    'vocab': vocab,
}

torch.save(checkpoint, 'checkpoint.pth')