<a href="https://colab.research.google.com/github/ashagedo/MSCI-641-project/blob/main/1a_Inference_DistilBART_Inference_summarize_only.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import re
from io import StringIO

import nltk
import pandas as pd
import torch
import torch.nn.functional as F
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score
from sklearn.metrics import f1_score, accuracy_score
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
# Colab helper that opens a browser dialog for uploading local files
from google.colab import files

# Prompt for the training split; the result is keyed by uploaded file name
uploaded_files = files.upload()

# Report every uploaded name and keep the file's text in train_file for later
# parsing (with several uploads, the last file read wins)
for filename in uploaded_files:
    print('Uploaded file:', filename)
    with open(filename, 'r') as file:
        train_file = file.read()

Saving train.jsonl to train.jsonl
Uploaded file: train.jsonl


In [3]:
# Colab helper that opens a browser dialog for uploading local files
from google.colab import files

# Prompt for the validation split; the result is keyed by uploaded file name
uploaded_files = files.upload()

# Report every uploaded name and keep the file's text in val_file for later
# parsing (with several uploads, the last file read wins)
for filename in uploaded_files:
    print('Uploaded file:', filename)
    with open(filename, 'r') as file:
        val_file = file.read()

Saving val.jsonl to val.jsonl
Uploaded file: val.jsonl


In [4]:
# Colab helper that opens a browser dialog for uploading local files
from google.colab import files

# Prompt for the test split; the result is keyed by uploaded file name
uploaded_files = files.upload()

# Report every uploaded name and keep the file's text in test_file for later
# parsing (with several uploads, the last file read wins)
for filename in uploaded_files:
    print('Uploaded file:', filename)
    with open(filename, 'r') as file:
        test_file = file.read()

Saving test.jsonl to test.jsonl
Uploaded file: test.jsonl


In [5]:
# Load data into DataFrames.
# train_file / val_file / test_file hold the raw JSON Lines text read from the
# uploads, not paths. Passing literal JSON text straight to pd.read_json is
# deprecated (pandas >= 2.1), so each string is wrapped in a StringIO buffer.
train_data = pd.read_json(StringIO(train_file), lines=True)
val_data = pd.read_json(StringIO(val_file), lines=True)
test_data = pd.read_json(StringIO(test_file), lines=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:

# Custom model without classification head
class CustomModel(nn.Module):
    """Thin ``nn.Module`` wrapper around a BART conditional-generation model.

    The wrapped model is stored on ``self.transformer``. ``forward`` delegates
    to it and repackages selected output fields into a plain dict.
    """

    def __init__(self, model_name):
        """Load the wrapped model.

        Args:
            model_name: Name or path passed to
                ``BartForConditionalGeneration.from_pretrained``.
        """
        super().__init__()
        self.transformer = BartForConditionalGeneration.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask=None, labels=None, decoder_input_ids=None):
        """Run the wrapped model and return its outputs as a dict.

        Args:
            input_ids: Forwarded to the wrapped model as ``input_ids``.
            attention_mask: Forwarded as ``attention_mask``.
            labels: Forwarded as ``labels``.
            decoder_input_ids: Forwarded as ``decoder_input_ids``.

        Returns:
            A dict in which ``"loss"`` and ``"summarization_loss"`` both hold
            the wrapped model's ``loss``; the remaining keys mirror the
            same-named attributes of the wrapped model's output.
        """
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask, labels=labels, decoder_input_ids=decoder_input_ids)
        summarization_loss = outputs.loss

        return {
            "loss": summarization_loss,
            "summarization_loss": summarization_loss,
            "logits": outputs.logits,
            "encoder_last_hidden_state": outputs.encoder_last_hidden_state,
            "past_key_values": outputs.past_key_values,
            "decoder_hidden_states": outputs.decoder_hidden_states,
            "decoder_attentions": outputs.decoder_attentions,
            "cross_attentions": outputs.cross_attentions,
            "encoder_hidden_states": outputs.encoder_hidden_states,
            "encoder_attentions": outputs.encoder_attentions,
        }

    def save_pretrained(self, save_directory):
        """Save the wrapped model to ``save_directory`` via its own ``save_pretrained``."""
        self.transformer.save_pretrained(save_directory)

    @classmethod
    def from_pretrained(cls, model_name_or_path):
        """Build a ``CustomModel`` whose wrapped model is loaded from ``model_name_or_path``.

        Args:
            model_name_or_path: Name or path of the checkpoint to load.

        Returns:
            A new ``CustomModel`` instance.
        """
        # __init__ already loads the checkpoint into self.transformer, so a
        # second from_pretrained call here would only repeat the same load.
        return cls(model_name_or_path)


#Pre-process Data
def preprocess_text(text):
    """Normalize a string before tokenization.

    Drops every character that is not an ASCII letter, a digit, whitespace or
    one of ``. , ! ?``, then squeezes each whitespace run into a single space
    and trims both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Keep only alphanumerics, whitespace and basic punctuation
    without_specials = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
    # Collapse whitespace runs (including newlines and tabs) into one space
    single_spaced = re.sub(r'\s+', ' ', without_specials)
    return single_spaced.strip()



In [None]:
# Prepare test data, generate a spoiler for every test row and save the predictions.
# NOTE(review): model_path and output_file point into Google Drive, but no
# drive.mount call is visible in this notebook — Drive presumably has to be
# mounted at /content/drive before this cell runs; confirm.

max_len = 1024
batch_size = 32  # Adjust batch size based on your GPU memory
model_path = '/content/drive/MyDrive/DistilBART_summarize_only_5epochs/epoch_5.0'

tokenizer = BartTokenizer.from_pretrained(model_path)
custom_model = CustomModel.from_pretrained(model_path)
custom_model.to(device)

# Build one input string per row: each field's joined text is prefixed with
# the field's own name.
postText_label = "postText"
targetTitle_label = "targetTitle"
targetParagraphs_label = "targetParagraphs"
test_data['concatenated_text'] = test_data.apply(
    lambda row: f"{postText_label} {' '.join(row['postText'])} "
                f"{targetTitle_label} {' '.join(row['targetTitle'])} "
                f"{targetParagraphs_label} {' '.join(row['targetParagraphs'])}",
    axis=1
)

# Clean and tokenize; max_len is the single source of truth for truncation.
test_inputs = [preprocess_text(text) for text in test_data['concatenated_text'].tolist()]
test_inputs = tokenizer(test_inputs, return_tensors='pt', padding=True, truncation=True, max_length=max_len)

# Move the tokenized inputs to the selected device once, up front
test_input_ids = test_inputs['input_ids'].to(device)
test_attention_mask = test_inputs['attention_mask'].to(device)

# Create a DataLoader for the test data (no shuffling, so predictions stay in row order)
test_dataset = torch.utils.data.TensorDataset(test_input_ids, test_attention_mask)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

# One generated summary per test row, in row order
summary_predictions = []

custom_model.eval()
with torch.no_grad():
    # Batches are slices of tensors that already live on `device`
    for input_ids, attention_mask in test_loader:
        # Text Summarization
        try:
            summary_ids = custom_model.transformer.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=50,
                num_beams=4,
                early_stopping=True
            )
            # Decode the whole batch before extending, so a failure part-way
            # through never leaves a partial batch in summary_predictions.
            summary_texts = [
                tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) for ids in summary_ids
            ]
        except Exception as e:
            # Best effort: keep row alignment by emitting one empty summary per row
            print(f"Error generating summary: {e}")
            summary_texts = [""] * input_ids.size(0)
        summary_predictions.extend(summary_texts)


# Save predictions to DataFrame
test_data['spoiler'] = summary_predictions
output_file = '/content/drive/MyDrive/Colab_Notebooks/DistilBART_summarization_task2_v2.csv'
# Write the local copy first so the predictions survive a failed Drive write.
test_data[['id', 'spoiler']].to_csv('DistilBART_summarization_task2_v2.csv', index=False)
test_data[['id', 'spoiler']].to_csv(output_file, index=False)
print(f"Predictions saved to {output_file}")


In [9]:
# Prepare training and validation data for model evaluation
max_len = 1024
batch_size = 8
model_path = '/content/drive/MyDrive/DistilBART_summarize_only_5epochs/epoch_5.0'

tokenizer = BartTokenizer.from_pretrained(model_path)
custom_model = CustomModel.from_pretrained(model_path)
custom_model.to(device)

# Field-name prefixes placed in front of each field's text in the model input
postText_label = "postText"
targetTitle_label = "targetTitle"
targetParagraphs_label = "targetParagraphs"


def _encode_split(frame, tokenizer, max_len):
    """Tokenize the inputs and reference spoilers of one labelled split.

    Adds a 'concatenated_text' column to ``frame`` in place: the joined
    'postText', 'targetTitle' and 'targetParagraphs' values, each prefixed
    with its field name.

    Args:
        frame: DataFrame with 'postText', 'targetTitle', 'targetParagraphs'
            and 'spoiler' columns.
        tokenizer: Tokenizer applied to both the inputs and the spoilers.
        max_len: Truncation length passed to the tokenizer as ``max_length``.

    Returns:
        Tuple ``(inputs, summaries, labels)``: the tokenizer output for the
        cleaned inputs, the list of cleaned reference spoilers, and the
        ``input_ids`` tensor of the tokenized spoilers.
    """
    frame['concatenated_text'] = frame.apply(
        lambda row: f"{postText_label} {' '.join(row['postText'])} "
                    f"{targetTitle_label} {' '.join(row['targetTitle'])} "
                    f"{targetParagraphs_label} {' '.join(row['targetParagraphs'])}",
        axis=1
    )
    texts = [preprocess_text(text) for text in frame['concatenated_text'].tolist()]
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_len)

    # A spoiler may be stored as a list of strings; join it before cleaning
    summaries = [
        preprocess_text(' '.join(text) if isinstance(text, list) else text)
        for text in frame['spoiler'].tolist()
    ]
    labels = tokenizer(summaries, return_tensors='pt', padding=True, truncation=True, max_length=max_len)['input_ids']  # Adjust max_length as needed
    return inputs, summaries, labels


train_inputs, train_summaries, train_labels = _encode_split(train_data, tokenizer, max_len)
val_inputs, val_summaries, val_labels = _encode_split(val_data, tokenizer, max_len)

# Move the tokenized tensors to the selected device (once per tensor)
train_input_ids = train_inputs['input_ids'].to(device)
train_attention_mask = train_inputs['attention_mask'].to(device)
train_labels = train_labels.to(device)

val_input_ids = val_inputs['input_ids'].to(device)
val_attention_mask = val_inputs['attention_mask'].to(device)
val_labels = val_labels.to(device)

# Create DataLoaders for training and validation data (no shuffling, so batch
# order matches the row order of train_summaries / val_summaries)
train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_mask, train_labels)
val_dataset = torch.utils.data.TensorDataset(val_input_ids, val_attention_mask, val_labels)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

In [None]:
# Evaluate on Training Set

# Generated summaries, one per training row, in loader order
summary_predictions = []
# Work on a copy: binding the name directly to train_summaries would alias the
# list, and any in-place change here would silently alter train_summaries.
reference_summaries = list(train_summaries)

# Evaluation loop

custom_model.eval()
with torch.no_grad():
    for batch in tqdm(train_loader, desc="Evaluating"):

        # The label tensor is not needed: references come from train_summaries,
        # which is in the same order as the (unshuffled) loader.
        input_ids, attention_mask, _ = batch

        # Ensure inputs are on the correct device
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Text Summarization
        try:
            summary_ids = custom_model.transformer.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=50,
                num_beams=4,
                early_stopping=True
            )
            summary_texts = [
                tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) for ids in summary_ids
            ]
        except Exception as e:
            # Best effort: keep row alignment by emitting one empty summary per row
            print(f"Error generating summary: {e}")
            summary_texts = [""] * input_ids.size(0)
        summary_predictions.extend(summary_texts)


# Compute METEOR Scores
reference_summaries = [' '.join(tokens) if isinstance(tokens, list) else tokens for tokens in reference_summaries]
summary_predictions = [str(text) for text in summary_predictions]

# zip() below would silently truncate on a length mismatch, so check it explicitly
assert len(reference_summaries) == len(summary_predictions), "references and predictions are misaligned"

meteor_scores = [
    meteor_score([word_tokenize(ref)], word_tokenize(gen))
    for ref, gen in zip(reference_summaries, summary_predictions)
]
average_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0

print(f"Average METEOR Score: {average_meteor:.4f}")



In [None]:
# Evaluate on Validation Set

# Generated summaries, one per validation row, in loader order
summary_predictions = []
# Work on a copy: binding the name directly to val_summaries would alias the
# list, and any in-place change here would silently alter val_summaries.
reference_summaries = list(val_summaries)

# Evaluation loop

custom_model.eval()
with torch.no_grad():
    for batch in tqdm(val_loader, desc="Evaluating"):

        # The label tensor is not needed: references come from val_summaries,
        # which is in the same order as the (unshuffled) loader.
        input_ids, attention_mask, _ = batch

        # Ensure inputs are on the correct device
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Text Summarization
        try:
            summary_ids = custom_model.transformer.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=50,
                num_beams=4,
                early_stopping=True
            )
            summary_texts = [
                tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) for ids in summary_ids
            ]
        except Exception as e:
            # Best effort: keep row alignment by emitting one empty summary per row
            print(f"Error generating summary: {e}")
            summary_texts = [""] * input_ids.size(0)
        summary_predictions.extend(summary_texts)


# Compute METEOR Scores

reference_summaries = [' '.join(tokens) if isinstance(tokens, list) else tokens for tokens in reference_summaries]
summary_predictions = [str(text) for text in summary_predictions]

# zip() below would silently truncate on a length mismatch, so check it explicitly
assert len(reference_summaries) == len(summary_predictions), "references and predictions are misaligned"

meteor_scores = [
    meteor_score([word_tokenize(ref)], word_tokenize(gen))
    for ref, gen in zip(reference_summaries, summary_predictions)
]
average_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0

print(f"Average METEOR Score: {average_meteor:.4f}")

