<a href="https://colab.research.google.com/github/ashagedo/MSCI-641-project/blob/main/1_DistilBART_CNN_summarize_only_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import os
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback, DataCollatorForSeq2Seq
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight


In [2]:
# Import the files.upload() function from the google.colab module
from google.colab import files

# Upload files using the files.upload() function
uploaded_files = files.upload()

# Access the uploaded files
for filename in uploaded_files.keys():
    print('Uploaded file:', filename)
    with open(filename, 'r') as file:
        train_file = file.read()

Saving train.jsonl to train.jsonl
Uploaded file: train.jsonl


In [3]:
# Import the files.upload() function from the google.colab module
from google.colab import files

# Upload files using the files.upload() function
uploaded_files = files.upload()

# Access the uploaded files
for filename in uploaded_files.keys():
    print('Uploaded file:', filename)
    with open(filename, 'r') as file:
        val_file = file.read()

Saving val.jsonl to val.jsonl
Uploaded file: val.jsonl


In [4]:
# Load data into DataFrames
train_data = pd.read_json(train_file, lines=True)
val_data = pd.read_json(val_file, lines=True)

In [5]:
class CustomDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_len):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = ' '.join(self.summaries[idx]) if isinstance(self.summaries[idx], list) else self.summaries[idx]

        # Tokenize the input text
        input_encodings = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        # Tokenize the summary (target) text
        target_encodings = self.tokenizer(
            summary,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        '''
        # Debugging statements
        print(f"Index: {idx}")
        print(f"Original Text: {text}")
        print(f"Original Summary: {summary}")
        print(f"Tokenized Input Shape: {input_encodings['input_ids'].shape}")
        print(f"Tokenized Labels Shape: {target_encodings['input_ids'].shape}")
        print(f"Tokenized Input: {input_encodings['input_ids']}")
        print(f"Tokenized Labels: {target_encodings['input_ids']}")
        '''
        return {
            'input_ids': input_encodings['input_ids'].squeeze().to(device),
            'attention_mask': input_encodings['attention_mask'].squeeze().to(device),
            'labels': target_encodings['input_ids'].squeeze().to(device)
        }

#Custom trainer
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        loss = outputs['loss'] if 'loss' in outputs else outputs[0]  # Accessing the loss from the outputs
        return (loss, outputs) if return_outputs else loss

    def training_step(self, model, inputs):
        model.train()
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)

        # Check if 'loss' is a key in the outputs dictionary and access it
        loss = outputs['loss'] if 'loss' in outputs else outputs[0]

        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()
        self.optimizer.zero_grad()

        return loss.detach()



# Custom Save Model Callback
class SaveModelCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        model = kwargs['model']
        tokenizer = kwargs['tokenizer']
        epoch_output_dir = f"{args.output_dir}/epoch_{state.epoch + 1}"
        model.save_pretrained(epoch_output_dir)
        tokenizer.save_pretrained(epoch_output_dir)
        print(f"Model saved at {epoch_output_dir} after epoch {state.epoch + 1}")


# Custom model without classification head
class CustomModel(nn.Module):
    def __init__(self, model_name):
        super(CustomModel, self).__init__()
        self.transformer = BartForConditionalGeneration.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask=None, labels=None, decoder_input_ids=None):
        # Get outputs from the transformer model
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask, labels=labels, decoder_input_ids=decoder_input_ids)
        summarization_loss = outputs.loss

        return {
            "loss": summarization_loss,
            "summarization_loss": summarization_loss,
            "logits": outputs.logits,
            "encoder_last_hidden_state": outputs.encoder_last_hidden_state,
            "past_key_values": outputs.past_key_values,
            "decoder_hidden_states": outputs.decoder_hidden_states,
            "decoder_attentions": outputs.decoder_attentions,
            "cross_attentions": outputs.cross_attentions,
            "encoder_hidden_states": outputs.encoder_hidden_states,
            "encoder_attentions": outputs.encoder_attentions,
        }

    def save_pretrained(self, save_directory):
        self.transformer.save_pretrained(save_directory)

    @classmethod
    def from_pretrained(cls, model_name_or_path):
        model = cls(model_name_or_path)
        model.transformer = BartForConditionalGeneration.from_pretrained(model_name_or_path)
        return model

# Pre-process Data
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)  # Remove special characters, but keep some punctuation
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [7]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# Model and Tokenizer Initialization
model_name_or_path = 'sshleifer/distilbart-cnn-12-6'
tokenizer = BartTokenizer.from_pretrained(model_name_or_path)
model = CustomModel(model_name_or_path).to(device)
max_len = 1024
batch_size=2

# Data Preparation (Assuming train_data, val_data are loaded as pandas DataFrames)
fraction = 0.1  # 10% of the data
train_data_sampled = train_data.sample(frac=fraction, random_state=42)
#train_data_sampled['concatenated_text'] = train_data_sampled.apply(lambda row: ' '.join(row['postText']) + ' ' + ' '.join(row['targetTitle']) + ' ' + ' '.join(row['targetParagraphs']), axis=1)

# Define labels
postText_label = "postText"
targetTitle_label = "targetTitle"
targetParagraphs_label = "targetParagraphs"
train_data_sampled['concatenated_text'] = train_data_sampled.apply(
    lambda row: f"{postText_label} {' '.join(row['postText'])} "
                f"{targetTitle_label} {' '.join(row['targetTitle'])} "
                f"{targetParagraphs_label} {' '.join(row['targetParagraphs'])}",
    axis=1
)
train_inputs = train_data_sampled['concatenated_text'].tolist()
train_inputs = [preprocess_text(text) for text in train_inputs]

train_summaries = train_data_sampled['spoiler'].tolist()
train_summaries = [
    preprocess_text(' '.join(text) if isinstance(text, list) else text)
    for text in train_summaries]


#val_data['concatenated_text'] = val_data.apply(lambda row: ' '.join(row['postText']) + ' ' + ' '.join(row['targetTitle']) + ' ' + ' '.join(row['targetParagraphs']), axis=1)
# Define labels
postText_label = "postText"
targetTitle_label = "targetTitle"
targetParagraphs_label = "targetParagraphs"
val_data['concatenated_text'] = val_data.apply(
    lambda row: f"{postText_label} {' '.join(row['postText'])} "
                f"{targetTitle_label} {' '.join(row['targetTitle'])} "
                f"{targetParagraphs_label} {' '.join(row['targetParagraphs'])}",
    axis=1
)
val_inputs = val_data['concatenated_text'].tolist()
val_inputs = [preprocess_text(text) for text in val_inputs]


val_summaries = val_data['spoiler'].tolist()
val_summaries = [
    preprocess_text(' '.join(text) if isinstance(text, list) else text)
    for text in val_summaries]



# Create DataLoader Using CustomDataset
batch_size = 2
train_dataset = CustomDataset(train_inputs, train_summaries, tokenizer, max_len)
val_dataset = CustomDataset(val_inputs, val_summaries, tokenizer, max_len)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)


# Training Arguments

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/DistilBART_summarize_only_5epochs',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    learning_rate=1e-5,
    logging_dir='./logs',
    logging_steps=500,
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_total_limit=5,
    weight_decay=0.001,
    warmup_steps=100,
    fp16=False,  # Ensure this is set to False
)

optimizer = AdamW(model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=len(train_loader) * training_args.num_train_epochs)
save_model_callback = SaveModelCallback()

# Initialize the Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[save_model_callback],  # Note the list brackets
    optimizers=(optimizer, scheduler)
)

# Train the model
trainer.train()



Using device: cuda




Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


Model saved at /content/drive/MyDrive/DistilBART_summarize_only_5epochs/epoch_2.0 after epoch 2.0


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


Model saved at /content/drive/MyDrive/DistilBART_summarize_only_5epochs/epoch_3.0 after epoch 3.0


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


Model saved at /content/drive/MyDrive/DistilBART_summarize_only_5epochs/epoch_4.0 after epoch 4.0


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


Model saved at /content/drive/MyDrive/DistilBART_summarize_only_5epochs/epoch_5.0 after epoch 5.0


RuntimeError: 
            Some tensors share memory, this will lead to duplicate memory on disk and potential differences when loading them again: [{'transformer.model.encoder.embed_tokens.weight', 'transformer.model.decoder.embed_tokens.weight', 'transformer.lm_head.weight', 'transformer.model.shared.weight'}].
            A potential way to correctly save your model is to use `save_model`.
            More information at https://huggingface.co/docs/safetensors/torch_shared_tensors
            