In [1]:
# Install necessary packages
!pip install torch
!pip install transformers
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [4]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from tqdm.notebook import tqdm

In [6]:
# Load your dataset
data_path = "assets/transcripts_notes_long.csv"
dataset = load_dataset('csv', data_files=data_path)

# Split your dataset into training and validation sets
train_size = int(0.8 * len(dataset['train']))
valid_size = len(dataset['train']) - train_size

train_dataset, valid_dataset = torch.utils.data.random_split(dataset['train'], [train_size, valid_size])


In [7]:
class SummarizationDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_input_length=1024, max_output_length=350):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx] if self.texts[idx] is not None else ""
        summary = self.summaries[idx] if self.summaries[idx] is not None else ""

        inputs = self.tokenizer.encode_plus(
            text,
            max_length=self.max_input_length,
            return_tensors='pt',
            truncation=True,
            padding='max_length'
        )

        labels = self.tokenizer.encode(
            summary,
            max_length=self.max_output_length,
            return_tensors='pt',
            truncation=True,
            padding='max_length'
        )

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': labels.squeeze()
        }


In [8]:
# Initialize the tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')



In [10]:
# Create instances of the custom dataset
train_data = SummarizationDataset(train_dataset.dataset['lecture'], train_dataset.dataset['answer'], tokenizer)
valid_data = SummarizationDataset(valid_dataset.dataset['lecture'], valid_dataset.dataset['answer'], tokenizer)


["Hey there, I'm Mike Rugnetta, this is Crash Course Theater, and today we'll be looking at the surviving literature of Roman drama. Because Roman life wasn't all naval battles, naked miming prostitutes, and Christians being eaten by lions. Sometimes, you had to take a break and go watch a play. Much like Roman deities, the most popular form of Roman drama were comedies that borrow heavily from Greek originals, especially the comedies of Menander, with a little bit of Attilan farce thrown into the mix. These comedies are called fabulae paliatae. They have outdoor, urban settings, and are filled with stock characters. The hero of the play is typically a type known as the adulas case, a young man who is in love with the girl next door, the virgo, whom he can't marry because she's of dubious parentage, or he's in love with a prostitute, the miratrix. Then there's the senex, usually the father or the old man, who's either a strict miser or a loose skirt chaser. Other characters are the ser

In [27]:
# Create data loaders
train_loader = DataLoader(train_data, batch_size=2, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=2, shuffle=False)


In [28]:
# Move the model to the GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [29]:
# Define the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 2)




In [30]:
# Fine-tune the model
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(train_loader)
    print(f'Train Loss: {avg_loss}')

    # Validate the model
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for batch in tqdm(valid_loader, desc=f'Validation'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss = outputs.loss
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(valid_loader)
    print(f'Validation Loss: {avg_val_loss}')


Epoch 1/5:   0%|          | 0/40 [00:00<?, ?it/s]

Train Loss: 4.5611927807331085


Epoch 2/5:   0%|          | 0/40 [00:00<?, ?it/s]

Train Loss: 3.204365313053131


Epoch 3/5:   0%|          | 0/40 [00:00<?, ?it/s]

Train Loss: 2.953966099023819


Epoch 4/5:   0%|          | 0/40 [00:00<?, ?it/s]

Train Loss: 3.024439609050751


Epoch 5/5:   0%|          | 0/40 [00:00<?, ?it/s]

Train Loss: 2.9840030670166016


In [32]:
# Save the fine-tuned model
model.save_pretrained('fine_tuned_bart_model')
tokenizer.save_pretrained('fine_tuned_bart_model')


('fine_tuned_bart_model/tokenizer_config.json',
 'fine_tuned_bart_model/special_tokens_map.json',
 'fine_tuned_bart_model/vocab.json',
 'fine_tuned_bart_model/merges.txt',
 'fine_tuned_bart_model/added_tokens.json')

In [6]:
# Load the fine-tuned model and tokenizer
model = BartForConditionalGeneration.from_pretrained('fine_tuned_bart_model')
tokenizer = BartTokenizer.from_pretrained('fine_tuned_bart_model')

# Sample lecture transcript
test_lec = ""

with open('assets/test.txt', 'r') as f:
    test_lec += f.read()


# Tokenize and generate summary
inputs = tokenizer(test_lec, return_tensors="pt", max_length=1024, truncation=True)
summary_ids = model.generate(inputs['input_ids'], max_length=650, num_beams=4, length_penalty=2.0, early_stopping=True)

# Decode the generated summary
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Generated Summary:", generated_summary)

Generated Summary: Topic: Citizen Kane

- Citizen Kane was a great film, but it came from ambition and some unusual circumstances with Orson Welles at the helm.
- The film's technical cleverness and story still resonate with audiences today
- It was a long time since its release, in 1941, and it resonated with audiences across the Golden Age of Hollywood
- Charles Foster Kane's deathbed in a massive mansion called Xanadu, dramatically murmuring his last word
- There were a lot of unusual circumstances that made it difficult for Kane to make the film, including poor lighting, dazzling special effects, and characters who were just straight-up stereotypes
- In 1941, the film was directed and narrated by Orson Wells, who was an experienced screenwriter by the time he worked on the project
- Kane's final words were revealed in flashbacks through interviews with people Kane once knew.
  - The story is told in flashbacks, with Kane's last word appearing in flashback through interview with Kan