In [2]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter

In [3]:
def get_youtube_transcript(video_id):
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_chunks = []

        for entry in transcript:
            start = entry['start']
            end = entry['start'] + entry['duration']
            text = entry['text']
            transcript_chunks.append({
                'start': start,
                'end': end,
                'text': text
            })

        return transcript_chunks

    except Exception as e:
        print(f"Error: {e}")
        return []

In [4]:
# Example usage (replace with your own YouTube video ID)
video_url = "https://www.youtube.com/watch?v=QT2FGbR0nIM"  # Example
video_id = video_url.split("v=")[-1]

# Call the function and store the transcript
transcript_data = get_youtube_transcript(video_id)

# Print first 5 chunks for verification
for entry in transcript_data[:5]:
    print(f"[{entry['start']:.2f}s - {entry['end']:.2f}s] {entry['text']}")

[0.00s - 11.72s] Heat. Heat.
[2.84s - 11.72s] [Music]
[16.06s - 34.49s] [Music]
[34.80s - 39.84s] If you were to describe Ted the human in
[38.00s - 42.84s] three words, what would the three words


In [39]:
def chunk_transcript(transcript_data, chunk_duration=120):
    chunks = []
    current_chunk = {
        "start": transcript_data[0]["start"],
        "end": transcript_data[0]["end"],
        "text": transcript_data[0]["text"]
    }

    for entry in transcript_data[1:]:
        if entry["start"] - current_chunk["start"] <= chunk_duration:
            current_chunk["end"] = entry["end"]
            current_chunk["text"] += " " + entry["text"]
        else:
            chunks.append(current_chunk)
            current_chunk = {
                "start": entry["start"],
                "end": entry["end"],
                "text": entry["text"]
            }

    chunks.append(current_chunk)  # Append the last chunk
    return chunks

In [40]:
chunked_transcript = chunk_transcript(transcript_data, chunk_duration=60)

# Print first 3 chunks for inspection
for i, chunk in enumerate(chunked_transcript[:3]):
    print(f"\nChunk {i+1} [{chunk['start']:.2f}s - {chunk['end']:.2f}s]:\n{chunk['text'][:300]}...")


Chunk 1 [0.00s - 65.76s]:
Heat. Heat. [Music] [Music] If you were to describe Ted the human in three words, what would the three words be? [Music] Hi Ted. Thank you. Good. Thanks for having me. Thank you for coming. Uh so...

Chunk 2 [62.00s - 124.40s]:
you've been in India for 12 hours now. About about 12 hours. So how does it feel? I love being coming to India so much and it's I never get enough time here. Usually I come and I've got to they've got me working and grinding me out and then shoot me back out back in the sky. But it's a it's a it's I...

Chunk 3 [122.08s - 186.48s]:
Angeles when his son was in school. Uh so we got to go out to dinner. So going having dinner with Sharakhan in India is much different than having dinner with him in in Los Angeles. Uh but so I would say I mean that's probably who someone I know the most and really enjoy working with. So you've been...


In [41]:
print("Total number of chunks:",len(chunked_transcript))  

Total number of chunks: 112


In [42]:
from keybert import KeyBERT

# Load the default model (you can switch later if needed)
kw_model = KeyBERT(model='all-MiniLM-L6-v2')


In [43]:
# Join all chunk texts into one large document
full_text = " ".join([chunk["text"] for chunk in chunked_transcript])

# Extract keywords (phrases), adjust top_n as needed
keywords = kw_model.extract_keywords(full_text, top_n=50, stop_words='english')

# Print top 10 keywords for inspection
for kw, score in keywords[:50]:
    print(f"{kw}: {score:.4f}")


indian: 0.3540
heat: 0.3513
bollywood: 0.3446
energy: 0.3299
bombay: 0.3263
sharma: 0.3206
india: 0.2970
khan: 0.2864
entertain: 0.2857
lit: 0.2847
bangjun: 0.2741
arrive: 0.2702
shah: 0.2688
enjoying: 0.2659
ruk: 0.2636
passionate: 0.2574
bring: 0.2536
exciting: 0.2532
kyunki: 0.2518
traveling: 0.2492
loving: 0.2480
entertained: 0.2478
fleeting: 0.2478
entertaining: 0.2459
passion: 0.2456
ted: 0.2445
travel: 0.2431
power: 0.2407
dinner: 0.2396
satisfaction: 0.2391
sensation: 0.2386
love: 0.2355
lover: 0.2321
vacation: 0.2311
feel: 0.2307
reflective: 0.2304
consume: 0.2283
music: 0.2272
meet: 0.2261
bangalore: 0.2261
bringing: 0.2251
appetizing: 0.2243
uta: 0.2243
explore: 0.2241
festival: 0.2240
adventurous: 0.2237
touring: 0.2236
phenomenal: 0.2229
burned: 0.2227
met: 0.2222


In [44]:
# Create a flat list of keywords (not scores), for fast matching
top_keywords = [kw[0].lower() for kw in keywords]  # Only keep the keyword string, lowercase for match

# Score each chunk by how many top keywords appear in it
def score_chunks_by_keywords(chunks, keywords):
    scored_chunks = []
    for chunk in chunks:
        text = chunk["text"].lower()
        keyword_hits = sum(1 for kw in keywords if kw in text)
        scored_chunks.append({
            "start": chunk["start"],
            "end": chunk["end"],
            "text": chunk["text"],
            "keyword_score": keyword_hits
        })
    return scored_chunks


In [45]:
# Run the scoring function
scored_chunks = score_chunks_by_keywords(chunked_transcript, top_keywords)

# Sort by keyword density
ranked_chunks = sorted(scored_chunks, key=lambda x: x["keyword_score"], reverse=True)

# Show top 5 most keyword-dense chunks
for i, chunk in enumerate(ranked_chunks[:5]):
    print(f"\nRank {i+1} | Score: {chunk['keyword_score']} | Time: {chunk['start']:.2f}s - {chunk['end']:.2f}s\n")
    print(chunk['text'] + "...")



Rank 1 | Score: 11 | Time: 62.00s - 124.40s

you've been in India for 12 hours now. About about 12 hours. So how does it feel? I love being coming to India so much and it's I never get enough time here. Usually I come and I've got to they've got me working and grinding me out and then shoot me back out back in the sky. But it's a it's a it's I love the energy of India. What have you done so far today? Today I uh came in I met with the ministers and I met I went to the waves conference and spoke on stage and was interviewed. Uh do you have a favorite creator in India? That's like you know we work with so many it'd be like picking your kids but you know picking your favorite child almost. But I have to tell you early early in coming to India I met uh Shah Ruk Khan right away early and we just he hosted a very nice little dinner for for me and we just hit it off immediately and uh I've come back since and my with my wife and we've had a a nice times together. Uh he we visited with each o

In [46]:
from sentence_transformers import SentenceTransformer, util

# Load the sentence embedding model
embed_model = SentenceTransformer('all-MiniLM-L6-v2')


In [47]:
# Join top keywords into a single "summary string"
global_theme_text = " ".join([kw for kw in top_keywords[:30]])
global_theme_emb = embed_model.encode(global_theme_text, convert_to_tensor=True)

In [48]:
def add_embedding_scores(chunks, global_emb, model):
    for chunk in chunks:
        chunk_emb = model.encode(chunk["text"], convert_to_tensor=True)
        sim_score = util.pytorch_cos_sim(chunk_emb, global_emb).item()
        chunk["embedding_score"] = sim_score
    return chunks


In [49]:
scored_chunks = add_embedding_scores(scored_chunks, global_theme_emb, embed_model)


In [50]:
def hybrid_score_chunks(chunks, weight_keywords=0.5, weight_embedding=0.5):
    # Normalize scores for fair combination
    max_kw = max(chunk["keyword_score"] for chunk in chunks)
    max_emb = max(chunk["embedding_score"] for chunk in chunks)

    for chunk in chunks:
        norm_kw = chunk["keyword_score"] / max_kw if max_kw > 0 else 0
        norm_emb = chunk["embedding_score"] / max_emb if max_emb > 0 else 0
        chunk["hybrid_score"] = weight_keywords * norm_kw + weight_embedding * norm_emb

    return sorted(chunks, key=lambda x: x["hybrid_score"], reverse=True)


In [51]:
ranked_chunks = hybrid_score_chunks(scored_chunks, weight_keywords=0.5, weight_embedding=0.5)

# Preview top results
for i, chunk in enumerate(ranked_chunks[:5]):
    print(f"\nRank {i+1} | Hybrid Score: {chunk['hybrid_score']:.4f} | Time: {chunk['start']:.2f}s - {chunk['end']:.2f}s\n")
    print(chunk['text'][:300] + "...")



Rank 1 | Hybrid Score: 1.0000 | Time: 62.00s - 124.40s

you've been in India for 12 hours now. About about 12 hours. So how does it feel? I love being coming to India so much and it's I never get enough time here. Usually I come and I've got to they've got me working and grinding me out and then shoot me back out back in the sky. But it's a it's a it's I...

Rank 2 | Hybrid Score: 0.7293 | Time: 6824.48s - 6880.44s

away. Yeah. And I feel like it's never been more true right now. I mean, it just feels like there India is on a precipice of something very big. Uh and this is probably the the most exciting time. uh that I've been aware of in India's history. Done. All right. Thank you. Good time. Thanks everyone. ...

Rank 3 | Hybrid Score: 0.6763 | Time: 2580.40s - 2643.68s

does not have access to a movie theater. The experience where for $20 a movie, a family of three or four can be entertained in a safe environment with air conditioning and food if you choose to Yeah. access food and

In [53]:
from datasets import load_dataset

dataset = load_dataset("vwxyzjn/summarize_from_feedback_tldr_3_filtered", split="train")


README.md:   0%|          | 0.00/261 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train.jsonl:   0%|          | 0.00/186M [00:00<?, ?B/s]

valid.jsonl:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/10.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/116722 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6447 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6553 [00:00<?, ? examples/s]

In [54]:
dataset[0]  # Shows the first example


{'id': 't3_1hxu8s',
 'subreddit': 'relationships',
 'title': 'I (f/22) have to figure out if I want to still know these girls or not and would hate to sound insulting',
 'post': "Not sure if this belongs here but it's worth a try. \n\nBackstory:\nWhen I (f/22) went through my first real breakup 2 years ago because he needed space after a year of dating roand  it effected me more than I thought. It was a horrible time in my life due to living with my mother and finally having the chance to cut her out of my life. I can admit because of it was an emotional wreck and this guy was stable and didn't know how to deal with me. We ended by him avoiding for a month or so after going to a festival with my friends. When I think back I wish he just ended. So after he ended it added my depression I suffered but my friends helped me through it and I got rid of everything from him along with cutting contact. \n\nNow: Its been almost 3 years now and I've gotten better after counselling and mild anti d

In [None]:
def format_for_t5(example):
    return {
        "input": f"Summarize this: {example['post']}",
        "output": example['summary']
    }

# Apply formatting to each split
formatted_dataset = dataset.map(format_for_t5)


Map:   0%|          | 0/116722 [00:00<?, ? examples/s]

In [59]:
print(formatted_dataset)

Dataset({
    features: ['id', 'subreddit', 'title', 'post', 'summary', 'input', 'output'],
    num_rows: 116722
})


In [60]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType

# Step 1: Load the base model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

# Step 2: Define LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],  # or ["q_proj", "v_proj"] depending on the model
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM  # Since we're doing summarization
)

# Step 3: Apply LoRA to the model
model = get_peft_model(model, lora_config)


config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [61]:
def tokenize_function(example):
    # Tokenize input without truncating
    model_inputs = tokenizer(
        example["input"],
        padding="longest",   # or "max_length" if batching later
        truncation=False     # disables cutting off long inputs
    )

    # Tokenize output with reasonable max_length (still keep short summaries)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["output"],
            padding="longest",   # to pad to longest example in batch
            truncation=True,
            max_length=128       # summaries should remain concise
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [94]:
tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/116722 [00:00<?, ? examples/s]

  self._in_target_context_manager = False


In [95]:
tokenized_dataset = tokenized_dataset[:1000] # Limit to 1000 examples for faster training/testing

In [97]:
from datasets import Dataset, DatasetDict

# Convert dict to HuggingFace Dataset
tokenized_hf_dataset = Dataset.from_dict(tokenized_dataset)

# 90% train, 10% validation
dataset_split = tokenized_hf_dataset.train_test_split(test_size=0.1, seed=42)

# Wrap into a DatasetDict for easier access later
dataset_split = DatasetDict({
    "train": dataset_split["train"],
    "eval": dataset_split["test"]
})

# Quick check
print(dataset_split)


DatasetDict({
    train: Dataset({
        features: ['id', 'subreddit', 'title', 'post', 'summary', 'input', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 900
    })
    eval: Dataset({
        features: ['id', 'subreddit', 'title', 'post', 'summary', 'input', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})


In [98]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

# Create train and eval dataloaders
train_dataloader = DataLoader(
    dataset_split["train"],
    batch_size=4,  # You can adjust depending on your GPU
    shuffle=True,
    collate_fn=default_data_collator
)

eval_dataloader = DataLoader(
    dataset_split["eval"],
    batch_size=4,
    collate_fn=default_data_collator
)


In [99]:
import torch
from torch.optim import AdamW

# Automatically choose GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the selected device
model.to(device)

# Use AdamW optimizer for fine-tuning transformers
optimizer = AdamW(model.parameters(), lr=2e-4)


In [100]:
# Get only the keys that matter
columns_to_keep = ["input_ids", "attention_mask", "labels"]

# Remove all other columns
dataset_split = dataset_split.remove_columns(
    [col for col in dataset_split["train"].column_names if col not in columns_to_keep]
)


In [101]:
from transformers import DataCollatorForSeq2Seq
from torch.utils.data import DataLoader

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

train_dataloader = DataLoader(
    dataset_split["train"],
    batch_size=4,
    shuffle=True,
    collate_fn=data_collator
)

eval_dataloader = DataLoader(
    dataset_split["eval"],
    batch_size=4,
    collate_fn=data_collator
)


In [102]:
from tqdm import tqdm

model.train()
for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if step % 100 == 0:
        print(f"Step {step} - Loss: {loss.item():.4f}")


Training:   0%|          | 1/225 [00:09<33:52,  9.08s/it]

Step 0 - Loss: 3.8051


Training:  45%|████▍     | 101/225 [16:29<35:28, 17.17s/it]

Step 100 - Loss: 3.2303


Training:  89%|████████▉ | 201/225 [36:42<03:28,  8.70s/it]

Step 200 - Loss: 2.9209


Training: 100%|██████████| 225/225 [39:58<00:00, 10.66s/it]


In [105]:
import evaluate
from tqdm import tqdm

# Load ROUGE metric
rouge = evaluate.load("rouge")

model.eval()
predictions = []
references = []

for batch in tqdm(eval_dataloader, desc="Evaluating"):
    # Move to device
    batch = {k: v.to(device) for k, v in batch.items()}
    
    # Generate summaries
    with torch.no_grad():
        outputs = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_new_tokens=100
        )
    
    # Decode
    decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

    # Store for metric
    predictions.extend(decoded_preds)
    references.extend(decoded_labels)

# Compute ROUGE scores
results = rouge.compute(predictions=predictions, references=references)

# Print results
for k, v in results.items():
    print(f"{k}: {v:.4f}")


Evaluating: 100%|██████████| 25/25 [01:57<00:00,  4.71s/it]


rouge1: 0.2644
rouge2: 0.0772
rougeL: 0.2018
rougeLsum: 0.2020


In [106]:
# Save the LoRA adapter weights
model.save_pretrained("../models/flan_t5_lora_summary")

# Save the tokenizer for future use
tokenizer.save_pretrained("../models/flan_t5_lora_summary")


('../models/flan_t5_lora_summary\\tokenizer_config.json',
 '../models/flan_t5_lora_summary\\special_tokens_map.json',
 '../models/flan_t5_lora_summary\\tokenizer.json')

In [108]:
model.eval()  # Set model to inference mode

for i, chunk in enumerate(ranked_chunks[:20]):
    input_text = f"Summarize this: {chunk['text']}"

    # Tokenize input
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(model.device)

    # Generate summary
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=100,  # adjust if needed
            num_beams=4,
            early_stopping=True
        )

    # Decode output
    summary = tokenizer.decode(output[0], skip_special_tokens=True)

    # Print result
    print(f"\n🧩 Chunk {i+1} | Time: {chunk['start']:.2f}s - {chunk['end']:.2f}s")
    print("📜 Summary:", summary)



🧩 Chunk 1 | Time: 62.00s - 124.40s
📜 Summary: I love being in India so much and it's I never get enough time here. What have you done so far?

🧩 Chunk 2 | Time: 6824.48s - 6880.44s
📜 Summary: I've watched the first two watched the first two episodes so far. What do you think of it?

🧩 Chunk 3 | Time: 2580.40s - 2643.68s
📜 Summary: I don't think movie theaters are doing as well as they were doing five or 10 years ago.

🧩 Chunk 4 | Time: 6210.88s - 6274.40s
📜 Summary: I'm sure that you have learned some non-obvious insights you have learned that I won't know.

🧩 Chunk 5 | Time: 5596.40s - 5656.32s
📜 Summary: I'm going to try it. Can you take spice?

🧩 Chunk 6 | Time: 2028.56s - 2092.80s
📜 Summary: You're saying Bollywood should not try and sell or make the stories of Hollywood and Hollywood should not try and make the stories of Bollywood.

🧩 Chunk 7 | Time: 4737.28s - 4801.20s
📜 Summary: India and what is not working for you in India?

🧩 Chunk 8 | Time: 4859.16s - 4922.16s
📜 Summary: I