In [None]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

## Ingestion

In [1]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
import minsearch
import pandas as pd
from tqdm.auto import tqdm
import json

In [2]:
from openai import OpenAI
client = OpenAI()

In [3]:
def generate_transcript(video_id):
    try:
        # Check available transcripts for the video
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        transcript = transcript_list.find_transcript([t.language_code for t in transcript_list])
        
        language_code = transcript.language_code
        language = transcript.language

        print(f"Pulling transcript for video {video_id} in {language}")
        # Fetch the transcript text
        fetched_transcript = transcript.fetch()
        
    except TranscriptsDisabled:
        return "Subtitles are disabled for this video."
    except NoTranscriptFound:
        return "No transcript available in any language."
    except Exception as e:
        return str(e)

    metadata = {
        "language_code": language_code,
        "language": language,
        "generated": True

    }
    return fetched_transcript, metadata

In [4]:
clean_transcript_prompt = '''
You are a professional editor with expertise in data science. Transform the following podcast transcript into clear, readable text while preserving all original information.

Instructions:

1. Include All Content: Ensure the entire transcript content appears in the final output. Do not omit any key parts.
2. Remove Filler Words and Sounds: Eliminate filler words like "so," "right," "like" when they add no value. Remove any hums, "mhms," or similar sounds.
3. Enhance Sentence Clarity: Rephrase sentences for clarity and grammatical correctness:
- Avoid starting sentences with conjunctions like "And."
- Reframe any sentences that end with "right" into questions if possible.
4. Structure in Paragraphs: Use a clear paragraph structure:
- Logical Breaks: Begin a new paragraph at the end of each thought to enhance readability.
- Paragraph Length: Limit paragraphs to 5-6 sentences for better flow.
5. Major Topic Shifts: For noticeable shifts in topic, insert a separator ~~ between blocks of paragraphs.
6. Subtitles and Conclusion:
- Subtitles: Start each major section with a subtitle summarizing the main topic.
- Conclusion: Add a "Conclusion" subtitle summarizing key points from all sections, not just the final one. Ensure the summary covers main ideas and themes across the transcript.

Output format : 
[
    {{
        "subtitle": "<subtitle>",
        "text": "Sentence 1 of paragraph 1. Sentence 2 of paragraph 1. Sentence 3 of paragraph 1..."
    }},
    {{
        "subtitle": "<subtitle>",
        "text": "Sentence 1 of paragraph 1. Sentence 2 of paragraph 1. Sentence 3 of paragraph 1..."
    }},
    ........
    {{
        "subtitle": "<Conclusion>",
        "text": "A comprehensive summary capturing the main ideas, what is the video about, and themes discussed across all sections."

    }}
]
transcript:
{transcript}
'''

def get_clean_transcript_json_formated(transcript, model='gpt-4o-mini'):
    transcript_chunk = [chunk['text'] for chunk in transcript]
    prompt = clean_transcript_prompt.format(transcript=transcript_chunk)
    response = client.chat.completions.create(
        model = model,
        messages=[{"role":"user", "content": prompt}]
    )

    return response.choices[0].message.content

In [5]:
import os
# Directory to save individual transcript files
TRANSCRIPTS_DIR = "../data/transcripts/"

# Ensure the directory exists
os.makedirs(TRANSCRIPTS_DIR, exist_ok=True)

# Helper function to construct the unique file path for a video transcript
def get_transcript_file_path(video_id):
    filename = f"{video_id}.csv"
    return os.path.join(TRANSCRIPTS_DIR, filename)

def sanitize_video_id(video_id):
    """Convert hyphens in video ID to underscores for file compatibility."""
    return video_id.replace('-', '_')

# Function to check if a transcript file exists for the specified video
def check_existing_transcript(video_id):
    file_path = get_transcript_file_path(video_id)
    
    # Check if the file exists, and if so, return the content as a list of dictionaries
    if os.path.exists(file_path):
        return pd.read_csv(file_path).to_dict(orient='records')
    else:
        return None

# Function to save transcript chunks to a uniquely named CSV file
def save_transcript_chunks(video_id, chunks):
    file_path = get_transcript_file_path(video_id)
    chunks_df = pd.DataFrame(chunks)
    chunks_df.to_csv(file_path, index=False)


In [6]:
def generate_chunked_transcript(video_id, max_chunk_length=250, overlap_length=50, min_chunk_length=100):
    
    # Check if the file already exists based on actual language metadata
    existing_transcript = check_existing_transcript(video_id)
    if existing_transcript:
        metadata = {
            "language_code": existing_transcript[0]['language_code'],
            "language": existing_transcript[0]['language'],
            "generated": False
        }
        print(f"Transcript already generated for video ID.")
        return existing_transcript, metadata

    # Generate transcript only if it does not already exist
    raw_transcript, metadata = generate_transcript(video_id)
    if isinstance(raw_transcript, str):  # If the transcript generation returns an error message, return it
        return raw_transcript
    
    # Generate clean transcript
    clean_transcript = get_clean_transcript_json_formated(raw_transcript)
    transcript = json.loads(clean_transcript)
    # Split the transcript into chunks
    
    chunks = []
    chunk_id = 0

    for section in transcript:
        subtitle = section.get("subtitle", "")
        text = section.get("text", "").replace("\n", " ").replace("\t", " ")
        words = text.split(" ")
        current_chunk = ""

        for word in words:
            if len(current_chunk) + len(word) + 1 > max_chunk_length:
                if current_chunk and len(current_chunk) >= min_chunk_length:
                    overlap_words = current_chunk.split()[-(overlap_length // 5):]
                    overlap_part = " ".join(overlap_words)
                    chunks.append({
                        "video_id": sanitize_video_id(video_id),
                        "language_code": metadata['language_code'],
                        "language": metadata['language'],                        
                        "subtitle": subtitle,
                        "chunk_id": chunk_id,
                        "text_chunk": current_chunk.strip()
                    })
                    chunk_id += 1
                    current_chunk = overlap_part + " " + word
                else:
                    current_chunk += " " + word
            else:
                current_chunk += " " + word

        if current_chunk.strip():
            chunks.append({
                "video_id": sanitize_video_id(video_id),
                "language_code": metadata['language_code'],
                "language": metadata['language'],                        
                "subtitle": subtitle,
                "chunk_id": chunk_id,
                "text_chunk": current_chunk.strip()
            })
            chunk_id += 1
    
    # Save chunks to file using actual `original_language`
    save_transcript_chunks(video_id, chunks)
    print(f"Transcript generated successfully for video ID {video_id}.")

    
    return chunks, metadata


In [15]:
chunked_transcript,metadata = generate_chunked_transcript('L2GKmEH-gdg')

Transcript already generated for video ID.


In [8]:
metadata

{'language_code': 'id',
 'language': 'Indonesian (auto-generated)',
 'generated': False}

In [22]:
index = None

In [26]:
import minsearch

def initialize_and_load_index(chunked_transcript=None):
    """
    Initializes the global index if it is not already an instance of the Index class, and load it if it is already initialized
    
    Returns:
        Index: An instance of the Index class.
    """
    global index
    #initialize index if not initialize
    if not isinstance(index, minsearch.Index):
        index = minsearch.Index(
            text_fields=['subtitle', 'text_chunk'],
            keyword_fields=['video_id', 'chunk_id', 'language_code', 'language']
        )
        if chunked_transcript is not None:
            video_id = chunked_transcript[0]['video_id']
            index.fit(chunked_transcript)
            print(f"{video_id} are indexed successfully.")

    else:
        if chunked_transcript is not None:
            video_id = chunked_transcript[0]['video_id']
            index.fit(chunked_transcript)
            print(f"{video_id} are indexed successfully.")
    return index

In [27]:
index = initialize_and_load_index(chunked_transcript)

L2GKmEH_gdg are indexed successfully.


In [28]:
def search(query,video_id):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={"video_id": sanitize_video_id(video_id)},
        boost_dict=boost,
        num_results=5
    )
    return results

In [29]:
search("video ini tentang apa?", "L2GKmEH-gdg")

[{'video_id': 'L2GKmEH_gdg',
  'language_code': 'id',
  'language': 'Indonesian (auto-generated)',
  'subtitle': 'Keterpurukan di Liga Champions',
  'chunk_id': 11,
  'text_chunk': 'fans geram, dan pemain berusia 25 tahun ini disebut sebagai faktor utama kekalahan tim. Kritikan juga menyebut Mbappé sebagai pemain yang malas karena kurang berkontribusi saat bertahan.'},
 {'video_id': 'L2GKmEH_gdg',
  'language_code': 'id',
  'language': 'Indonesian (auto-generated)',
  'subtitle': 'Kekalahan Melawan AC Milan',
  'chunk_id': 13,
  'text_chunk': 'setelah mencetak gol. Hasil negatif ini memperpanjang rekor buruk Los Blancos di Liga Champions. Performanya yang kurang baik membuat Real Madrid semakin kesulitan untuk mempertahankan gelar, meskipun mereka diprediksi sebagai salah satu unggulan'},
 {'video_id': 'L2GKmEH_gdg',
  'language_code': 'id',
  'language': 'Indonesian (auto-generated)',
  'subtitle': 'Kritik terhadap Mbappé',
  'chunk_id': 4,
  'text_chunk': 'setiap gelar musim ini. Nam

In [124]:
prompt_template = """
You're a video assistant. Answer the QUESTION based on the CONTEXT from our YouTube transcript chunks.
- If the QUESTION and CONTEXT are in different languages, translate the QUESTION to match the CONTEXT language before answering.
- Use only the information from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

entry_template = """
subtitle: {subtitle}
text_chunk: {text_chunk}
""".strip()

def build_prompt(query, transcript_chunks):
    context = ""
    
    for chunk in transcript_chunks:
        context += entry_template.format(**chunk) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt


In [125]:
def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model = model,
        messages=[{"role":"user", "content": prompt}]
    )

    return response.choices[0].message.content


In [126]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer


In [143]:
query = 'video ini tentang apa?'
rag(query)

'Silakan berikan potongan transkrip dari video tersebut agar saya dapat membantu menjawab pertanyaan tentang isi video ini.'

In [66]:
df = pd.DataFrame(chunked_transcript)

df.to_csv('../data/transcript_chunk.csv')

## Retrieval evaluation

In [68]:
df_question = pd.read_csv("../data/ground-truth-retrieval.csv")

In [69]:
df_question.head()

Unnamed: 0,id,question
0,0,What is the purpose of the office hours sessio...
1,0,How many people have joined the office hours?
2,0,Who asked the first question during the office...
3,0,How long has it been since the last office hou...
4,0,What greeting does the host use to welcome par...


In [70]:
ground_truth = df_question.to_dict(orient='records')

In [71]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [72]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [73]:
def evaluate(ground_truth, search_function):
    relevance_total = []
    
    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['chunk_id']== doc_id for d in results]
        relevance_total.append(relevance)
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total)
    }


In [74]:
evaluate(ground_truth,lambda q: minsearch_search(q['question']))

  0%|          | 0/205 [00:00<?, ?it/s]

{'hit_rate': 0.9219512195121952, 'mrr': 0.5398993418505613}

## Finding best parameters

In [75]:
df_validation = df_question[:100]
df_test = df_question[100:]

In [76]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf') # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        #generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(mini_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)

        #evaluate the objective function
        current_score = objective_function(current_params)

        # Update best if current is better
        if current_score > best_score: # Change to > if maximizing
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [77]:
gt_val = df_validation.to_dict(orient='records')

In [78]:
def minsearch_search(query, video_id,boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={"video_id": video_id},
        boost_dict=boost,
        num_results=10
    )

    return results

In [79]:
param_ranges = {
    'subtitle': (0.0, 3.0),
    'original_language': (0.0, 1.0),
    'translate_language': (0.0, 1.0),
    'chunk_text': (0.0, 3.0),
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], 'pA9S1mTqAwU',boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [89]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'subtitle': 0.14229395552278223,
  'original_language': 0.69032315975889,
  'translate_language': 0.05899131672045521,
  'chunk_text': 0.9115689008437606},
 0.8157301587301587)

In [90]:
def minsearch_improved(query):
    boost = {
        'subtitle': 0.14229395552278223,
        'original_language': 0.69032315975889,
        'translate_language': 0.05899131672045521,
        'chunk_text': 0.9115689008437606
    }

    results = index.search(
        query=query,
        filter_dict={},
        num_results=10
    )
    
    return results

In [91]:
evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/205 [00:00<?, ?it/s]

{'hit_rate': 0.9219512195121952, 'mrr': 0.5398993418505613}

## RAG evaluation

In [92]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [93]:
len(ground_truth)

205

In [94]:
record = ground_truth[0]

In [95]:
answer_llm = rag(record['question'])

In [96]:
print(answer_llm)

El propósito de la sesión de horas de oficina mencionada en la transcripción es responder a las preguntas de los estudiantes, especialmente sobre los proyectos, aunque también se les invita a hacer preguntas que no estén directamente relacionadas con los proyectos.


In [97]:
prompt = prompt2_template.format(question=record['question'], answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What is the purpose of the office hours session mentioned in the transcript?
Generated Answer: El propósito de la sesión de horas de oficina mencionada en la transcripción es responder a las preguntas de los estudiantes, especialmente sobre los proyectos, aunque también se les invita a hacer preguntas que no estén directamente relacionadas con los proyectos.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [98]:
df_sample = df_question.sample(n=100, random_state=1)

In [99]:
sample = df_sample.to_dict(orient='records')

In [100]:
def rag(query, model='gpt-4o-mini'):
    search_results = minsearch_improved(query)
    prompt = build_prompt(query, search_results)
    #print(prompt)
    answer = llm(prompt, model=model)
    return answer

In [101]:
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question)

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/100 [00:00<?, ?it/s]

In [102]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']


In [103]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.85
PARTLY_RELEVANT    0.14
NON_RELEVANT       0.01
Name: proportion, dtype: float64

In [133]:
df_eval.to_csv('../data/rag-eval-gpt-4o-mini.csv', index=False)

In [135]:
df_eval[df_eval.relevance == 'PARTLY_RELEVANT'][:5]

Unnamed: 0,answer,id,question,relevance,explanation
3,The notable differences in evaluation criteria...,8,What are the notable differences in evaluation...,PARTLY_RELEVANT,The generated answer hints at the existence of...
7,The purpose of the pinned link in the chat is ...,3,What is the purpose of the pinned link in the ...,PARTLY_RELEVANT,The generated answer touches on the purpose of...
14,A common issue with YouTube video transcripts ...,17,What is a common issue with YouTube video tran...,PARTLY_RELEVANT,The generated answer identifies an issue relat...
16,Different metrics can be utilized to assess pe...,23,What types of metrics can be used for assessment?,PARTLY_RELEVANT,The generated answer discusses metrics related...
19,Project deliverables are crucial as they ensur...,31,Can you explain the importance of project deli...,PARTLY_RELEVANT,The answer discusses the importance of project...
