## Step 1
- Load the DataSet
- Organise Text Using Time Stamps
- Extract Question
- Extract Answer
- Extract Trancript

In [1]:
# loading the json data

import json

file_path = "../dataset.json"

# Open the file in read mode
with open(file_path, 'r') as json_file:
    # Load JSON data from the file
    data = json.load(json_file)

# Now 'data' contains the JSON data from the file

In [15]:
# Data Size
print(len(data))

2332


In [13]:
def group_elements_by_video_id(json_data):
    """
    Groups elements in JSON data based on the 'videoID' field.

    Args:
    json_data (dict): A dictionary containing JSON data where each key represents a JSON element.
                      Each JSON element should contain a 'videoID' key.

    Returns:
    dict: A dictionary where keys are unique 'videoID' values and values are lists of JSON elements with the same 'videoID'.
    """
    grouped_elements = {}
    for key, value in json_data.items():
        video_id = value.get("videoID")
        if video_id:
            if video_id not in grouped_elements:
                grouped_elements[video_id] = []
            grouped_elements[video_id].append(value)
    return grouped_elements

In [16]:
# Grouping data based on video id to see if tere are questions on same video
grouped_data = group_elements_by_video_id(data)

print(len(grouped_data))

2004


In [33]:
count = 0
for video_id, elements in grouped_data.items():
    if len(elements) > 1:
        count+=1
        print(f"Video ID: {video_id}")
print(count)

Video ID: 3ah9UPsY1IU
Video ID: LogWmfXfxgk
Video ID: c5AXj6afENc
Video ID: AXCh6rW2LbY
Video ID: EtVkwVKLc_M
Video ID: 5UJFTRWXDPo
Video ID: 7xDmCLP5mhY
Video ID: W0ZF5SnYTzM
Video ID: Ju7gza7ExhU
Video ID: 5xs8jcoHTwg
Video ID: ZkGfslseYWA
Video ID: J1DILo1Ewao
Video ID: YADYD28wIWU
Video ID: WActrrC9cr0
Video ID: AJtMlTnqyw0
Video ID: W8KJKwq2R3s
Video ID: CGmJbwvt3Hk
Video ID: MLdveEPhkXA
Video ID: T7mYB6x68DY
Video ID: krkIPLlMwgk
Video ID: TOlyjhgtTEc
Video ID: qgpPfSiLjr0
Video ID: SCWoI0X4jw0
Video ID: 9nff7C9vv1s
Video ID: u_M5MaxLL5U
Video ID: sv3TXMSv6Lw
Video ID: y0mfNWKBQp0
Video ID: 5Sa9nYKiYg0
Video ID: 7AGLY0Pgd94
Video ID: ZQv7MbbbUZw
Video ID: Oslnt9Zr9Jw
Video ID: UPqnb6fbW_o
Video ID: YQ9kGMas-Xc
Video ID: K18LPVV_71Q
Video ID: TglNG-yjabU
Video ID: VTnDslKW6y0
Video ID: WUbNIzWSNR4
Video ID: RRKMtAWjQ3U
Video ID: XWSJyNlAIRk
Video ID: 9JIPrE0aY2I
Video ID: WWsY8IaoAV8
Video ID: uF7uw3wb5xQ
Video ID: 3JKN2AV1qQQ
Video ID: 8fNzjUGbVT8
Video ID: 7rC80DafNWs
Video ID: 

In [24]:
def concat_captions_from_grouped_data(grouped_data):
    """
    Concatenates captions from the grouped data, taking one caption per group from the first element of each group.

    Args:
    grouped_data (dict): A dictionary where keys are unique 'videoID' values and values are lists of JSON elements with the same 'videoID'.

    Returns:
    dict: A dictionary where keys are unique 'videoID' values and values are concatenated captions from the first element of each group.
    """
    concatenated_captions = {}
    for video_id, elements in grouped_data.items():
        if elements:  # Ensure there are elements in the group
            first_element = elements[0]  # Get the first element of the group
            if first_element.get("caption"):
                caption_texts = [caption["text"] for caption in first_element["caption"]]
                concatenated_captions[video_id] = " ".join(caption_texts)
    return concatenated_captions

In [27]:
# Concatenate captions from the grouped data
concatenated_captions = concat_captions_from_grouped_data(grouped_data)

print(list(concatenated_captions.values())[0])

Hey Good morning Everybody today! I'm going to show you how to make a holiday treat that is a favorite for a lot of people and these features ridiculously easy to make and you don't have to bake them which is really nice for a change and it only takes a few ingredients. Okay, so we're gonna be making rum balls today and yes, I am using real rum for this and if you want to use the extract now I don't know how much you need. Basically what you'd have to do is just add some and you know, adjust it to taste. but the first thing to do is get your sweet condensed milk in a big bowl and combine it with the rum. I Always keep this stuff at room temperature because it's really hard to get out of the can if you have it in the refrigerator. so measure the rum quarter cup and mix that in. Just let's not sit, add the gram crumbs and the cocoa now. I Always use a spoon and measure the cocoa directly over the container to try to minimize the mess because powdered cocoa can make quite a mess. especial

In [41]:
def extract_questions_and_ocrs_per_video_id(grouped_data):
    """
    Extracts separate dictionaries for questions and OCRs per video ID from grouped data.

    Args:
    grouped_data (dict): A dictionary where keys are unique 'videoID' values and values are lists of JSON elements with the same 'videoID'.

    Returns:
    tuple: A tuple containing two dictionaries - one for questions and one for OCRs per video ID.
    """
    questions_per_video_id = {}
    ocrs_per_video_id = {}
    for video_id, elements in grouped_data.items():
        questions = [element["question"] for element in elements]
        ocrs = [element["ocr"] for element in elements]
        questions_per_video_id[video_id] = questions
        ocrs_per_video_id[video_id] = ocrs
    return questions_per_video_id, ocrs_per_video_id

In [42]:
# Extract questions and OCRs per video ID
questions, ocrs = extract_questions_and_ocrs_per_video_id(grouped_data)

print(list(questions.values())[0])
print(list(ocrs.values())[0])

['where is a list of the amounts of ingredients.  i would like to make them but i do not want to guess']
['300ml can sweetened condensed milk  400gm graham crumbs  1/4 cup dark rum  1/4 cup cocoa powder  200gm unsweetened  dessicated coconut']


## Step 2

1. Tokenize the text (both transcript and questions) into words or subwords using nltk
2. Remove stop words (both document and questions)
3. Perform stemming and lemmatization
4. Entity recognition or part-of-speech tagging

In [53]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/yogyach/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yogyach/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yogyach/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/yogyach/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [44]:
def tokenize_captions(concatenated_captions_per_video_id):
    """
    Tokenizes concatenated captions per video ID into words using NLTK.

    Args:
    concatenated_captions_per_video_id (dict): A dictionary where keys are unique 'videoID' values and values are concatenated captions.

    Returns:
    dict: A dictionary where keys are unique 'videoID' values and values are tokenized caption lists.
    """
    nltk.download('punkt')
    tokenized_captions_per_video_id = {}
    for video_id, concatenated_caption in concatenated_captions_per_video_id.items():
        tokenized_captions_per_video_id[video_id] = word_tokenize(concatenated_caption)
    return tokenized_captions_per_video_id

In [45]:
def tokenize_questions(questions_per_video_id):
    """
    Tokenizes questions per video ID into words using NLTK.

    Args:
    questions_per_video_id (dict): A dictionary where keys are unique 'videoID' values and values are lists of questions.

    Returns:
    dict: A dictionary where keys are unique 'videoID' values and values are lists of tokenized questions.
    """
    nltk.download('punkt')
    tokenized_questions_per_video_id = {}
    for video_id, questions in questions_per_video_id.items():
        tokenized_questions_per_video_id[video_id] = [word_tokenize(question) for question in questions]
    return tokenized_questions_per_video_id

In [46]:
# Tokenize concatenated captions per video ID
tokenized_captions = tokenize_captions(concatenated_captions)
print("Tokenized captions per video ID:", tokenized_captions)

# Tokenize questions per video ID
tokenized_questions = tokenize_questions(questions)
print("Tokenized questions per video ID:", tokenized_questions)

[nltk_data] Downloading package punkt to /Users/yogyach/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

[nltk_data] Downloading package punkt to /Users/yogyach/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
def remove_stopwords(tokens):
    """
    Removes stop words from a list of tokens using NLTK's list of stopwords.

    Args:
    tokens (list): A list of tokens.

    Returns:
    list: A list of tokens with stop words removed.
    """
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return filtered_tokens

In [49]:
filtered_captions = remove_stopwords(tokenized_captions)

In [50]:
def perform_stemming(tokens):
    """
    Performs stemming on a list of tokens using NLTK's Porter Stemmer.

    Args:
    tokens (list): A list of tokens.

    Returns:
    list: A list of stemmed tokens.
    """
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

def perform_lemmatization(tokens):
    """
    Performs lemmatization on a list of tokens using NLTK's WordNet Lemmatizer.

    Args:
    tokens (list): A list of tokens.

    Returns:
    list: A list of lemmatized tokens.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

def perform_pos_tagging(tokens):
    """
    Performs part-of-speech tagging on a list of tokens using NLTK's POS Tagger.

    Args:
    tokens (list): A list of tokens.

    Returns:
    list: A list of tuples where each tuple contains a token and its corresponding part-of-speech tag.
    """
    pos_tags = pos_tag(tokens)
    return pos_tags


In [55]:
# Perform stemming
stemmed_text = perform_stemming(filtered_captions)
print("Stemmed text:", stemmed_text)

# Perform lemmatization
lemmatized_text = perform_lemmatization(stemmed_text)
print("Lemmatized text:", lemmatized_text)

# Perform part-of-speech tagging
pos_tagged_text = perform_pos_tagging(lemmatized_text)
print("Part-of-speech tagged text:", pos_tagged_text)

Stemmed text: ['lsm1v844z1e', 'x0jofzoydcq', 'nefdqw1ewuw', '1s8x3hexio8', 'jfnxpmt6h_i', 'cocmvmr8zi', 'wkj569sjayg', 'wekw3n8vrv8', 'dcx6qmrw57o', 'bthaywxtbdi', 'mwrhhiaj7nc', 'gfejhb5lfd', 'bwmni3wcitg', '0aeyr1b6fgw', 'n7al_xbgv5', '0_ad-pbjqg', 'hybgtiglmlw', 'powymedenhk', 'ujzr2prnedw', 'sbrxq_bcdl', 'h0nn6pjv_fc', 'b5ubotkth6m', 'frhq8c6jwj0', 'aayu2znslb0', 'w69noyjyrr8', 'g3zeufrn-og', '91jrtuuvlq', 'm4ehjmjjun8', '3ah9upsy1iu', '9hjulkhseo4', 'uf-h0v2lyeu', '79bgzj26ozm', '-tsnxdsq15q', 'coolgdg5vbi', 'w8ywcm6eg9', 'logwmfxfxgk', '80oov72cwsm', 'bwzebtwksoa', 'sqz46tu8u4k', '11_az8dxzi', 'uq2pxclzrw', 'dggvp6vshzg', 'cgih0zhtoqg', '8teusxkaw9q', 'shxhrhtgscc', 'ibkpe0izhf', 'xgdbs3b3rnk', '3a70sgbigau', 'tkze8prkie0', 'ck5abatnwno', 'xi0dab8jdm', 'cfii5m3-at0', '022y0pfkjdg', 'ovn1qmzwlu', 'ftfxzsfzixq', 'nacfue7svsa', 'hhzm523ly6', 'eijbvpnbpvu', 'v_7qbuzfzim', 'iqrbnog0vai', 'hwniejmic9k', '1xyzl8ekh-w', 'jrd5xaqfgaa', '8qcu5uyk4oq', 'fzmq0lmik0g', 'tdxqplp4lbi', '3ac7zgj

## Step 4

1. Embedding Generation: Convert the preprocessed text and questions into vector embeddings. (Pre-trained LLMs, word2vec)
2. Question-Document Matching: Take the document, embeddings and create a vector space. Now, Use techniques like cosine similarity or other similarity metrics to match the question embeddings with the document embeddings.


In [56]:
from transformers import BertModel, BertTokenizer
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [57]:
# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'  # You can choose from various pre-trained BERT models
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

tokenizer_config.json: 100%|█████████████████| 28.0/28.0 [00:00<00:00, 28.7kB/s]
vocab.txt: 100%|█████████████████████████████| 232k/232k [00:00<00:00, 1.39MB/s]
tokenizer.json: 100%|████████████████████████| 466k/466k [00:00<00:00, 2.33MB/s]
config.json: 100%|█████████████████████████████| 570/570 [00:00<00:00, 1.63MB/s]
model.safetensors: 100%|█████████████████████| 440M/440M [00:22<00:00, 19.8MB/s]


In [59]:
def generating_embeddings(tokenizer, model, text, questions):
    text_tokens = tokenizer.encode(text, add_special_tokens=True)
    question_tokens = [tokenizer.encode(question, add_special_tokens=True) for question in questions]
    
    # Convert token IDs to tensors
    text_input_ids = torch.tensor(text_tokens).unsqueeze(0)  # Add batch dimension
    question_input_ids = [torch.tensor(question).unsqueeze(0) for question in question_tokens]  # Add batch dimension for each question
    
    # Generate embeddings for text
    with torch.no_grad():
        text_outputs = model(text_input_ids)
    text_embeddings = text_outputs[0]  # Extract hidden states
    
    # Generate embeddings for questions
    question_embeddings = []
    for question_input_id in question_input_ids:
        with torch.no_grad():
            question_outputs = model(question_input_id)
        question_embedding = question_outputs[0]  # Extract hidden states
        question_embeddings.append(question_embedding)
    return text_embeddings, question_embeddings

In [None]:
captions_embeddings = []
questions_embeddings = []
for i in len(concatenated_captions):
    text = list(concatenated_captions.values())[i]
    questions = list(questions.values())[i]
    text_embeddings, question_embeddings = generating_embeddings(tokenizer, model, text, questions)
    captions_embeddings.append()
    questions_embeddings.append()
    
    