#### This notebook is to test
1. If we give an existing pretrained model like BERT, the captions and the questions, would it answer the questions comparable to the results in the paper?
2. Is there any difference if we find the closest section of transcript and give that to the model?

## Step 1
- Load the DataSet
- Organise Text Using Time Stamps
- Extract Question
- Extract Answer
- Extract Trancript

In [59]:
# loading the json data

import json

file_path = "../dataset.json"

# Open the file in read mode
with open(file_path, 'r') as json_file:
    # Load JSON data from the file
    data = json.load(json_file)

# Now 'data' contains the JSON data from the file

In [60]:
# Data Size
print(len(data))

2332


In [61]:
def group_elements_by_video_id(json_data):
    """
    Groups elements in JSON data based on the 'videoID' field.

    Args:
    json_data (dict): A dictionary containing JSON data where each key represents a JSON element.
                      Each JSON element should contain a 'videoID' key.

    Returns:
    dict: A dictionary where keys are unique 'videoID' values and values are lists of JSON elements with the same 'videoID'.
    """
    grouped_elements = {}
    for key, value in json_data.items():
        video_id = value.get("videoID")
        if video_id:
            if video_id not in grouped_elements:
                grouped_elements[video_id] = []
            grouped_elements[video_id].append(value)
    return grouped_elements

In [62]:
# Grouping data based on video id to see if tere are questions on same video
grouped_data = group_elements_by_video_id(data)

print(len(grouped_data))

2004


In [63]:
count = 0
for video_id, elements in grouped_data.items():
    if len(elements) > 1:
        count+=1
        #print(f"Video ID: {video_id}")
print(count)

211


In [64]:
def concat_captions_from_grouped_data(grouped_data):
    """
    Concatenates captions from the grouped data, taking one caption per group from the first element of each group.

    Args:
    grouped_data (dict): A dictionary where keys are unique 'videoID' values and values are lists of JSON elements with the same 'videoID'.

    Returns:
    dict: A dictionary where keys are unique 'videoID' values and values are concatenated captions from the first element of each group.
    """
    concatenated_captions = {}
    for video_id, elements in grouped_data.items():
        if elements:  # Ensure there are elements in the group
            first_element = elements[0]  # Get the first element of the group
            if first_element.get("caption"):
                caption_texts = [caption["text"] for caption in first_element["caption"]]
                concatenated_captions[video_id] = " ".join(caption_texts)
    return concatenated_captions

In [65]:
# Concatenate captions from the grouped data
concatenated_captions = concat_captions_from_grouped_data(grouped_data)

print(list(concatenated_captions.values())[0])

Hey Good morning Everybody today! I'm going to show you how to make a holiday treat that is a favorite for a lot of people and these features ridiculously easy to make and you don't have to bake them which is really nice for a change and it only takes a few ingredients. Okay, so we're gonna be making rum balls today and yes, I am using real rum for this and if you want to use the extract now I don't know how much you need. Basically what you'd have to do is just add some and you know, adjust it to taste. but the first thing to do is get your sweet condensed milk in a big bowl and combine it with the rum. I Always keep this stuff at room temperature because it's really hard to get out of the can if you have it in the refrigerator. so measure the rum quarter cup and mix that in. Just let's not sit, add the gram crumbs and the cocoa now. I Always use a spoon and measure the cocoa directly over the container to try to minimize the mess because powdered cocoa can make quite a mess. especial

In [66]:
def extract_questions_and_ocrs_per_video_id(grouped_data):
    """
    Extracts separate dictionaries for questions and OCRs per video ID from grouped data.

    Args:
    grouped_data (dict): A dictionary where keys are unique 'videoID' values and values are lists of JSON elements with the same 'videoID'.

    Returns:
    tuple: A tuple containing two dictionaries - one for questions and one for OCRs per video ID.
    """
    questions_per_video_id = {}
    ocrs_per_video_id = {}
    for video_id, elements in grouped_data.items():
        questions = [element["question"] for element in elements]
        ocrs = [element["ocr"] for element in elements]
        questions_per_video_id[video_id] = questions
        ocrs_per_video_id[video_id] = ocrs
    return questions_per_video_id, ocrs_per_video_id

In [105]:
# Extract questions and OCRs per video ID
questions, ocrs = extract_questions_and_ocrs_per_video_id(grouped_data)

print(list(questions.values())[0])
print(list(ocrs.values())[0])

['where is a list of the amounts of ingredients.  i would like to make them but i do not want to guess']
['300ml can sweetened condensed milk  400gm graham crumbs  1/4 cup dark rum  1/4 cup cocoa powder  200gm unsweetened  dessicated coconut']


## Question 1


In [119]:
from transformers import BertForQuestionAnswering, BertTokenizer

# Load pre-trained BERT model and tokenizer
model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'  # Pre-trained model fine-tuned on SQuAD
model = BertForQuestionAnswering.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

def answer_question(question, text):
    # Tokenize input question and text
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt", max_length=512, truncation=True)

    # Perform question answering
    start_scores, end_scores = model(**inputs,return_dict=False)

    # Get the most likely answer
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end+1]))
    return answer

# Example text and questions
transcript = list(concatenated_captions.values())[0]
queries = list(questions.values())[0]

# Answer each question based on the transcript
for query in queries:
    answer = answer_question(query, transcript)
    print(f"Question: {query}")
    print(f"Answer: {answer}\n")

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Question: where is a list of the amounts of ingredients.  i would like to make them but i do not want to guess
Answer: [CLS] where is a list of the amounts of ingredients . i would like to make them but i do not want to guess [SEP] hey good morning everybody today ! i ' m going to show you how to make a holiday treat that is a favorite for a lot of people and these features ridiculously easy to make and you don ' t have to bake them which is really nice for a change and it only takes a few ingredients . okay , so we ' re gonna be making rum balls today and yes , i am using real rum for this and if you want to use the extract now i don ' t know how much you need . basically what you ' d have to do is just add some and you know , adjust it to taste . but the first thing to do is get your sweet condensed milk in a big bowl and combine it with the rum . i always keep this stuff at room temperature because it ' s really hard to get out of the can if you have it in the refrigerator . so meas

## Step 2

1. Tokenize the text (both transcript and questions) into words or subwords using nltk
2. Remove stop words (both document and questions)
3. Perform stemming and lemmatization
4. Entity recognition or part-of-speech tagging

In [53]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/yogyach/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yogyach/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yogyach/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/yogyach/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [44]:
def tokenize_captions(concatenated_captions_per_video_id):
    """
    Tokenizes concatenated captions per video ID into words using NLTK.

    Args:
    concatenated_captions_per_video_id (dict): A dictionary where keys are unique 'videoID' values and values are concatenated captions.

    Returns:
    dict: A dictionary where keys are unique 'videoID' values and values are tokenized caption lists.
    """
    nltk.download('punkt')
    tokenized_captions_per_video_id = {}
    for video_id, concatenated_caption in concatenated_captions_per_video_id.items():
        tokenized_captions_per_video_id[video_id] = word_tokenize(concatenated_caption)
    return tokenized_captions_per_video_id

In [45]:
def tokenize_questions(questions_per_video_id):
    """
    Tokenizes questions per video ID into words using NLTK.

    Args:
    questions_per_video_id (dict): A dictionary where keys are unique 'videoID' values and values are lists of questions.

    Returns:
    dict: A dictionary where keys are unique 'videoID' values and values are lists of tokenized questions.
    """
    nltk.download('punkt')
    tokenized_questions_per_video_id = {}
    for video_id, questions in questions_per_video_id.items():
        tokenized_questions_per_video_id[video_id] = [word_tokenize(question) for question in questions]
    return tokenized_questions_per_video_id

In [46]:
# Tokenize concatenated captions per video ID
tokenized_captions = tokenize_captions(concatenated_captions)
print("Tokenized captions per video ID:", tokenized_captions)

# Tokenize questions per video ID
tokenized_questions = tokenize_questions(questions)
print("Tokenized questions per video ID:", tokenized_questions)

[nltk_data] Downloading package punkt to /Users/yogyach/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

[nltk_data] Downloading package punkt to /Users/yogyach/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
def remove_stopwords(tokens):
    """
    Removes stop words from a list of tokens using NLTK's list of stopwords.

    Args:
    tokens (list): A list of tokens.

    Returns:
    list: A list of tokens with stop words removed.
    """
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return filtered_tokens

In [49]:
filtered_captions = remove_stopwords(tokenized_captions)

In [50]:
def perform_stemming(tokens):
    """
    Performs stemming on a list of tokens using NLTK's Porter Stemmer.

    Args:
    tokens (list): A list of tokens.

    Returns:
    list: A list of stemmed tokens.
    """
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

def perform_lemmatization(tokens):
    """
    Performs lemmatization on a list of tokens using NLTK's WordNet Lemmatizer.

    Args:
    tokens (list): A list of tokens.

    Returns:
    list: A list of lemmatized tokens.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

def perform_pos_tagging(tokens):
    """
    Performs part-of-speech tagging on a list of tokens using NLTK's POS Tagger.

    Args:
    tokens (list): A list of tokens.

    Returns:
    list: A list of tuples where each tuple contains a token and its corresponding part-of-speech tag.
    """
    pos_tags = pos_tag(tokens)
    return pos_tags


In [55]:
# Perform stemming
stemmed_text = perform_stemming(filtered_captions)
print("Stemmed text:", stemmed_text)

# Perform lemmatization
lemmatized_text = perform_lemmatization(stemmed_text)
print("Lemmatized text:", lemmatized_text)

# Perform part-of-speech tagging
pos_tagged_text = perform_pos_tagging(lemmatized_text)
print("Part-of-speech tagged text:", pos_tagged_text)

Stemmed text: ['lsm1v844z1e', 'x0jofzoydcq', 'nefdqw1ewuw', '1s8x3hexio8', 'jfnxpmt6h_i', 'cocmvmr8zi', 'wkj569sjayg', 'wekw3n8vrv8', 'dcx6qmrw57o', 'bthaywxtbdi', 'mwrhhiaj7nc', 'gfejhb5lfd', 'bwmni3wcitg', '0aeyr1b6fgw', 'n7al_xbgv5', '0_ad-pbjqg', 'hybgtiglmlw', 'powymedenhk', 'ujzr2prnedw', 'sbrxq_bcdl', 'h0nn6pjv_fc', 'b5ubotkth6m', 'frhq8c6jwj0', 'aayu2znslb0', 'w69noyjyrr8', 'g3zeufrn-og', '91jrtuuvlq', 'm4ehjmjjun8', '3ah9upsy1iu', '9hjulkhseo4', 'uf-h0v2lyeu', '79bgzj26ozm', '-tsnxdsq15q', 'coolgdg5vbi', 'w8ywcm6eg9', 'logwmfxfxgk', '80oov72cwsm', 'bwzebtwksoa', 'sqz46tu8u4k', '11_az8dxzi', 'uq2pxclzrw', 'dggvp6vshzg', 'cgih0zhtoqg', '8teusxkaw9q', 'shxhrhtgscc', 'ibkpe0izhf', 'xgdbs3b3rnk', '3a70sgbigau', 'tkze8prkie0', 'ck5abatnwno', 'xi0dab8jdm', 'cfii5m3-at0', '022y0pfkjdg', 'ovn1qmzwlu', 'ftfxzsfzixq', 'nacfue7svsa', 'hhzm523ly6', 'eijbvpnbpvu', 'v_7qbuzfzim', 'iqrbnog0vai', 'hwniejmic9k', '1xyzl8ekh-w', 'jrd5xaqfgaa', '8qcu5uyk4oq', 'fzmq0lmik0g', 'tdxqplp4lbi', '3ac7zgj

## Step 3

1. Embedding Generation: Convert the preprocessed text and questions into vector embeddings. (Pre-trained LLMs, word2vec)
2. Question-Document Matching: Take the document, embeddings and create a vector space. Now, Use techniques like cosine similarity or other similarity metrics to match the question embeddings with the document embeddings.


In [15]:
from transformers import BertModel, BertTokenizer
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'  # You can choose from various pre-trained BERT models
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

tokenizer_config.json: 100%|█████████████████| 48.0/48.0 [00:00<00:00, 31.7kB/s]


In [71]:
def generating_embeddings(tokenizer, model, text, questions=None):
    """
    Generates embeddings for a given text using a pre-trained BERT model.

    Args:
    tokenizer (PreTrainedTokenizer): Pre-trained BERT tokenizer.
    model (PreTrainedModel): Pre-trained BERT model.
    text (str): Input text for which embeddings are to be generated.
    questions (list): List of question strings.

    Returns:
    torch.Tensor: Embedding vector representing the input text.
    """
    # Tokenize the input text and questions
    if questions is None:
        inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    else:
        inputs = tokenizer(text, questions, padding=True, truncation=True, return_tensors="pt")

    # Forward pass through the model to get the embeddings
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the embeddings from the model outputs
    # For simplicity, we'll just average the embeddings of all tokens
    embeddings = torch.mean(outputs.last_hidden_state, dim=1)
    
    return embeddings

In [58]:
# Initialize dictionaries to store embeddings
text_embeddings_dict = {}
question_embeddings_dict = {}

# Assuming concatenated_captions and questions have the same keys
for video_id in concatenated_captions.keys():
    text = concatenated_captions[video_id]  # Get the concatenated captions for the current video ID
    question_list = questions[video_id]     # Get the list of questions for the current video ID

    # Generate embeddings for the text
    text_embedding = generating_embeddings(tokenizer, model, text, None)
    text_embeddings_dict[video_id] = text_embedding.cpu().numpy().tolist()  # Convert tensor to numpy array, then to list
    
    # Generate embeddings for each question
    question_embeddings = []
    for question in question_list:
        question_embedding = generating_embeddings(tokenizer, model, question, None)
        question_embeddings.append(question_embedding.cpu().numpy().tolist())  # Convert tensor to numpy array, then to list
    question_embeddings_dict[video_id] = question_embeddings


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [72]:
"""# Write text embeddings to a JSON file
with open('text_embeddings_base_uncased.json', 'w') as text_file:
    json.dump(text_embeddings_dict, text_file)

# Write question embeddings to a JSON file
with open('question_embeddings_base_uncased.json', 'w') as question_file:
    json.dump(question_embeddings_dict, question_file)"""

In [87]:
# Load text embeddings
with open('text_embeddings_base_uncased.json', 'r') as f:
    text_embeddings = json.load(f)

# Load question embeddings
with open('question_embeddings_base_uncased.json', 'r') as f:
    question_embeddings = json.load(f)

In [101]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Compute cosine similarities
for video_id, text_embedding in text_embeddings.items():
    if video_id in question_embeddings:
        question_similarities = []
        for question_embedding in question_embeddings[video_id]:
            text_embedding_np = np.array(text_embedding[0])
            question_embedding_np = np.array(question_embedding[0])
            similarity = cosine_similarity([text_embedding_np], [question_embedding_np])[0][0]
            question_similarities.append(similarity)
        cosine_similarities_dict[video_id] = question_similarities


# Print or use cosine_similarities_dict as needed
print(cosine_similarities_dict)

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [None]:
import torch.nn.functional as F

def cosine_similarity(embedding1, embedding2):
    """
    Computes the cosine similarity between two embeddings.

    Args:
    embedding1 (torch.Tensor): Embedding tensor 1.
    embedding2 (torch.Tensor): Embedding tensor 2.

    Returns:
    float: Cosine similarity between the two embeddings.
    """
    return F.cosine_similarity(embedding1, embedding2).item()

def l2_norm(embedding1, embedding2):
    """
    Computes the L2 norm (Euclidean distance) between two embeddings.

    Args:
    embedding1 (torch.Tensor): Embedding tensor 1.
    embedding2 (torch.Tensor): Embedding tensor 2.

    Returns:
    float: L2 norm (Euclidean distance) between the two embeddings.
    """
    return torch.dist(embedding1, embedding2).item()

# Example usage:
# Assuming you have obtained embeddings for both questions and the document
question_embedding = torch.randn(768)  # Example question embedding (768-dimensional)
document_embedding = torch.randn(768)  # Example document embedding (768-dimensional)

# Compute cosine similarity
cos_sim = cosine_similarity(question_embedding, document_embedding)
print("Cosine similarity:", cos_sim)

# Compute L2 norm
l2_distance = l2_norm(question_embedding, document_embedding)
print("L2 norm (Euclidean distance):", l2_distance)

In [None]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Load pre-trained BERT model and tokenizer for QA
qa_model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'  # Example QA model
qa_tokenizer = BertTokenizer.from_pretrained(qa_model_name)
qa_model = BertForQuestionAnswering.from_pretrained(qa_model_name)

# Function to extract answers from document sections
def extract_answers(document_sections, questions):
    answers = []
    for i, (section_embedding, question_embedding) in enumerate(zip(document_sections, question_embeddings)):
        # Convert embeddings to numpy arrays
        section_embedding_np = section_embedding.detach().numpy()
        question_embedding_np = question_embedding.detach().numpy()
        
        # Compute cosine similarity between section embedding and question embedding
        cos_sim = cosine_similarity(section_embedding_np.reshape(1, -1), question_embedding_np.reshape(1, -1))[0][0]
        
        # Alternatively, you can compute L2 norm (Euclidean distance) between embeddings
        # l2_distance = np.linalg.norm(section_embedding_np - question_embedding_np)
        
        # If cosine similarity is above a threshold, consider the section relevant
        if cos_sim > 0.5:  # Adjust the threshold as needed
            # Perform Question Answering on the relevant section
            input_ids = qa_tokenizer.encode(qa_tokenizer.cls_token + qa_tokenizer.sep_token + document_sections[i] + qa_tokenizer.sep_token + questions[i], return_tensors='pt')
            with torch.no_grad():
                start_logits, end_logits = qa_model(input_ids)
            
            # Get the most probable answer span
            answer_start = torch.argmax(start_logits)
            answer_end = torch.argmax(end_logits) + 1
            
            # Convert answer tokens to string
            answer = qa_tokenizer.convert_tokens_to_string(qa_tokenizer.convert_ids_to_tokens(input_ids[0][answer_start:answer_end]))
            answers.append(answer)
        else:
            answers.append(None)
    return answers

# Example usage:
# Assuming you have document_sections (list of document captions) and questions (list of questions)
document_sections = ["This is the first section of the document.",
                     "Here is the second section of the document.",
                     "And finally, the third section of the document."]
questions = ["What is the document about?",
             "Can you summarize the second section?",
             "What is mentioned in the third section?"]

# Generate embeddings for document sections and questions (you need to have them already)
document_embeddings = [torch.randn(768) for _ in document_sections]  # Example document embeddings
question_embeddings = [torch.randn(768) for _ in questions]  # Example question embeddings

# Extract answers from relevant sections
answers = extract_answers(document_embeddings, question_embeddings)
print("Extracted answers:", answers)

### Step 1.5
Understand the catergorical nature of the data.

(0) Unanswerable, 
(1) Answerable with visuals
(2) Answerable with the script
(3) Answerable only when both script and visuals are provided.

In [68]:
def answerability_stats(data):
    # Initialize a dictionary to store the count of elements for each unique answerability number
    answerability_count = {}
    
    # Iterate through the data
    for key, value in data.items():
        if "answerability" in value:
            answerability_list = value["answerability"]  # Get the list of answerability numbers
            for answerability in answerability_list:
                if answerability in answerability_count:
                    answerability_count[answerability] += 1
                else:
                    answerability_count[answerability] = 1
    return answerability_count

In [69]:
answerability_count = answerability_stats(data)

In [70]:
print(answerability_count)

{1: 1427, 2: 1155, 0: 326, 3: 174}


# Random

In [17]:
def concatenate_caption_texts(json_data):
    """
    Concatenates the text from the 'caption' list for each JSON element.

    Args:
    json_data (dict): A dictionary containing JSON data where each key represents a JSON element.
                      Each JSON element may contain a 'caption' list with text to be concatenated.

    Returns:
    list: A list containing concatenated text strings from the 'caption' lists of each JSON element.
          If a JSON element does not have a 'caption' list or if it is empty, its corresponding text is not included.
    """
    concatenated_texts = []
    for key, value in json_data.items():
        if value.get("caption"):
            caption_texts = [caption["text"] for caption in value["caption"]]
            concatenated_texts.append(" ".join(caption_texts))
    return concatenated_texts

In [18]:
# Extracting transcripts per video
concatenated_texts = concatenate_caption_texts(data)

In [19]:
# Print concatenated texts for testing
# print(concatenated_texts[0])

In [8]:
# Function to extract questions
def extract_questions(json_data):
    """
    Extracts a list of questions from JSON data without iteration.

    Args:
    json_data (dict): A dictionary containing JSON data where each key represents a JSON element.
                      Each JSON element should contain a 'question' key with a question.

    Returns:
    list: A list of questions extracted from the JSON data.
    """
    return [value["question"] for value in json_data.values()]


In [9]:
questions = extract_questions(data)
print("List of questions:", questions)

