In [None]:
!pip install chromadb -q
!pip install groq -q
!pip install langchain -q
!pip install langchain-groq -q
!pip install langchain_community -q
!pip install langchain_huggingface -q
!pip install langchain_google_genai -q
!pip install faiss-cpu
!pip install -qU langchain-google-genai

In [None]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import time
import shutil

In [None]:
import pandas as pd

# Location of the question/answer CSV (presumably on a mounted Google Drive in
# Colab -- verify the drive is mounted before running this cell).
QA_CSV_PATH = "/content/drive/MyDrive/Long Range Video Question Answering/Original Dataset/Questions_Answer.csv"

# Later cells read the columns 'bengali_text', 'question', 'option 0'..'option 4'
# and 'truth' from this frame.
df = pd.read_csv(QA_CSV_PATH)

# Preview the first rows as the cell output.
df.head()

In [None]:
## load the GOOGLE API KEY
import os
from getpass import getpass

# Never store a real key in the notebook: reuse GOOGLE_API_KEY when the
# environment already provides it, otherwise ask for it interactively
# (getpass does not echo the input). The previous unconditional assignment of
# "" clobbered any key that was already configured.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your GOOGLE_API_KEY: ")

# Chat model shared by every query in this notebook; temperature=0 is used so
# repeated runs are as repeatable as the API allows.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
)

# Prompt for the MCQ task. `{context}` receives the retrieved description
# chunks and `{input}` receives the question together with its options.
# NOTE(review): the second `<context>` below looks like it was meant to be a
# closing `</context>` tag, and "single-letter" refers to digits 0-4. The text
# is left unchanged so new runs stay comparable with the recorded accuracies.
prompt_2 = ChatPromptTemplate.from_template(
    """
    User:
    Act as an expert in Bengali MCQ Answer Generation.
    Please provide a single-letter answer (0, 1, 2, 3, 4) to the following multiple-choice question,
    and your answer must be one of the letters (0, 1, 2, 3, or 4). You must not provide any other
    response or explanation.
    You are given some language descriptions in Bengali of a first person view video. The video is 3 minute long.
    Each sentence describes a 1s clip. Here are the descriptions:
    <context>
    {context}
    <context>
    You are going to answer a multiple choice question based on only the descriptions.
    Here is the question:
    Question:{input}

    Assistant:

    """
)

In [None]:
def vector_embedding(content, chunk_size=1000, chunk_overlap=20):
    """Build a FAISS vector store from a single piece of text.

    The text is wrapped in one ``Document``, split into chunks and embedded
    with the Google Generative AI embedding model.

    Args:
        content: The text to index (a string).
        chunk_size: Maximum chunk size passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 1000, the value
            this notebook used before the parameter existed.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 20, the previous hard-coded value.

    Returns:
        The FAISS vector store built from the chunks.
    """
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    # The splitter works on Document objects, so wrap the raw string in a
    # one-element list of documents.
    docs = [Document(page_content=content)]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    final_documents = text_splitter.split_documents(docs)
    vectors = FAISS.from_documents(final_documents, embeddings)
    return vectors

def query_documents(question, vector_store):
    """Answer ``question`` using chunks retrieved from ``vector_store``.

    A stuff-documents chain (module-level ``llm`` + ``prompt_2``) is combined
    with the store's default retriever into a retrieval chain, which is then
    invoked with the question under the ``'input'`` key.

    Args:
        question: The question text (including its answer options).
        vector_store: A vector store exposing ``as_retriever()``.

    Returns:
        The retrieval chain's response. Callers in this notebook read its
        ``'answer'`` and ``'context'`` entries.
    """
    answer_chain = create_stuff_documents_chain(llm, prompt_2)
    rag_chain = create_retrieval_chain(vector_store.as_retriever(), answer_chain)
    return rag_chain.invoke({'input': question})

def delete_vector_store():
    """Remove the ``./faiss_index`` directory tree, reporting the outcome.

    Prints a success message when the directory was removed, a "not found"
    message when it does not exist, and the error text for any other failure.
    Nothing is raised to the caller and nothing is returned.

    NOTE(review): no visible cell saves a FAISS index to disk, so within this
    notebook the directory is presumably absent and the "not found" branch is
    the one that runs -- confirm whether this cleanup is still needed.
    """
    index_dir = "./faiss_index"
    try:
        shutil.rmtree(index_dir)
        print("Vector database deleted successfully.")
    except FileNotFoundError:
        print("Vector database not found.")
    except Exception as err:
        # Best-effort cleanup: report the problem instead of aborting the run.
        print(f"Error deleting vector database: {err}")

In [None]:
# Run the retrieval + LLM pipeline once per dataset row, collecting the
# predicted option and the time each row took.
pred = []
time_taken = []
for _, row in df.iterrows():
    # perf_counter() is a wall-clock timer. process_time() only counts CPU time
    # of this process, so it excluded the time spent waiting on the embedding
    # and LLM API calls that dominate the real response time.
    start = time.perf_counter()
    content = row['bengali_text']
    question = row['question']

    # Kept as a one-element list holding a tuple so that str(options) renders
    # in the prompt exactly as it did before.
    options = [(row['option 0'], row['option 1'], row['option 2'], row['option 3'], row['option 4'])]
    vector_store = vector_embedding(content)
    print("Vector Store DB Is Ready")

    prompt1 = question + " " + "Here are the options:" + "\n" + str(options)

    response = query_documents(prompt1, vector_store)
    print("Answer:", response['answer'])
    # Strip surrounding whitespace/newlines so exact-match scoring against the
    # ground truth is not defeated by formatting.
    pred.append(response['answer'].strip())
    time_ = time.perf_counter() - start
    print("Response time :", time_)
    time_taken.append(time_)
    print("\nDocument Similarity Search:")
    for doc in response["context"]:
        print(doc.page_content)
        print("--------------------------------")

    delete_vector_store() #delete the vector store after the response.
    # Pause between rows (presumably to stay within the API rate limit); this
    # wait is deliberately outside the timed section.
    time.sleep(60)

Prompt 1 - Acc. 50.0, time_taken - 0.0214
```python
['2',
 '1',
 '2',
 '4',
 '0',
 '2',
 '0',
 '0',
 '2',
 '0',
 '4',
 '3',
 '1',
 '0',
 '1',
 '4',
 '0',
 '0',
 '3',
 '0',
 '0',
 '0',
 '4',
 '4',
 '0',
 '4']
```

Prompt 2 - Acc. 57.69, time_taken - 0.0216
```python
pred =
['2',
 '1',
 '2',
 '4',
 '0',
 '0',
 '0',
 '4',
 '2',
 '0',
 '4',
 '3',
 '0',
 '0',
 '0',
 '4',
 '1',
 '0',
 '3',
 '0',
 '2',
 '0',
 '4',
 '4',
 '0',
 '4']
```

`chunk_size`=1000,`chunk_overlap=20`, acc = 0.6923076923076923, time_taken = 0.025107139884615555s

```python
pred=['2',
 '1',
 '2',
 '4',
 '0',
 '0',
 '0',
 '0',
 '3',
 '0',
 '4',
 '3',
 '0',
 '4',
 '0',
 '4',
 '1',
 '0',
 '3',
 '0',
 '2',
 '0',
 '4',
 '4',
 '1',
 '4']
```

acc = 0.7307692307692307, time_taken = 0.02216396149999996s

```python

pred =
['2',
 '1',
 '2',
 '4',
 '0',
 '0',
 '4',
 '0',
 '3',
 '0',
 '4',
 '3',
 '0',
 '4',
 '0',
 '4',
 '1',
 '0',
 '3',
 '0',
 '2',
 '0',
 '4',
 '4',
 '1',
 '4']
 ```
 `chunk_size`=1000, `chunk_overlap`=0, `acc`=0.5384615384615384, `time_taken`=0.02125196250000019

 ```python
 pred=
 ['2',
 '1',
 '2',
 '4',
 '0',
 '0',
 '0',
 '0',
 '2',
 '0',
 '4',
 '3',
 '0',
 '0',
 '0',
 '4',
 '1',
 '0',
 '3',
 '0',
 '2',
 '0',
 '4',
 '0',
 '0',
 '4']
 ```

  `chunk_size`=1000, `chunk_overlap`=100, `acc`=0.5769230769230769, `time_taken`=0.021861508538461336

 ```python
 pred=
 ['2',
 '1',
 '2',
 '4',
 '0',
 '0',
 '0',
 '2',
 '3',
 '0',
 '0',
 '3',
 '0',
 '0',
 '0',
 '4',
 '1',
 '0',
 '3',
 '0',
 '2',
 '0',
 '4',
 '0',
 '1',
 '4']
 ```
 `chunk_size`=1000, `chunk_overlap`=25, `acc`=0.6153846153846154, `time_taken`=0.025856383346153828s
 ```python
 pred =
 ['2',
 '1',
 '2',
 '4',
 '0',
 '2',
 '4',
 '0',
 '3',
 '0',
 '4',
 '3',
 '0',
 '0',
 '0',
 '4',
 '1',
 '0',
 '3',
 '0',
 '2',
 '0',
 '4',
 '0',
 '1',
 '4']
 ```

 `chunk_size`=1000, `chunk_overlap`=15, `acc`=0.6923076923076923, `time_taken`=0.02120989950000004s
 ```python
 pred =
 ['2',
 '1',
 '2',
 '4',
 '0',
 '0',
 '0',
 '0',
 '3',
 '0',
 '4',
 '3',
 '0',
 '4',
 '0',
 '4',
 '1',
 '0',
 '3',
 '0',
 '2',
 '0',
 '4',
 '4',
 '1',
 '4']
 ```
 `chunk_size`=1000, `chunk_overlap`=10, `acc`=0.5769230769230769, `time_taken`=0.021887588461538324s

 ```python
 pred =
 ['2',
 '1',
 '2',
 '4',
 '0',
 '0',
 '0',
 '0',
 '2',
 '0',
 '4',
 '3',
 '0',
 '0',
 '0',
 '4',
 '1',
 '0',
 '3',
 '0',
 '2',
 '0',
 '4',
 '4',
 '0',
 '4']
 ```

In [None]:
# Ground-truth labels as a plain list of strings, one per row. A one-column
# DataFrame (df[['truth']]) iterates over its column names, not its values,
# which breaks element-wise comparison with `pred`.
# Assumes the 'truth' column stores the option index as int or str, so that
# str() yields '0'..'4' like the model's answers -- TODO confirm the dtype.
truth = df['truth'].astype(str).str.strip().tolist()

In [None]:
# Accuracy = fraction of rows whose predicted option equals the ground truth.
# `truth` may be a one-column DataFrame, a Series or a list; wrapping it in a
# DataFrame and taking the first column yields the row values in every case
# (iterating a DataFrame directly would yield its column names instead).
truth_labels = [str(label).strip() for label in pd.DataFrame(truth).iloc[:, 0]]
# Compare as stripped strings so '2' matches 2 and stray whitespace is ignored.
pred_labels = [str(answer).strip() for answer in pred]
# zip() silently truncates, so make a partial run fail loudly instead of
# reporting a misleading score.
assert len(truth_labels) == len(pred_labels), "pred and truth have different lengths"
sum(t == p for t, p in zip(truth_labels, pred_labels)) / len(truth_labels)

In [None]:
# Mean time per question. Divide by the number of recorded timings rather than
# a hard-coded 26 so the value stays correct if the dataset size changes or the
# loop was only partially run; an empty list yields 0.0 as before.
sum(time_taken) / len(time_taken) if time_taken else 0.0