In [1]:
# !pip install -q torch torchvision torchaudio transformers datasets accelerate bitsandbytes langchain sentence-transformers faiss-gpu openpyxl pacmap ragatouille

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# First Part
import os
import json
import re
import pickle
import jsonlines
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Second Part
import torch
from tqdm import tqdm
from typing import Optional, List, Tuple
from langchain.vectorstores import FAISS
from ragatouille import RAGPretrainedModel
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain.docstore.document import Document as LangchainDocument

# Disable pandas' column-width truncation so long text cells are printed in full.
pd.set_option("display.max_colwidth", None)  # This will be helpful when visualizing retriever outputs

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Project root folder. Set the MNLP_PROJECT_PATH environment variable to point at your own copy;
# when it is unset (or empty) the original Colab Drive location is used.
path = os.environ.get("MNLP_PROJECT_PATH") or '/content/drive/MyDrive/mnlpredators-project/'
# Paths are built by plain string concatenation, so the root must end with a slash.
if not path.endswith('/'):
    path += '/'
full_preference_pairs_path = path + 'data/full_preference_pairs.json'

### Preference Pairs Dataset - Questions Extraction

In [5]:
# Read the preference-pairs file (a single JSON array of records) into a DataFrame.
full_preference_pairs = pd.read_json(
    full_preference_pairs_path,
    orient='records',
    lines=False,
)
# Show only the first row as a quick sanity check of the columns.
print(full_preference_pairs.head(1))

In [6]:
# print('Number of questions:', len(full_preference_pairs))

In [7]:
# print 3 full questions
# print(full_preference_pairs[['question_id','course_id','question_complete']].iloc[10])
# print(full_preference_pairs[['question_id','course_id','question_complete']].iloc[20])
# print(full_preference_pairs[['question_id','course_id','question_complete']].iloc[30])

### Initialization of Reranker and Embedding Models

In [8]:
# Embedding model (checkpoint name only; the model object is built in a later cell)
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

# Reranker model, loaded from its pretrained checkpoint
RERANKER_MODEL_NAME = "colbert-ir/colbertv2.0"
RERANKER = RAGPretrainedModel.from_pretrained(RERANKER_MODEL_NAME)



[May 22, 14:53:08] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




In [10]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU so the notebook
# still runs on machines without CUDA.
EMBEDDING_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): multi_process=True is kept from the original cell; it presumably adds worker-pool
# overhead when only single queries are embedded -- confirm whether it is still wanted here.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": EMBEDDING_DEVICE},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)



### Loading of the Embedding Vector Database

In [None]:
# Load the prebuilt FAISS index from the "faiss_index_8_keywords" folder (path is relative to the
# current working directory), passing embedding_model as the store's embedding function.
# NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based loading of the saved
# index; only load index folders you created yourself or otherwise trust.
KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local("faiss_index_8_keywords", embedding_model, allow_dangerous_deserialization=True)

### Getting the Most Relevant Document 

In [None]:
def get_most_relevant_document(
    question: str,
    knowledge_index: "FAISS",
    reranker: Optional["RAGPretrainedModel"] = None,
    num_retrieved_docs: int = 15,
    num_docs_final: int = 5,
) -> Tuple[List[str], str]:
    """Retrieve passages for ``question`` and format them as a prompt context.

    Args:
        question: Query text passed to the retriever (and to the reranker, if any).
        knowledge_index: Vector store exposing ``similarity_search(query=..., k=...)``
            whose results carry a ``page_content`` attribute.
        reranker: Optional reranker exposing ``rerank(question, docs, k=...)`` that
            returns items with a ``"content"`` key. When given, it keeps the top
            ``num_docs_final`` passages.
        num_retrieved_docs: Number of candidates requested from the retriever.
        num_docs_final: Maximum number of passages kept in the result.

    Returns:
        A ``(relevant_docs, context)`` pair: the list of selected passage texts and
        a single string that starts with an "Extracted documents" header followed
        by one "Document i:::" section per passage.

    Note:
        If more than ``num_docs_final`` passages remain after the optional rerank
        step, a random subset of that size is drawn with ``random.sample``, so the
        result is non-deterministic unless the ``random`` module is seeded.
    """
    # Gather candidate passages with the retriever and keep only their text.
    retrieved = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in retrieved]

    # Optionally rerank. Skipped when nothing was retrieved so the reranker is never
    # called with an empty candidate list.
    if reranker and relevant_docs:
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in reranked]

    # Still too many passages: keep a random subset rather than the top ones, so that
    # repeated calls can surface different documents.
    if len(relevant_docs) > num_docs_final:
        relevant_docs = random.sample(relevant_docs, num_docs_final)

    # Build the context block that gets embedded in the final prompt.
    context = "\nExtracted documents:\n"
    context += "".join(f"\nDocument {i}:::\n{doc}" for i, doc in enumerate(relevant_docs))

    return relevant_docs, context

#### Testing the Relevance of the Document on a Simple Question

In [None]:
# Smoke test: retrieve and rerank passages for one hand-written question, then print the context.
question = (
    "What is a good distance metric to be used when you want to compute "
    "the similarity between documents independent of their length?"
)
relevant_docs, context = get_most_relevant_document(
    question=question,
    knowledge_index=KNOWLEDGE_VECTOR_DATABASE,
    reranker=RERANKER,
)
print("The context is:", context)

### Generation of an Answer with GPT3.5

In [None]:
# Chat client used below to query the model; the module-level api_base/api_key are set in the next cell.
import gpt_wrapper
from gpt_wrapper.chat import Chat
from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

False

In [None]:
# Endpoint and credentials are read from the environment (a .env file is picked up by the
# load_dotenv() call above) so that no secret is stored in the notebook.
# The key that used to be hard-coded here should be treated as leaked and rotated.
gpt_wrapper.api_base = os.environ.get(
    "GPT_WRAPPER_API_BASE",
    "http://mnlp-backend-938795011.eu-central-1.elb.amazonaws.com",
)
gpt_wrapper.api_key = os.environ.get("GPT_WRAPPER_API_KEY")
if not gpt_wrapper.api_key:
    raise RuntimeError(
        "GPT_WRAPPER_API_KEY is not set; export it or add it to a .env file before running this cell."
    )

# Sampling parameters forwarded with every chat request.
model_args = {
    "temperature": 0.7,
    "top_p": 0.7,
    "presence_penalty": 0.0,
    "frequency_penalty": 0.0,
    "max_new_tokens": 1024,
}

In [None]:
def initial_prompt(question, context):
    """Build the prompt that asks the model to answer ``question`` given ``context``.

    Both values are inserted between double quotes. The returned text has four
    lines; every line after the first starts with eight spaces.
    """
    indent = " " * 8
    lines = [
        f'Answer the following question: "{question}".',
        # The trailing space after the period is part of the prompt text.
        f'{indent}Use the following context if you deem necessary: "{context}". ',
        f"{indent}If the question has options, specify the ID of the correct answer (A, B, C or D).",
        f"{indent}Think step by step and explain your reasoning",
    ]
    return "\n".join(lines)

In [None]:
def generate_predictions_zero_shot(
    questions,
    model_args,
    output_path="data_wikipedia/rag_dataset_gpt3.5.jsonl",
    verbose=True,
):
    """Answer every question with retrieved context and log the answers to a JSONL file.

    For each question a new chat is created, passages are retrieved from
    ``KNOWLEDGE_VECTOR_DATABASE`` (without reranking), the prompt is built with
    ``initial_prompt`` and sent to the chat, and one record is written to
    ``output_path``.

    Args:
        questions: Iterable of dicts with the keys ``'question_complete'``,
            ``'course_id'`` and ``'question_id'``.
        model_args: Generation parameters forwarded to ``chat.ask``.
        output_path: JSONL file to write. It is opened in ``"w"`` mode, so an
            existing file is overwritten.
        verbose: When true, print each prompt and each answer.

    Returns:
        The list of answers in question order; an empty answer is reported as
        ``"none"`` in this list (the file keeps the raw, possibly empty, text).
    """
    predictions = []
    instruction = "You are a helpful educational AI bot that answers questions for a student. Keep your response truthful and concise"

    # Make sure the target folder exists before the file is opened for writing.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with jsonlines.open(output_path, mode="w") as writer:
        for question_dict in tqdm(questions):
            question = question_dict['question_complete']  # Extract question text

            # The random number is only used as the chat name and stored with the record.
            chat_id = random.randrange(0, 2**16)
            chat = Chat.create(name=f"{chat_id}")

            # No reranker, so the random sampling can yield different documents per call.
            _, context = get_most_relevant_document(question, KNOWLEDGE_VECTOR_DATABASE, reranker=None)
            prompt = initial_prompt(question, context)
            if verbose:
                print("The final prompt is:\n", prompt)

            message = chat.ask(prompt, instruction=instruction, model_args=model_args)

            # Guard against a missing content field so one bad reply does not abort the run.
            answer = (message.content or "").strip()
            if verbose:
                print("Predicted answer:", answer)
            predictions.append(answer or "none")

            writer.write({"course_id": question_dict['course_id'], "question_id": question_dict['question_id'],
                          "question_body": question, "answer": answer, "chat_id": chat_id})

    return predictions


### Final Generation - Full Dataset of 1522 Questions (~2h30-3h00)

In [None]:
# One dict per DataFrame row, keyed by column name.
questions = full_preference_pairs.to_dict(orient='records')
# The trailing semicolon keeps the notebook from echoing the call's return value.
generate_predictions_zero_shot(questions, model_args);

  0%|          | 0/1522 [00:00<?, ?it/s]

The final prompt is:
 Answer the following question: "Question: Consider the following contains function defined on Iterable (in particular, it accepts both Vector and List).  def contains[A](l: Iterable[A], elem: A): Boolean =   val n = l.size   if n <= 5 then     for i <- l do       if i == elem then         return true     false   else     val (p0, p1) = parallel(       contains(l.take(n / 2), elem),       contains(l.drop(n / 2), elem)     )   p0 || p1 Let $n$$n$ be the size of l. Assume that drop and take run in $\Theta(1)$ on Vector and $\Theta(n)$ on List. What is the asymptotic depth of contains if it is called on a List?".
        Use the following context if you deem necessary: "
Extracted documents:

Document 0:::
Since  does not contain the infinity at ∞, the construction can equally be applied to  taking  with horizontal slits removed to give a uniformizer . The uniformizer  now takes  to  with parallel slits removed at an angle of  to the -axis. In particular  =  leads to 

  0%|          | 1/1522 [00:06<2:35:34,  6.14s/it]

Predicted answer: The asymptotic depth of the contains function when called on a List is log(n), where n is the size of the List. This is because the function recursively splits the List in half until the size of the List is less than or equal to 5, resulting in a binary tree structure with a depth of log(n).
The final prompt is:
 Answer the following question: "Question: What is the asymptotic work of <code>parGroupyBy2</code>?".
        Use the following context if you deem necessary: "
Extracted documents:

Document 0:::
In mathematics, specifically group theory, a descendant tree is a hierarchical structure that visualizes parent-descendant relations between isomorphism classes of finite groups of prime power order , for a fixed prime number  and varying integer exponents . Such groups are briefly called finite p-groups. The vertices of a descendant tree are isomorphism classes of finite p-groups.

Additionally to their order , finite p-groups have two further related invariants, t

  0%|          | 2/1522 [00:11<2:23:39,  5.67s/it]

Predicted answer: The asymptotic work of `parGroupBy2` is not provided in the extracted documents.
The final prompt is:
 Answer the following question: "Question: We have a collection of rectangles in a plane, whose sides are aligned with the coordinate axes. Each rectangle is represented by its lower left corner $(x_1,y_1)$ and its upper right corner $(x_2,y_2)$. All coordinates are of type Long. We require $x_1 \le x_2$ and $y_1 \le y_2$. Define a case class Rectangle storing two corners. ".
        Use the following context if you deem necessary: "
Extracted documents:

Document 0:::
To describe a finite projective plane of order N(≥ 2) using non-homogeneous coordinates and a planar ternary ring:
Let one point be labelled (∞).
Label N points, (r) where r = 0, ..., (N − 1).
Label N2 points, (r, c) where r, c = 0, ..., (N − 1).
On these points, construct the following lines:
One line [∞] = { (∞), (0), ..., (N − 1)}
N lines [c] = {(∞), (c,0), ..., (c, N − 1)}, where c = 0, ..., (N − 1)

  0%|          | 3/1522 [00:19<2:53:31,  6.85s/it]

Predicted answer: The question asks to define a case class Rectangle storing two corners $(x_1,y_1)$ and $(x_2,y_2)$. The case class Rectangle can be defined in Scala as follows:

```scala
case class Rectangle(x1: Long, y1: Long, x2: Long, y2: Long)
```

This case class represents a rectangle with its lower left corner at coordinates $(x_1, y_1)$ and its upper right corner at coordinates $(x_2, y_2)$.
The final prompt is:
 Answer the following question: "Question: Which of the following scheduler policies are preemptive?

Options:
A. FIFO (First In, First Out)
B. SJF (Shortest Job First)
C. STCF (Shortest Time to Completion First)
D. RR (Round Robin)".
        Use the following context if you deem necessary: "
Extracted documents:

Document 0:::
A* uses this heuristic to improve on the behavior relative to Dijkstra's algorithm. When the heuristic evaluates to zero, A* is equivalent to Dijkstra's algorithm. As the heuristic estimate increases and gets closer to the true distance, A* con

  0%|          | 4/1522 [00:26<2:47:38,  6.63s/it]

Predicted answer: The preemptive scheduler policies among the options provided are:
C. STCF (Shortest Time to Completion First)
D. RR (Round Robin)
The final prompt is:
 Answer the following question: "Question: In this week's lecture, you have been introduced to the aggregate method of ParSeq[A] (and other parallel data structures). It has the following signature:  def aggregate[B](z: B)(f: (B, A) => B, g: (B, B) => B): B Discuss, as a group, what aggregate does and what its arguments represent. Consider the parallel sequence xs containing the three elements x1, x2 and x3. Also consider the following call to aggregate:  xs.aggregate(z)(f, g) The above call might potentially result in the following computation:  f(f(f(z, x1), x2), x3) But it might also result in other computations. Come up with at least two other computations in terms of f and g that may result from the above call to aggregate.  Below are other examples of calls to aggregate. In each case, check if the call can lead to

  0%|          | 5/1522 [00:31<2:41:08,  6.37s/it]

Predicted answer: I'm sorry, but the provided question is related to computer science and requires a detailed understanding of parallel data structures and computations. If you have any specific queries or need clarification on any concept, feel free to ask.
The final prompt is:
 Answer the following question: "Question: Suppose we run JOS and set a breakpoint at syscall (in lib/syscall.c). What are the Current Privilege Level (CPL) before invoking the syscall function and after executing the int 0x30 instruction?

Options:
A. 0 3
B. 0 0
C. 3 0
D. 3 3".
        Use the following context if you deem necessary: "
Extracted documents:

Document 0:::
The German standard DIN 66303 is a character set standard, which is used for character encoding in computer systems. The standard DIN 66303 bears the title "Information Technology: 8-Bit-Code" and was established in November 1986 (DIN 66303:1986-11). The most recent edition is from June 2000 (DIN 66303:2000-06).

The character set of the 2000 

  0%|          | 6/1522 [00:38<2:38:57,  6.29s/it]

Predicted answer: The Current Privilege Level (CPL) before invoking the syscall function is 3, and after executing the int 0x30 instruction, the CPL is 0. Therefore, the correct option is C. 3 0.


  0%|          | 6/1522 [00:38<2:41:04,  6.38s/it]


KeyboardInterrupt: 