## Imports

In [None]:
!pip install jsonlines
import json
import tqdm
import jsonlines
# Toggle for the execution environment: True on Google Colab, False for a
# local run. Everything Colab-specific lives inside the `if` so that a local
# run never touches `google.colab`.
COLAB_RUN = True
if COLAB_RUN:
  from google.colab import drive
  # Mount Google Drive; the paths below are resolved relative to /content.
  drive.mount('/content/gdrive')
  base_path = "./gdrive/MyDrive/ChatGPT-RetrievalQA-private/"
  prompts_base_path = "./gdrive/MyDrive/ChatGPT-RetrievalQA-private/prompts/"
  dataset_path = "./gdrive/MyDrive/ChatGPT-RetrievalQA/"
else:
  base_path = "./"
  # Local defaults so the same three names exist in both branches.
  # NOTE(review): these mirror the Drive layout relative to the working
  # directory; adjust them to where prompts and dataset actually live.
  prompts_base_path = "./prompts/"
  dataset_path = "./"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jsonlines
  Downloading jsonlines-3.1.0-py3-none-any.whl (8.6 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.1.0
Mounted at /content/gdrive


## Utils


### read collections

In [None]:
def read_collection(f_path):
  """Load a tab-separated collection file into a dictionary.

  Every non-blank line is expected to have the form <id><TAB><text>. Only the
  first tab separates the id from the text, so the text itself may contain
  tabs. Blank lines are skipped, and a row whose text is empty is kept with an
  empty string as its value.

  Args:
    f_path: path of the file to read.

  Returns:
    dict mapping each id (str) to its text (str). When an id appears more
    than once, the last occurrence wins.

  Raises:
    ValueError: if a non-blank line contains no tab at all.
  """
  corpus = {}
  with open(f_path, "r") as fp:
    progress = tqdm.tqdm(fp, desc="reading {}".format(f_path))
    for line_no, line in enumerate(progress, start=1):
      # Remove only the line terminator here: stripping all whitespace first
      # would also eat the separator of a row whose text is empty.
      record = line.rstrip("\r\n")
      if not record.strip():
        continue
      did, sep, dtext = record.partition("\t")
      if not sep:
        raise ValueError(
          "{}:{}: expected '<id>\\t<text>', got {!r}".format(f_path, line_no, record))
      # Same outer-whitespace trimming the previous `line.strip()` performed.
      corpus[did.lstrip()] = dtext.rstrip()
  return corpus

### normalize content function


In [None]:
def normalize_content(content):
  """Flatten text onto a single line.

  Carriage returns and newlines are removed outright (no space is inserted in
  their place); every tab becomes one space.
  """
  flattened = content
  for old, new in (('\r', ''), ('\n', ''), ('\t', ' ')):
    flattened = flattened.replace(old, new)
  return flattened

### write to file function

In [None]:
def write_to_file(output_path, content):
  """Write `content` to `output_path`, replacing any existing file.

  Args:
    output_path: destination file path; the file is created or truncated.
    content: a string, or an iterable of strings that are concatenated
      without any separator before being written.
  """
  # The context manager closes the handle even if join() or write() raises.
  with open(output_path, "w+") as f_w:
    f_w.write("".join(content))

## Load model

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [None]:
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
# Point the Hugging Face client at the public hub.
# NOTE(review): this is assigned after `transformers` has been imported;
# confirm the library still reads the variable at this point.
os.environ["HF_ENDPOINT"] = "https://huggingface.co"

# One checkpoint name shared by tokenizer and model so the two cannot drift.
checkpoint = "bigscience/bloom-560m"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)
# Generation below runs on the GPU.
model = model.to("cuda")

Downloading (…)okenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

# Prompt template class

In [None]:
class PromptTemplate:
  """Base class for prompting a causal language model through a text template.

  Subclasses override `generate_text` (the public entry point) and, where
  needed, `check_rules` and `select_information`. The shared generation logic
  lives in `generate_text_basic`.

  Attributes:
    model: object exposing `generate(...)` and a `device` attribute.
    tokenizer: callable tokenizer exposing `decode`, `eos_token_id` and
      `pad_token_id`.
    template: `str.format` template; positional fields are filled from the
      prompt arguments.
    prefix: text placed verbatim in front of the formatted template.
    max_new_tokens: upper bound on the number of generated tokens.
    edit_output: stored as given; not read anywhere in this class.
  """

  def __init__(self, model, tokenizer, template= "{}", prefix= "", max_new_tokens = 200, edit_output = False):
    self.model = model
    self.tokenizer = tokenizer
    self.template = template
    self.prefix = prefix
    self.max_new_tokens = max_new_tokens
    self.edit_output = edit_output

  def load_template(self, tempalte_path):
    """Replace `self.template` with the contents of a file.

    The template can also be assigned directly as a string. Loading is
    best-effort: if the file cannot be read, the error is printed and the
    current template is left untouched.

    Args:
      tempalte_path: path of the template file. The misspelled name is kept
        so that callers passing it by keyword keep working.
    """
    try:
      # `with` closes the handle; the file is read exactly once.
      with open(tempalte_path, "r") as template_file:
        self.template = template_file.read()
    except Exception as e:
      print("error: ", e)

  def generate_text(self, *args, **kwargs):
    """Main entry point for generating text; meant to be overridden.

    Accepts arbitrary arguments so that it is call-compatible with subclass
    overrides, and returns a reminder string instead of generating anything.
    """
    return "Implement this function on children classes :)"

  def check_rules(self, input_str):
    """Decide whether text should be generated for `input_str`.

    The base implementation accepts every input.
    """
    return True

  def select_information(self, output_str):
    """Pick the useful part out of the raw model output.

    The base implementation returns the output unchanged. Autoregressive
    models return the prompt together with the continuation, so subclasses
    using them usually need to override this.
    """
    return output_str

  def generate_text_basic(self, prompt_arg):
    """Format the prompt, run the model and return the decoded text.

    Args:
      prompt_arg: a single string, or a sequence of values used as the
        positional arguments of `self.template.format`.

    Returns:
      The first generated sequence decoded to a string, special tokens
      included.
    """
    if isinstance(prompt_arg, str):
      prompt_arg = [prompt_arg]
    prompt = self.prefix + self.template.format(*prompt_arg)
    # Use the instance's own tokenizer/model (not notebook globals) and put
    # the inputs on whatever device the model lives on.
    tokens = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
    outputs = self.model.generate(
      input_ids=tokens["input_ids"],
      attention_mask=tokens["attention_mask"],
      max_new_tokens=self.max_new_tokens,
      eos_token_id=self.tokenizer.eos_token_id,
      pad_token_id=self.tokenizer.pad_token_id,
    )
    return self.tokenizer.decode(outputs[0], skip_special_tokens=False)

# 📶 Expanding query


## Class

In [None]:
class ExpandingQuery(PromptTemplate):
  """Prompt template that expands a short query with the language model.

  Long queries are returned untouched; see `check_rules` for the threshold.
  """

  def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

  def check_rules(self, input_str):
    """Return True when `input_str` is short enough to be expanded.

    A query of more than 10 words is treated as long and is not expanded,
    inspired by https://trec.nist.gov/pubs/trec30/papers/Overview-DL.pdf,
    section 2, paragraph 2. The word limit is applied as a character budget
    of 10 * 4.7 characters rather than by counting words.
    """
    long_query_words = 10
    # NOTE(review): 4.7 is presumably an average characters-per-word figure;
    # confirm where it comes from.
    long_query_chars = long_query_words * 4.7
    # Always return a real bool so the result honours the base-class contract.
    return len(input_str) <= long_query_chars

  def select_information(self, str_output):
    """Return the model output unchanged (prompt included)."""
    return str_output

  def generate_text(self, query):
    """Expand `query` if the rules allow it.

    Args:
      query: the query string; also used as the template's format argument.

    Returns:
      A `(text, status)` tuple. `status` is False when the rules rejected
      the query and `text` is the original query; it is True when the model
      was called and `text` is the stripped, selected model output.
    """
    if not self.check_rules(query):
      return (query, False)
    str_output = super().generate_text_basic(query)
    return (self.select_information(str_output).strip(), True)

# Demo: impact of token highlighting



## highlighting token: []

In [None]:
# Demo: few-shot "relevant document" prompt in which the highlighted query
# terms are wrapped in square brackets.
# The template contains no `{}` field, so the whole prompt is fixed text and
# the "" passed to generate_text() below adds nothing to it.
# NOTE(review): the first example has no "Example 1:" header, Example 2
# repeats its passage and Example 3 contains the typo "ahd". The prompt text
# is left as is because the recorded output was produced from it.
query_expander = ExpandingQuery(model, tokenizer)
query_expander.template = """Generate "Relevant Document" based on the "Query". The "Relevant Document" must provide meaningful information according to the "Query". The terms within [] are "Highlighted" terms of the "Query".

Query: What is the recommended amount of [caffeine] intake during [pregnancy], and are there any potential risks associated with consuming small amounts of [caffeine] while [pregnant]?
Relevant Document: We don't know a lot about the effects of caffeine during pregnancy on you and your baby. So it's best to limit the amount you get each day. If you are pregnant, limit caffeine to 200 milligrams each day. This is about the amount in 1½ 8-ounce cups of coffee or one 12-ounce cup of coffee.

Example 2:
Query: Which [fruit] is exclusive to [Australia] and provide some additional details about it?
Relevant Document: Passiflora herbertiana. A rare passion fruit native to Australia. Fruits are green-skinned, white fleshed, with an unknown edible rating. Some sources list the fruit as edible, sweet and tasty, while others list the fruits as being bitter and inedible.assiflora herbertiana. A rare passion fruit native to Australia. Fruits are green-skinned, white fleshed, with an unknown edible rating. Some sources list the fruit as edible, sweet and tasty, while others list the fruits as being bitter and inedible.

Example 3:
Query: What is the size of the [canadian military] ahd what is the number of active personnel and reserve members?
Relevant Document: The Canadian Armed Forces. 1  The first large-scale Canadian peacekeeping mission started in Egypt on November 24, 1956. 2  There are approximately 65,000 Regular Force and 25,000 reservist members in the Canadian military. 3  In Canada, August 9 is designated as National Peacekeepers' Day.

Example 4:
Query: What is the [conversion] of [stereo signal] to [mono signal]?
Relevant Document:"""
# The empty query is below the length threshold in check_rules, so the model
# is called; `status` is True whenever generation actually ran.
generated_text, status = query_expander.generate_text("")
print("generated_text: ", generated_text)

generated_text:  Generate "Relevant Document" based on the "Query". The "Relevant Document" must provide meaningful information according to the "Query". The terms within [] are "Highlighted" terms of the "Query".

Query: What is the recommended amount of [caffeine] intake during [pregnancy], and are there any potential risks associated with consuming small amounts of [caffeine] while [pregnant]?
Relevant Document: We don't know a lot about the effects of caffeine during pregnancy on you and your baby. So it's best to limit the amount you get each day. If you are pregnant, limit caffeine to 200 milligrams each day. This is about the amount in 1½ 8-ounce cups of coffee or one 12-ounce cup of coffee.

Example 2:
Query: Which [fruit] is exclusive to [Australia] and provide some additional details about it?
Relevant Document: Passiflora herbertiana. A rare passion fruit native to Australia. Fruits are green-skinned, white fleshed, with an unknown edible rating. Some sources list the fruit 

## highlighting token: *


In [None]:
# Demo: same few-shot "relevant document" prompt, with the highlighted query
# terms wrapped in asterisks instead of brackets.
# The template contains no `{}` field, so the "" passed to generate_text()
# below adds nothing to the prompt.
# NOTE(review): the first example has no "Example 1:" header, Example 2
# repeats its passage and Example 3 contains the typo "ahd". The prompt text
# is left as is because the recorded output was produced from it.
query_expander = ExpandingQuery(model, tokenizer)
query_expander.template = """Generate "Relevant Document" based on the "Query". The "Relevant Document" must provide meaningful information according to the "Query". The terms within ** are "Highlighted" terms of the "Query".

Query: What is the recommended amount of *caffeine* intake during *pregnancy*, and are there any potential risks associated with consuming small amounts of *caffeine* while *pregnant*?
Relevant Document: We don't know a lot about the effects of caffeine during pregnancy on you and your baby. So it's best to limit the amount you get each day. If you are pregnant, limit caffeine to 200 milligrams each day. This is about the amount in 1½ 8-ounce cups of coffee or one 12-ounce cup of coffee.

Example 2:
Query: Which *fruit* is exclusive to *Australia* and provide some additional details about it?
Relevant Document: Passiflora herbertiana. A rare passion fruit native to Australia. Fruits are green-skinned, white fleshed, with an unknown edible rating. Some sources list the fruit as edible, sweet and tasty, while others list the fruits as being bitter and inedible.assiflora herbertiana. A rare passion fruit native to Australia. Fruits are green-skinned, white fleshed, with an unknown edible rating. Some sources list the fruit as edible, sweet and tasty, while others list the fruits as being bitter and inedible.

Example 3:
Query: What is the size of the *canadian military* ahd what is the number of active personnel and reserve members?
Relevant Document: The Canadian Armed Forces. 1  The first large-scale Canadian peacekeeping mission started in Egypt on November 24, 1956. 2  There are approximately 65,000 Regular Force and 25,000 reservist members in the Canadian military. 3  In Canada, August 9 is designated as National Peacekeepers' Day.

Example 4:
Query: What is the *conversion* of *stereo signal* to *mono signal*?
Relevant Document:"""
# `status` is True whenever the model was actually called.
generated_text, status = query_expander.generate_text("")
print("generated_text: ", generated_text)

generated_text:  Generate "Relevant Document" based on the "Query". The "Relevant Document" must provide meaningful information according to the "Query". The terms within ** are "Highlighted" terms of the "Query".

Query: What is the recommended amount of *caffeine* intake during *pregnancy*, and are there any potential risks associated with consuming small amounts of *caffeine* while *pregnant*?
Relevant Document: We don't know a lot about the effects of caffeine during pregnancy on you and your baby. So it's best to limit the amount you get each day. If you are pregnant, limit caffeine to 200 milligrams each day. This is about the amount in 1½ 8-ounce cups of coffee or one 12-ounce cup of coffee.

Example 2:
Query: Which *fruit* is exclusive to *Australia* and provide some additional details about it?
Relevant Document: Passiflora herbertiana. A rare passion fruit native to Australia. Fruits are green-skinned, white fleshed, with an unknown edible rating. Some sources list the fruit 

## highlighting token: <>

In [None]:
# Demo: same few-shot "relevant document" prompt, with the highlighted query
# terms wrapped in angle brackets.
# The template contains no `{}` field, so the "" passed to generate_text()
# below adds nothing to the prompt.
# NOTE(review): the first example has no "Example 1:" header, Example 2
# repeats its passage and Example 3 contains the typo "ahd". The prompt text
# is left as is because the recorded output was produced from it.
query_expander = ExpandingQuery(model, tokenizer)
query_expander.template = """Generate "Relevant Document" based on the "Query". The "Relevant Document" must provide meaningful information according to the "Query". The terms within <> are "Highlighted" terms of the "Query".

Query: What is the recommended amount of <caffeine> intake during <pregnancy>, and are there any potential risks associated with consuming small amounts of <caffeine> while <pregnant>?
Relevant Document: We don't know a lot about the effects of caffeine during pregnancy on you and your baby. So it's best to limit the amount you get each day. If you are pregnant, limit caffeine to 200 milligrams each day. This is about the amount in 1½ 8-ounce cups of coffee or one 12-ounce cup of coffee.

Example 2:
Query: Which <fruit> is exclusive to <Australia> and provide some additional details about it?
Relevant Document: Passiflora herbertiana. A rare passion fruit native to Australia. Fruits are green-skinned, white fleshed, with an unknown edible rating. Some sources list the fruit as edible, sweet and tasty, while others list the fruits as being bitter and inedible.assiflora herbertiana. A rare passion fruit native to Australia. Fruits are green-skinned, white fleshed, with an unknown edible rating. Some sources list the fruit as edible, sweet and tasty, while others list the fruits as being bitter and inedible.

Example 3:
Query: What is the size of the <canadian military> ahd what is the number of active personnel and reserve members?
Relevant Document: The Canadian Armed Forces. 1  The first large-scale Canadian peacekeeping mission started in Egypt on November 24, 1956. 2  There are approximately 65,000 Regular Force and 25,000 reservist members in the Canadian military. 3  In Canada, August 9 is designated as National Peacekeepers' Day.

Example 4:
Query: What is the <conversion> of <stereo signal> to <mono signal>?
Relevant Document:"""
# `status` is True whenever the model was actually called.
generated_text, status = query_expander.generate_text("")
print("generated_text: ", generated_text)

generated_text:  Generate "Relevant Document" based on the "Query". The "Relevant Document" must provide meaningful information according to the "Query". The terms within <> are "Highlighted" terms of the "Query".

Query: What is the recommended amount of <caffeine> intake during <pregnancy>, and are there any potential risks associated with consuming small amounts of <caffeine> while <pregnant>?
Relevant Document: We don't know a lot about the effects of caffeine during pregnancy on you and your baby. So it's best to limit the amount you get each day. If you are pregnant, limit caffeine to 200 milligrams each day. This is about the amount in 1½ 8-ounce cups of coffee or one 12-ounce cup of coffee.

Example 2:
Query: Which <fruit> is exclusive to <Australia> and provide some additional details about it?
Relevant Document: Passiflora herbertiana. A rare passion fruit native to Australia. Fruits are green-skinned, white fleshed, with an unknown edible rating. Some sources list the fruit 

## highlighting token: ()

In [None]:
# Demo: same few-shot "relevant document" prompt, with the highlighted query
# terms wrapped in parentheses.
# The template contains no `{}` field, so the "" passed to generate_text()
# below adds nothing to the prompt.
# NOTE(review): the first example has no "Example 1:" header, Example 2
# repeats its passage and Example 3 contains the typo "ahd". The prompt text
# is left as is because the recorded output was produced from it.
query_expander = ExpandingQuery(model, tokenizer)
query_expander.template = """Generate "Relevant Document" based on the "Query". The "Relevant Document" must provide meaningful information according to the "Query". The terms within () are "Highlighted" terms of the "Query".

Query: What is the recommended amount of (caffeine) intake during (pregnancy), and are there any potential risks associated with consuming small amounts of (caffeine) while (pregnant)?
Relevant Document: We don't know a lot about the effects of caffeine during pregnancy on you and your baby. So it's best to limit the amount you get each day. If you are pregnant, limit caffeine to 200 milligrams each day. This is about the amount in 1½ 8-ounce cups of coffee or one 12-ounce cup of coffee.

Example 2:
Query: Which (fruit) is exclusive to (Australia) and provide some additional details about it?
Relevant Document: Passiflora herbertiana. A rare passion fruit native to Australia. Fruits are green-skinned, white fleshed, with an unknown edible rating. Some sources list the fruit as edible, sweet and tasty, while others list the fruits as being bitter and inedible.assiflora herbertiana. A rare passion fruit native to Australia. Fruits are green-skinned, white fleshed, with an unknown edible rating. Some sources list the fruit as edible, sweet and tasty, while others list the fruits as being bitter and inedible.

Example 3:
Query: What is the size of the (canadian military) ahd what is the number of active personnel and reserve members?
Relevant Document: The Canadian Armed Forces. 1  The first large-scale Canadian peacekeeping mission started in Egypt on November 24, 1956. 2  There are approximately 65,000 Regular Force and 25,000 reservist members in the Canadian military. 3  In Canada, August 9 is designated as National Peacekeepers' Day.

Example 4:
Query: What is the (conversion) of (stereo signal) to (mono signal)?
Relevant Document:"""
# `status` is True whenever the model was actually called.
generated_text, status = query_expander.generate_text("")
print("generated_text: ", generated_text)

generated_text:  Generate "Relevant Document" based on the "Query". The "Relevant Document" must provide meaningful information according to the "Query". The terms within () are "Highlighted" terms of the "Query".

Query: What is the recommended amount of (caffeine) intake during (pregnancy), and are there any potential risks associated with consuming small amounts of (caffeine) while (pregnant)?
Relevant Document: We don't know a lot about the effects of caffeine during pregnancy on you and your baby. So it's best to limit the amount you get each day. If you are pregnant, limit caffeine to 200 milligrams each day. This is about the amount in 1½ 8-ounce cups of coffee or one 12-ounce cup of coffee.

Example 2:
Query: Which (fruit) is exclusive to (Australia) and provide some additional details about it?
Relevant Document: Passiflora herbertiana. A rare passion fruit native to Australia. Fruits are green-skinned, white fleshed, with an unknown edible rating. Some sources list the fruit 

## Without highlighting

In [None]:
# Demo without any highlighting: a few-shot query-expansion prompt (three
# solved examples, then a fourth query whose expansion the model must write).
# The template contains no `{}` field, so the "" passed to generate_text()
# below adds nothing to the prompt.
# NOTE(review): Example 3 contains the typo "ahd". The prompt text is left
# as is because the recorded output was produced from it.
query_expander = ExpandingQuery(model, tokenizer)
query_expander.template = """Expand the query but do not change the main question.

Example 1:
Query: Is a little caffeine ok during pregnancy?
Query Expanded: What is the recommended amount of caffeine intake during pregnancy, and are there any potential risks associated with consuming small amounts of caffeine while pregnant?

Example 2:
Query: What fruit is native to Australia?
Query Expanded: Which fruit is exclusive to Australia and provide some additional details about it?

Example 3:
Query: How large is the canadian military?
Query Expanded: What is the size of the canadian military ahd what is the number of active personnel and reserve members?

Example 4:
Query: converting stereo signal to mono signal is called
Query Expanded:"""
# select_information() returns the decoded text unchanged, so nothing is
# trimmed from what the model returns before it is printed.
generated_text, status = query_expander.generate_text("")
print("generated_text: ", generated_text)

generated_text:  Expand the query but do not change the main question.

Example 1:
Query: Is a little caffeine ok during pregnancy?
Query Expanded: What is the recommended amount of caffeine intake during pregnancy, and are there any potential risks associated with consuming small amounts of caffeine while pregnant?

Example 2:
Query: What fruit is native to Australia?
Query Expanded: Which fruit is exclusive to Australia and provide some additional details about it?

Example 3:
Query: How large is the canadian military?
Query Expanded: What is the size of the canadian military ahd what is the number of active personnel and reserve members?

Example 4:
Query: converting stereo signal to mono signal is called
Query Expanded: What is the conversion of stereo signal to mono signal?
Query Expanded: What is the conversion of stereo signal to mono signal?
Query Expanded: What is the conversion of stereo signal to mono signal?
Query Expanded: What is the conversion of stereo signal to mono si