In [7]:
import instructor
from openai import OpenAI
from typing import List
from pydantic import BaseModel, Field
import os

client = instructor.patch(OpenAI(api_key=os.environ["OPENAI_API_KEY"]))

In [8]:
from ragatouille.data import CorpusProcessor, llama_index_sentence_splitter
path = '/home/adrien/Documents/Coding/RAG_ASN/Ingestion/ASN/lettres_de_suivi/txt'
full_documents = []
nb_docs = 10
for file, i in zip(os.listdir(path), range(0,nb_docs)): 
    
    with open(f"{path}/{file}") as f:
        raw_text = f.read()
    full_documents.append(raw_text)
  

In [9]:
corpus_processor = CorpusProcessor(document_splitter_fn=llama_index_sentence_splitter)
documents = corpus_processor.process_corpus(full_documents, chunk_size=400)

In [10]:
class QuestionsAndQueries(BaseModel):
    hypothetical_questions: List[str] = Field(
        default_factory=list,
        description="A wide variety of hypothetical questions that this document could answer.",
    )
    hypothetical_queries: List[str] = Field(
        default_factory=list,
        description="A wide variety of hypothetical queries that this document would be relevant to, in the context of a search engine or a retrieval pipeline.",
    )

class Summary(BaseModel):
    summary: List[str] = Field(
        default_factory=list,
        description="A summary of the letter.",
    )


In [11]:
candidate_queries = []

summaries = []


for doc in documents: 
         
    candidate = client.chat.completions.create(
            model="gpt-4-1106-preview",
            response_model=QuestionsAndQueries,
            messages=[
                {
                    "role": "system",
                    "content": """You are an expert AI assisting us in creating a high quality, diverse synthetic dataset to train Information Retrieval models. 
                    Your role is to analyse the document chunk given to you and provide us with high quality potential queries.
                    The document is in french so answer in french but without using special characters.""",
                },
                {"role": "user", "content": doc},
            ],
        )

    candidate_queries.append(candidate)
    

    # summary = client.chat.completions.create(
    #         model="gpt-4-1106-preview",
    #         response_model=Summary,
    #         messages=[
    #             {
    #                 "role": "system",
    #                 "content": """You are an expert AI assisting us in creating a high quality, diverse synthetic dataset to train Information Retrieval models. 
    #                 Your role is to analyse the letter given to you and provide us with a summary containing all the facts in the letter.
    #                 The document is in french so answer in french but usong only the standard 26 letters from the alphabet and not the ones with accents.""",
    #             },
    #             {"role": "user", "content": raw_text},
    #         ],
    #     )

    # summaries.append(summary)

print("Document: ")
print(documents[0])

print("Generated queries: ")
candidate_queries[0].model_dump() 

Retrying, exception: 1 validation error for QuestionsAndQueries
  Invalid JSON: EOF while parsing a list at line 13817 column 0 [type=json_invalid, input_value='{"hypothetical_questions...n\n\n\n\n\n\n\n\n\n\n\n', input_type=str]
    For further information visit https://errors.pydantic.dev/2.5/v/json_invalid
Traceback (most recent call last):
  File "/home/adrien/Documents/Coding/RAG_ASN/.rag_env/lib/python3.11/site-packages/instructor/patch.py", line 287, in retry_sync
    return process_response(
           ^^^^^^^^^^^^^^^^^
  File "/home/adrien/Documents/Coding/RAG_ASN/.rag_env/lib/python3.11/site-packages/instructor/patch.py", line 165, in process_response
    model = response_model.from_response(
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/adrien/Documents/Coding/RAG_ASN/.rag_env/lib/python3.11/site-packages/instructor/function_calls.py", line 137, in from_response
    return cls.model_validate_json(
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/adrien/Documents/C

BadRequestError: Error code: 400 - {'error': {'message': "Sorry! We've encountered an issue with repetitive patterns in your prompt. Please try again with a different prompt.", 'type': 'invalid_request_error', 'param': 'prompt', 'code': None}}

In [12]:
import random

pairs = []
num_questions = 1
num_queries = 1
random.seed(4)
i=0
for candidates, doc in zip(candidate_queries, documents[:-1]):
    candidates = candidates.model_dump()
    queries = random.sample(candidates['hypothetical_questions'], num_questions)
    # if len(candidates['hypothetical_queries']) != 0:
    queries += random.sample(candidates['hypothetical_queries'], num_queries)
    for q in queries:
        pairs.append([q, doc])
    i+=1
pairs

[['A quel article de loi fait reference le controle des installations nucleaires de base ?',
  'RÉPUBLIQUE FRANÇAISE\n\nDIVISION DE MARSEILLE\n\nMarseille, le 20 septembre 2011\n\nN/Réf. :\n\nCODEP-MRS-2011-050321\n\nMonsieur le directeur général délégué de l’établissement MELOX BP 93124 30203 BAGNOLS SUR CEZE Cedex\n\nObjet : Contrôle des Installations nucléaires de base. INB 151 usine MÉLOX à Marcoule\n\nInspection n° INSSN-MRS-2011-0688 du 6 septembre 2011 REX événement significatif du 28 juin 2011 au banc d’assemblage TGM\n\nMonsieur le directeur général délégué,\n\nDans le cadre de la surveillance des installations nucléaires de base prévue à l’article 40 de la loi n°2006-686 du 13 juin 2006, une inspection a eu lieu le 6 septembre 2011 sur le thème mentionné en objet.\n\nFaisant suite aux constatations des inspecteurs de l’ASN formulées à cette occasion, j’ai l’honneur de vous communiquer ci-dessous la synthèse de l’inspection ainsi que les principales demandes et observations qu

In [13]:
from ragatouille import RAGTrainer

trainer = RAGTrainer(model_name="ASNcolBERTv2.0",
                     pretrained_model_name="colbert-ir/colbertv2.0",
                     
                     )



In [14]:
trainer.prepare_training_data(
        raw_data = pairs,
        all_documents = documents,
        num_new_negatives = 10,
        mine_hard_negatives= True,
        )

Loading Hard Negative SimpleMiner dense embedding model BAAI/bge-small-en-v1.5...
Building hard negative index for 93 documents...
All documents embedded, now adding to index...
save_index set to False, skipping saving hard negative index
Hard negative index generated
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining
mining


'./data/'

In [15]:

trainer.train(batch_size=32,
              nbits=4, # How many bits will the trained model use when compressing indexes
              maxsteps=500000, # Maximum steps hard stop
              use_ib_negatives=True, # Use in-batch negative to calculate loss
              dim=128, # How many dimensions per embedding. 128 is the default and works well.
              learning_rate=5e-6, # Learning rate, small values ([3e-6,3e-5] work best if the base model is BERT-like, 5e-6 is often the sweet spot)
              doc_maxlen=400, # Maximum document length. Because of how ColBERT works, smaller chunks (128-256) work very well.
              use_relu=False, # Disable ReLU -- doesn't improve performance
              warmup_steps="auto", # Defaults to 10%
             )


#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "nbits": 4,
    "kmeans_niters": 20,
    "resume": false,
    "similarity": "cosine",
    "bsize": 32,
    "accumsteps": 1,
    "lr": 5e-6,
    "maxsteps": 500000,
    "save_every": 0,
    "warmup": 0,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": true,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": "ASNcolBERTv2.0",
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 400,
    "mask_punctuation": true,
    "checkpoint": "colbert-ir\/colbertv2.0",
    "triples": "data\/triples.train.colbert.jsonl",
    "collection": "data

  return self.fget.__get__(instance, owner)()


#> LR will use 0 warmup steps and linear decay over 500000 steps.

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . A quel article de loi fait reference le controle des installations nucleaires de base ?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  1037, 10861,  2140,  3720,  2139,  8840,  2072, 26208,
         2102,  4431,  3393,  2491,  2063,  4078, 14111, 16371, 14321, 14737,
         2015,  2139,  2918,  1029,   102,   103,   103,   103,   103,   103,
          103,   103], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')



Process Process-1:
Traceback (most recent call last):
  File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/adrien/Documents/Coding/RAG_ASN/.rag_env/lib/python3.11/site-packages/colbert/infra/launcher.py", line 115, in setup_new_process
    return_val = callee(config, *args)
                 ^^^^^^^^^^^^^^^^^^^^^
  File "/home/adrien/Documents/Coding/RAG_ASN/.rag_env/lib/python3.11/site-packages/colbert/training/training.py", line 103, in train
    scores = colbert(*encoding)
             ^^^^^^^^^^^^^^^^^^
  File "/home/adrien/Documents/Coding/RAG_ASN/.rag_env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/adrien/Documents/Coding/RAG_ASN/.rag_env/lib/python3.11/site-pa

KeyboardInterrupt: 