In [1]:
# Take database and fit a toy search 
import requests 
import minsearch

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.Index at 0x7f73f96836a0>

In [2]:
# Change the directory to store data from HuggingFace in the appropriate location on the SaturnCloud 
import os
os.environ['HF_HOME'] = '/run/cache/'

to check the GPU in terminal:  `nvidia-smi `; real-time monitoring: `watch nvidia-smi`

(flan-t5 used 11GB, whereas phi3-mini 8 GB)  

In [17]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import login


In [4]:
login(token=os.environ['HF_TOKEN'])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /run/cache/token
Login successful


In [5]:
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True
)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [7]:
model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")

In [8]:
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'A list of colors: red, blue, green, yellow, orange, purple, pink,'

In [18]:
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [23]:
def llm(prompt):
    output = generator(prompt, max_length=200, truncation=True, temperature=0.7, top_p=0.95, num_return_sequences=1)
    response = output[0]['generated_text']
    return response[len(prompt):].strip()

# because it's a completion model we need to change the prompt 
def build_prompt(query, search_results):
    prompt_template = """    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    
    ANSWER:
    
    """.strip()

    context_template = """
    Q: {question}
    A: {text}
    """.strip()

    context = ""
    
    for doc in search_results:
        context = context + context_template.format(question=doc['question'], text=doc['text']) + f"\n\n" #
        
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=2
    )

    return results


def rag(query):
    search_results = search(query) #toy search engine 
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [24]:
rag("I just discovered the course. Can I still join it?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'Yes, you can still join the course.'

**Explanation of Parameters:**


* `max_length`: Sets the maximum number of tokens to generate. 
* `truncation`: When set to True, it truncates the input sequence to the maximum length if it exceeds it.
* `temperature`: Controls the randomness of the output. A value of 0.7 provides a balance between creativity and coherence.
* `top_p`: Uses nucleus sampling, where only the most probable tokens with cumulative probability less than the specified value (0.95 here) are considered for generation.
* `num_return_sequences`: Specifies the number of alternative sequences to generate. Here, it's set to 1, meaning only one output sequence will be returned.
