If you're not running in Saturn Cloud, you need to install these libraries:

Make sure you use the latest versions

```
pip install -U transformers accelerate bitsandbytes
```

In [1]:
# to clear Saturn cache
from transformers import TRANSFORMERS_CACHE
print(TRANSFORMERS_CACHE)

/home/jovyan/.cache/huggingface/hub


In [2]:
import shutil
shutil.rmtree(TRANSFORMERS_CACHE)

In [1]:
import os

# install the hugging face packages here
os.environ['HF_HOME'] = '/run/cache/'

In [4]:
# os.getenv('HF_TOKEN')

In [3]:
!rm -f minsearch.py
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-07-07 03:58:49--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-07-07 03:58:49 (64.4 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [2]:
import requests 
import minsearch

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.Index at 0x7f1f785d79a0>

In [25]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=3
    )

    return results

In [4]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [6]:
# lists folders and space
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         100G   36G   65G  36% /
tmpfs            64M     0   64M   0% /dev
tmpfs           7.7G     0  7.7G   0% /sys/fs/cgroup
/dev/nvme0n1p1  100G   36G   65G  36% /run
tmpfs            14G     0   14G   0% /dev/shm
/dev/nvme2n1    2.0G  141M  1.8G   8% /home/jovyan
tmpfs            14G  120K   14G   1% /home/jovyan/.saturn
tmpfs            14G   12K   14G   1% /run/secrets/kubernetes.io/serviceaccount
tmpfs           7.7G   12K  7.7G   1% /proc/driver/nvidia
tmpfs           7.7G  3.7M  7.7G   1% /run/nvidia-persistenced/socket
tmpfs           7.7G     0  7.7G   0% /proc/acpi
tmpfs           7.7G     0  7.7G   0% /sys/firmware


In [5]:
from huggingface_hub import login

In [6]:
login(token=os.getenv('HF_TOKEN2'))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /run/cache/token
Login successful


In [7]:
# Load model directly w/ access token
from transformers import AutoTokenizer, AutoModelForCausalLM

In [8]:
access_token = os.getenv('HF_TOKEN2')

In [9]:
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True, token=access_token
)

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left", token=access_token)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [10]:
# for the test below
# model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")

In [11]:
# test it and then place in llm()
# generated_ids = model.generate(**model_inputs)
# tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'A list of colors: red, blue, green, yellow, orange, purple, pink,'

In [18]:
# Use a pipeline as a high-level helper
from transformers import pipeline

In [19]:
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [22]:
def build_prompt(query, search_results):
    prompt_template = """
QUESTION: {question}

CONTEXT:
{context}

ANSWER:
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"{doc['question']}\n{doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [26]:
def llm(prompt):
    response = generator(prompt, max_length=600, temperature=0.7, top_p=0.95, num_return_sequences=1)
    response_final = response[0]['generated_text']
    return response_final[len(prompt):].strip()

In [27]:
rag("I just discovered the course. Can I still join it?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'Yes, you can still join the course.'