In [97]:
import pandas as pd

In [98]:
# Read the exercise table, skipping any column named "id" stored in the CSV.
df = pd.read_csv(
    "../data/data.csv",
    usecols=lambda column_name: column_name != "id",
)

In [99]:
# Export one dict per row. The row position is attached as "id" here because
# `df` only gains its "id" column in a later cell, after this export has run;
# without it the records would lack the field that the search index is
# configured to use as its keyword field.
documents = df.assign(id=df.index.to_list()).to_dict(orient="records")

In [100]:
# Display the column labels of the loaded frame.
df.columns

Index(['exercise_name', 'type_of_activity', 'type_of_equipment', 'body_part',
       'type', 'muscle_groups_activated', 'instructions'],
      dtype='object')

In [101]:
# Add the row position as the first column, "id".
# Guarded so the cell can be re-run: DataFrame.insert raises ValueError when
# the column already exists.
if "id" not in df.columns:
    df.insert(loc=0, column="id", value=df.index.to_list())

In [102]:
import minsearch

In [103]:
# Configure the search index: every column except "id" is searched as free
# text, and "id" is the only keyword (exact-match) field.
# keyword_fields must be a list of field names; a bare string such as "id"
# would be iterated character by character ("i", "d").
# Selecting text fields by name (rather than by position) keeps this correct
# regardless of where the "id" column sits in the frame.
index = minsearch.Index(
    text_fields=[column for column in df.columns.tolist() if column != "id"],
    keyword_fields=["id"],
)

In [104]:
# Fit the search index on the exported records; the call's return value is
# displayed as the cell output.
index.fit(documents)

<minsearch.Index at 0x7033e368fce0>

In [65]:
# Try the index with a sample query: empty filter dict, empty boost dict,
# and num_results=10.
results=index.search(
        query='Push-up',
        filter_dict={},
        boost_dict={},
        num_results=10 )

In [67]:
# Inspect the type of a single search hit (shown as `dict` in the saved output).
type(results[0])

dict

In [105]:
def search(query):
    """Return the hits of the module-level `index` for `query`.

    The search is run with an empty filter dict, an empty boost dict and
    num_results=10.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=10,
    )

In [106]:
# Instruction prompt sent to the LLM. The {question} and {context}
# placeholders are filled in with str.format by build_prompt.
# (Typo fix: "insrtuctor" -> "instructor" in the role description.)
prompt_template = """
You're a fitness instructor. Answer the QUESTION based on the CONTEXT from our exercises database. 
Use only the facts from the CONTEXT when answering the QUESTION. Lastly, Answer should be translated to Traditional Chinese language.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

# Layout of one exercise record inside the prompt context: one
# "field: value" line per column, with str.format placeholders named after
# the record keys.
entry_template = (
    "exercise_name: {exercise_name}\n"
    "type_of_activity: {type_of_activity}\n"
    "type_of_equipment: {type_of_equipment}\n"
    "body_part: {body_part}\n"
    "type: {type}\n"
    "muscle_groups_activated: {muscle_groups_activated}\n"
    "instructions: {instructions}"
)

def build_prompt(query, search_results):
    """Render the LLM prompt for `query` from the retrieved records.

    Each record in `search_results` is unpacked into `entry_template`, so it
    must provide a value for every placeholder of that template. The rendered
    entries, each followed by a blank line, form the CONTEXT section of
    `prompt_template`; the final prompt is returned with surrounding
    whitespace stripped.
    """
    rendered_entries = [
        entry_template.format(**doc) + "\n\n" for doc in search_results
    ]
    context = "".join(rendered_entries)
    return prompt_template.format(question=query, context=context).strip()

In [107]:
import functools


@functools.lru_cache(maxsize=1)
def _load_tokenizer_and_model(model_name):
    """Load the tokenizer and causal-LM for `model_name`, caching the pair.

    Only the most recently requested model is kept, so repeated calls with
    the same name reuse the loaded weights instead of reloading them.
    """
    # Imported lazily so defining this cell does not itself import transformers.
    from transformers import AutoTokenizer, AutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
    )
    return tokenizer, model


def llm(prompt, model_name = "Qwen/Qwen3-1.7B", max_new_tokens=32768, enable_thinking=True):
    """Generate a chat completion for `prompt` and return the answer text.

    Args:
        prompt: Text sent as the single user message of the chat.
        model_name: Model identifier passed to `from_pretrained`.
        max_new_tokens: Upper bound on generated tokens passed to `generate`.
        enable_thinking: Forwarded to the chat template to switch between
            thinking and non-thinking modes.

    Returns:
        The decoded text that follows the last `</think>` token of the
        generation. If no such token is present, the whole generation is
        returned.

    Side effects:
        Prints the decoded thinking part (everything up to and including the
        last `</think>` token) to stdout.
    """
    tokenizer, model = _load_tokenizer_and_model(model_name)

    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Conduct text completion.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
    )
    # Drop the prompt tokens so only newly generated ids remain.
    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

    # Token id 151668 is `</think>`. Find the position just past its LAST
    # occurrence (a reverse search); 0 means "no thinking part found".
    think_end_token_id = 151668
    try:
        think_end = len(output_ids) - output_ids[::-1].index(think_end_token_id)
    except ValueError:
        think_end = 0

    thinking_content = tokenizer.decode(output_ids[:think_end], skip_special_tokens=True).strip("\n")
    print(thinking_content)
    content = tokenizer.decode(output_ids[think_end:], skip_special_tokens=True).strip("\n")

    return content

In [108]:
def rag(query, model_name = "Qwen/Qwen3-1.7B"):
    """Answer `query` by retrieval-augmented generation.

    Retrieves records with `search`, renders them into a prompt with
    `build_prompt`, and returns what `llm` produces for that prompt using
    `model_name`.
    """
    retrieved_docs = search(query)
    return llm(build_prompt(query, retrieved_docs), model_name=model_name)

In [109]:
# End-to-end check of the pipeline: run one question through rag() and print
# the returned answer (llm() additionally prints the model's thinking text).
question = 'Is the Lat Pulldown considered a strength training activity, and if so, why?'
answer = rag(question)
print(answer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<think>
Okay, let's tackle this question. The user is asking if the Lat Pulldown is considered a strength training activity and why. 

First, I need to check the CONTEXT provided. Looking through the exercises, I see multiple entries for Lat Pulldown. All of them have "type_of_activity: Strength" which indicates that they are strength training exercises. The equipment varies, but it's always a machine or cable machine. The body part is Upper Body, and the muscle groups activated include the Latissimus Dorsi and Biceps. 

The instructions describe pulling the bar down to the chest, which is a common strength movement. The fact that it's categorized as strength training in all the entries supports that. The user might be confused if they think all these exercises are the same, but the context clearly states they are all strength activities. 

So the answer should confirm that yes, the Lat Pulldown is a strength training activity because it targets the latissimus dorsi and biceps, and the

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [8]:
# Model identifier passed to the tokenizer/model `from_pretrained` calls below.
model_name = "Qwen/Qwen3-1.7B"

In [9]:
# Load the tokenizer and the causal-LM weights for `model_name`.
# torch_dtype and device_map are both set to "auto", i.e. neither the dtype
# nor the device placement is fixed in this notebook.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Prepare the model input: a single-turn chat whose only message is the
# user's prompt text.
prompt = "Give me a short introduction to large language model."
messages = [{"role": "user", "content": prompt}]

In [11]:
# Render the chat messages into one prompt string (tokenize=False) with the
# generation prompt appended, so the model continues as the assistant.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
)
# Tokenize the rendered prompt as a batch of one, returning PyTorch tensors
# moved to the model's device.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)


In [13]:
# Display the chat messages that were fed to the chat template.
messages

[{'role': 'user',
  'content': 'Give me a short introduction to large language model.'}]

In [14]:
# Display the tokenized prompt (input_ids and attention_mask tensors).
model_inputs

{'input_ids': tensor([[151644,    872,    198,  35127,    752,    264,   2805,  16800,    311,
           3460,   4128,   1614,     13, 151645,    198, 151644,  77091,    198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [15]:
# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768
)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 


In [22]:
# Split the generation into its thinking part and its final answer.
# Token id 151668 is `</think>`; searching the reversed list finds its LAST
# occurrence, and `think_end` is the position just past it (0 when absent,
# which leaves the thinking part empty).
# NOTE: deliberately not named `index` -- that name is the module-level
# minsearch Index used by search(), and rebinding it to an int here would
# break every later search()/rag() call.
try:
    think_end = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    think_end = 0

thinking_content = tokenizer.decode(output_ids[:think_end], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[think_end:], skip_special_tokens=True).strip("\n")


In [23]:
# Show the two parts of the generation separately: the model's thinking text
# first, then the final answer.
print("thinking content:", thinking_content)
print("content:", content)

thinking content: <think>
Okay, the user wants a short introduction to large language models. Let me start by defining what they are. Large language models are AI systems that can understand and generate human-like text. I should mention their key features like training on vast datasets, neural networks, and the ability to generate text on the fly.

Wait, they might not know the specifics. I need to keep it simple. Maybe start with the basics: what they are, how they work, and their applications. Also, highlight their capabilities like language understanding, generation, and reasoning. Should I mention specific examples like GPT or BERT? Probably not necessary for a short intro. Focus on the general concept.

Make sure to explain that they're trained on a lot of data, which allows them to learn patterns and generate coherent text. Also, note that they can be used in various fields like writing, coding, or customer service. Avoid jargon but keep it informative. Check for clarity and con