In [1]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
import re
import mlflow
import time
import numpy as np
from sentence_transformers import util as st_util
import torch


In [2]:
mlflow.set_tracking_uri("file:./mlruns")  # local directory
mlflow.set_experiment("AUIChat-Embedding-Experiments")


<Experiment: artifact_location='file:///home/barneh/Rag-Based-LLM_AUIChat/rag_based_llm_auichat/notebooks/mlruns/180508001129329925', creation_time=1742769210427, experiment_id='180508001129329925', last_update_time=1742769210427, lifecycle_stage='active', name='AUIChat-Embedding-Experiments', tags={}>

2️⃣ Utility Functions 
Clean Text

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[^a-zA-Z0-9,.!?;:\'\"()\[\]\s]', '', text)
    return text


3️⃣ Define Embedding Models to Compare 

In [4]:
embedding_models = {
    "msmarco": "sentence-transformers/msmarco-distilbert-base-v4",
    "miniLM": "sentence-transformers/all-MiniLM-L6-v2",
    "bge": "BAAI/bge-small-en-v1.5"
}

4️⃣ Define Chunking Strategies 

In [5]:
chunk_configs = [
    {"chunk_size": 400, "overlap": 50},
    {"chunk_size": 250, "overlap": 25},
    {"chunk_size": 100, "overlap": 0}
]

5️⃣ Load & Preprocess Documents 📄

In [6]:
documents = SimpleDirectoryReader("/home/barneh/Rag-Based-LLM_AUIChat/raw").load_data()

6️⃣ Helper Function for Running Experiments 🧪

In [7]:
def calculate_relevance_score(query, source_nodes):
    """Calculate relevance score between query and retrieved chunks"""
    if not source_nodes:
        return 0.0
    
    # Get the query embedding using the current embed model
    query_embedding = Settings.embed_model.get_query_embedding(query)
    
    # Get embeddings for all source nodes
    relevance_scores = []
    for node in source_nodes:
        if hasattr(node, 'embedding') and node.embedding is not None:
            node_embedding = node.embedding
        else:
            # If node doesn't have embedding, compute it
            node_embedding = Settings.embed_model.get_text_embedding(node.get_text())
            
        # Calculate cosine similarity
        if isinstance(query_embedding, list) and isinstance(node_embedding, list):
            query_tensor = torch.tensor([query_embedding])
            node_tensor = torch.tensor([node_embedding])
            score = st_util.pytorch_cos_sim(query_tensor, node_tensor).item()
        else:
            score = np.dot(query_embedding, node_embedding) / (np.linalg.norm(query_embedding) * np.linalg.norm(node_embedding))
            
        relevance_scores.append(score)
    
    # Return average relevance score
    return sum(relevance_scores) / len(relevance_scores) if relevance_scores else 0.0

def count_tokens(text):
    """Approximate token count using a simple whitespace-based approach"""
    return len(text.split())

def run_experiment(model_name, model_path, chunk_size, overlap, documents):
    """Run a single experiment with specific model and chunk configuration"""
    print(f"\n🔍 Testing with model: {model_name}")
    print(f"📏 Chunk size: {chunk_size}, Overlap: {overlap}")
    
    # Set up embedding model
    embed_model = HuggingFaceEmbedding(model_name=model_path)
    Settings.embed_model = embed_model
    
    # Start MLflow run
    with mlflow.start_run(run_name=f"{model_name}_{chunk_size}"):
        # Log parameters
        mlflow.log_param("embedding_model", model_name)
        mlflow.log_param("chunk_size", chunk_size)
        mlflow.log_param("overlap", overlap)
        
        # Process documents
        start_time = time.time()
        
        splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
        nodes = splitter.get_nodes_from_documents(documents)
        
        for node in nodes:
            node.text = clean_text(node.text)
            
        # Create index
        storage_context = StorageContext.from_defaults()
        index = VectorStoreIndex(nodes, storage_context=storage_context)
        
        # Log metrics about the index
        index_build_time = time.time() - start_time
        mlflow.log_metric("index_build_time_seconds", index_build_time)
        mlflow.log_metric("total_chunks", len(nodes))
        
        # Set up query engine
        query_engine = index.as_query_engine(
            llm=HuggingFaceInferenceAPI(
                model_name="mistralai/Mistral-7B-Instruct-v0.3",
                token="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"
            )
        )
        
        # Run query with timing
        query = "What are the requirements for the PiP program?"
        query_start_time = time.time()
        response = query_engine.query(query)
        query_time = time.time() - query_start_time
        
        # Extract metrics from response
        source_nodes = getattr(response, 'source_nodes', [])
        retrieved_chunks = len(source_nodes) if source_nodes else 0
        response_text = str(response)
        response_tokens = count_tokens(response_text)
        
        # Calculate relevance score
        relevance_score = calculate_relevance_score(query, source_nodes)
        
        # Log response metrics
        mlflow.log_metric("response_time_seconds", query_time)
        mlflow.log_metric("retrieved_chunks", retrieved_chunks)
        mlflow.log_metric("token_count", response_tokens)
        mlflow.log_metric("relevance_score", relevance_score)
        mlflow.log_metric("tokens_per_second", response_tokens / query_time if query_time > 0 else 0)
        
        # Log response text
        mlflow.log_text(str(response), "response.txt")
        
        # Log source nodes for analysis
        if source_nodes:
            source_texts = "\n\n---\n\n".join([node.get_text() for node in source_nodes])
            mlflow.log_text(source_texts, "source_chunks.txt")
        
        print("🧠 Response:")
        print(response)
        print(f"⏱️ Response Time: {query_time:.2f} seconds")
        print(f"📊 Relevance Score: {relevance_score:.4f}")
        print(f"🧩 Chunks Retrieved: {retrieved_chunks}")
        print(f"🔤 Token Count: {response_tokens}")
        
        return response

7️⃣ Run Individual Experiments 🧪

Each experiment is separated into its own cell to prevent VS Code from crashing.

### Model: msmarco with chunk size: 400

In [8]:
# Run msmarco with chunk size 400
model_name = "msmarco"
model_path = embedding_models[model_name]
cfg = chunk_configs[0]  # {"chunk_size": 400, "overlap": 50}
run_experiment(model_name, model_path, cfg["chunk_size"], cfg["overlap"], documents)


🔍 Testing with model: msmarco
📏 Chunk size: 400, Overlap: 50


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/545 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/sentence-transformers/msmarco-distilbert-base-v4/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:  67%|######7   | 178M/265M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/sentence-transformers/msmarco-distilbert-base-v4/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:  67%|######7   | 178M/265M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/sentence-transformers/msmarco-distilbert-base-v4/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:  91%|######### | 241M/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🧠 Response:
The PiP program requires applicants to be recent graduates of an American-style liberal arts model. They must speak English fluently, and proficiency in French or Arabic is encouraged but not necessary. Applicants must have recently graduated with an undergraduate degree within the last two academic years. The application process involves filling out an application form and submitting a short essay, a limit of 750 words, discussing the candidate's interest in the AUI internship program, special skills, talents, or experiences, and any special interest or experience in Morocco and the broader MENA region. The application deadline is May 5th, 2024, at 23:59 GMT+1. The committee will use the cover letters and other application materials to match qualified applicants with the office best suited for their interests, experiences, and skills. Interviews will be conducted via Microsoft Teams, and results will be communicated by the week of May 20th, 2024, at the latest. For more in

Response(response="The PiP program requires applicants to be recent graduates of an American-style liberal arts model. They must speak English fluently, and proficiency in French or Arabic is encouraged but not necessary. Applicants must have recently graduated with an undergraduate degree within the last two academic years. The application process involves filling out an application form and submitting a short essay, a limit of 750 words, discussing the candidate's interest in the AUI internship program, special skills, talents, or experiences, and any special interest or experience in Morocco and the broader MENA region. The application deadline is May 5th, 2024, at 23:59 GMT+1. The committee will use the cover letters and other application materials to match qualified applicants with the office best suited for their interests, experiences, and skills. Interviews will be conducted via Microsoft Teams, and results will be communicated by the week of May 20th, 2024, at the latest. For 

### Model: msmarco with chunk size: 250

In [9]:
# Run msmarco with chunk size 250
model_name = "msmarco"
model_path = embedding_models[model_name]
cfg = chunk_configs[1]  # {"chunk_size": 250, "overlap": 25}
run_experiment(model_name, model_path, cfg["chunk_size"], cfg["overlap"], documents)


🔍 Testing with model: msmarco
📏 Chunk size: 250, Overlap: 25
🧠 Response:
The PiP program requires applicants to be recent graduates of the American style liberal arts model, speak English fluently, and have recently graduated with an undergraduate degree within the last two academic years. Proficiency in French or Arabic is encouraged but not necessary. Applicants should also submit an application form, a short essay, an updated CV, and a letter specifying their top three internship choices ranked in order of preference.
⏱️ Response Time: 6.39 seconds
📊 Relevance Score: 0.2888
🧩 Chunks Retrieved: 2
🔤 Token Count: 70
🧠 Response:
The PiP program requires applicants to be recent graduates of the American style liberal arts model, speak English fluently, and have recently graduated with an undergraduate degree within the last two academic years. Proficiency in French or Arabic is encouraged but not necessary. Applicants should also submit an application form, a short essay, an updated CV,

Response(response='The PiP program requires applicants to be recent graduates of the American style liberal arts model, speak English fluently, and have recently graduated with an undergraduate degree within the last two academic years. Proficiency in French or Arabic is encouraged but not necessary. Applicants should also submit an application form, a short essay, an updated CV, and a letter specifying their top three internship choices ranked in order of preference.', source_nodes=[NodeWithScore(node=TextNode(id_='d4b27ef9-917c-4aa4-985f-9d3ccde87e51', embedding=None, metadata={'page_label': '1', 'file_name': 'PiP 24-25 Program Requirements.pdf', 'file_path': '/home/barneh/Rag-Based-LLM_AUIChat/raw/PiP 24-25 Program Requirements.pdf', 'file_type': 'application/pdf', 'file_size': 166707, 'creation_date': '2025-03-23', 'last_modified_date': '2025-03-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], e

### Model: msmarco with chunk size: 100

In [10]:
# Run msmarco with chunk size 100
model_name = "msmarco"
model_path = embedding_models[model_name]
cfg = chunk_configs[2]  # {"chunk_size": 100, "overlap": 0}
run_experiment(model_name, model_path, cfg["chunk_size"], cfg["overlap"], documents)


🔍 Testing with model: msmarco
📏 Chunk size: 100, Overlap: 0
🧠 Response:
The PiP program requires applicants to submit a cover letter, resume, and two letters of recommendation from faculty members, administrators, or work supervisors. The recommendation letters should highlight the student's general qualifications, Middle Eastern or African experience or interest, and their ability to serve in the specific internships. The committee will use these materials to match qualified applicants with the office that best suits their interests, experiences, and skills. After shortlisting candidates, interviews will be conducted via Microsoft Teams, and the final decisions will be made based on these interviews.
⏱️ Response Time: 7.78 seconds
📊 Relevance Score: 0.1334
🧩 Chunks Retrieved: 2
🔤 Token Count: 90
🧠 Response:
The PiP program requires applicants to submit a cover letter, resume, and two letters of recommendation from faculty members, administrators, or work supervisors. The recommendati

Response(response="The PiP program requires applicants to submit a cover letter, resume, and two letters of recommendation from faculty members, administrators, or work supervisors. The recommendation letters should highlight the student's general qualifications, Middle Eastern or African experience or interest, and their ability to serve in the specific internships. The committee will use these materials to match qualified applicants with the office that best suits their interests, experiences, and skills. After shortlisting candidates, interviews will be conducted via Microsoft Teams, and the final decisions will be made based on these interviews.", source_nodes=[NodeWithScore(node=TextNode(id_='18b5dc8c-55c2-4191-88f2-1dbb30b24015', embedding=None, metadata={'page_label': '1', 'file_name': 'PiP 24-25 Program Requirements.pdf', 'file_path': '/home/barneh/Rag-Based-LLM_AUIChat/raw/PiP 24-25 Program Requirements.pdf', 'file_type': 'application/pdf', 'file_size': 166707, 'creation_date'

### Model: miniLM with chunk size: 400

In [11]:
# Run miniLM with chunk size 400
model_name = "miniLM"
model_path = embedding_models[model_name]
cfg = chunk_configs[0]  # {"chunk_size": 400, "overlap": 50}
run_experiment(model_name, model_path, cfg["chunk_size"], cfg["overlap"], documents)


🔍 Testing with model: miniLM
📏 Chunk size: 400, Overlap: 50


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🧠 Response:
The requirements for the PiP program include being a recent graduate, familiar with the American style liberal arts model, fluent in English, and having recently graduated with an undergraduate degree within the last two academic years. Proficiency in French or Arabic is encouraged but not necessary. The application process involves filling out an application form and submitting a short essay, limited to 750 words, discussing the candidate's interest in the AUI internship program, special skills, talents, or experiences, and any special interest or experience in Morocco and the broader MENA region. For applicants whose first language is not English, taking the TOEFL exam for English competency is required.
⏱️ Response Time: 9.42 seconds
📊 Relevance Score: 0.4539
🧩 Chunks Retrieved: 2
🔤 Token Count: 108


Response(response="The requirements for the PiP program include being a recent graduate, familiar with the American style liberal arts model, fluent in English, and having recently graduated with an undergraduate degree within the last two academic years. Proficiency in French or Arabic is encouraged but not necessary. The application process involves filling out an application form and submitting a short essay, limited to 750 words, discussing the candidate's interest in the AUI internship program, special skills, talents, or experiences, and any special interest or experience in Morocco and the broader MENA region. For applicants whose first language is not English, taking the TOEFL exam for English competency is required.", source_nodes=[NodeWithScore(node=TextNode(id_='781eab35-475d-4549-8db6-47165fcf35b7', embedding=None, metadata={'page_label': '1', 'file_name': 'PiP 24-25 Program Requirements.pdf', 'file_path': '/home/barneh/Rag-Based-LLM_AUIChat/raw/PiP 24-25 Program Requiremen

### Model: miniLM with chunk size: 250

In [12]:
# Run miniLM with chunk size 250
model_name = "miniLM"
model_path = embedding_models[model_name]
cfg = chunk_configs[1]  # {"chunk_size": 250, "overlap": 25}
run_experiment(model_name, model_path, cfg["chunk_size"], cfg["overlap"], documents)


🔍 Testing with model: miniLM
📏 Chunk size: 250, Overlap: 25
🧠 Response:
The PiP program requires applicants to be recent graduates of the American style liberal arts model, speak English fluently, and have recently graduated with an undergraduate degree within the last two academic years. Proficiency in French or Arabic is encouraged but not necessary. Applicants should note that priority will be given to non-Moroccan nationals, and students who have completed their entire undergraduate education in Morocco are not eligible to apply. The contact information for the program is available, with Dr. Cherif Bel Fekih as the Director and Dr. Sanaa Mokaddeme as the Manager of the Office of the President.
⏱️ Response Time: 8.40 seconds
📊 Relevance Score: 0.3219
🧩 Chunks Retrieved: 2
🔤 Token Count: 99
🧠 Response:
The PiP program requires applicants to be recent graduates of the American style liberal arts model, speak English fluently, and have recently graduated with an undergraduate degree w

Response(response='The PiP program requires applicants to be recent graduates of the American style liberal arts model, speak English fluently, and have recently graduated with an undergraduate degree within the last two academic years. Proficiency in French or Arabic is encouraged but not necessary. Applicants should note that priority will be given to non-Moroccan nationals, and students who have completed their entire undergraduate education in Morocco are not eligible to apply. The contact information for the program is available, with Dr. Cherif Bel Fekih as the Director and Dr. Sanaa Mokaddeme as the Manager of the Office of the President.', source_nodes=[NodeWithScore(node=TextNode(id_='0030250c-e43b-4333-9ba2-8852b46c79c7', embedding=None, metadata={'page_label': '1', 'file_name': 'PiP 24-25 Program Requirements.pdf', 'file_path': '/home/barneh/Rag-Based-LLM_AUIChat/raw/PiP 24-25 Program Requirements.pdf', 'file_type': 'application/pdf', 'file_size': 166707, 'creation_date': '2

### Model: miniLM with chunk size: 100

In [14]:
# Run miniLM with chunk size 100
model_name = "miniLM"
model_path = embedding_models[model_name]
cfg = chunk_configs[2]  # {"chunk_size": 100, "overlap": 0}
run_experiment(model_name, model_path, cfg["chunk_size"], cfg["overlap"], documents)


🔍 Testing with model: miniLM
📏 Chunk size: 100, Overlap: 0


HfHubHTTPError: 402 Client Error: Payment Required for url: https://router.huggingface.co/hf-inference/models/mistralai/Mistral-7B-Instruct-v0.3/v1/chat/completions (Request ID: Root=1-680554f0-7ad44f590b4b6f91353bd420;2ead2a9f-adc5-4f4a-8332-6ebf23cf7459)

You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits.

### Model: bge with chunk size: 400

In [None]:
# Run bge with chunk size 400
model_name = "bge"
model_path = embedding_models[model_name]
cfg = chunk_configs[0]  # {"chunk_size": 400, "overlap": 50}
run_experiment(model_name, model_path, cfg["chunk_size"], cfg["overlap"], documents)

### Model: bge with chunk size: 250

In [None]:
# Run bge with chunk size 250
model_name = "bge"
model_path = embedding_models[model_name]
cfg = chunk_configs[1]  # {"chunk_size": 250, "overlap": 25}
run_experiment(model_name, model_path, cfg["chunk_size"], cfg["overlap"], documents)

### Model: bge with chunk size: 100

In [None]:
# Run bge with chunk size 100
model_name = "bge"
model_path = embedding_models[model_name]
cfg = chunk_configs[2]  # {"chunk_size": 100, "overlap": 0}
run_experiment(model_name, model_path, cfg["chunk_size"], cfg["overlap"], documents)

### 8️⃣ Result Analysis

Run this cell to check all experiments in MLflow UI

In [None]:
# for model_name, model_path in embedding_models.items():
#     print(f"\n🔍 Testing with model: {model_name}")
#     embed_model = HuggingFaceEmbedding(model_name=model_path)
#     Settings.embed_model = embed_model
    
#     for cfg in chunk_configs:
#          with mlflow.start_run(run_name=f"{model_name}_{cfg['chunk_size']}"):

#             # log parameters
#             mlflow.log_param("embedding_model", model_name)
#             mlflow.log_param("chunk_size", cfg["chunk_size"])
#             mlflow.log_param("overlap", cfg["overlap"])

#             print(f"\n📏 Chunk size: {cfg['chunk_size']}, Overlap: {cfg['overlap']}")
#             splitter = SentenceSplitter(chunk_size=cfg["chunk_size"], chunk_overlap=cfg["overlap"])
#             nodes = splitter.get_nodes_from_documents(documents)
        
#             for node in nodes:
#                 node.text = clean_text(node.text)

#             storage_context = StorageContext.from_defaults()
#             index = VectorStoreIndex(nodes, storage_context=storage_context)
        
#             query_engine = index.as_query_engine(
#                 llm=HuggingFaceInferenceAPI(
#                     model_name="mistralai/Mistral-7B-Instruct-v0.3",
#                     token="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"  # your token
#             )
#         )

#             query = "What are the requirements for the PiP program?"
#             response = query_engine.query(query)
#              # log response as artifact or metric
#             mlflow.log_text(str(response), "response.txt")

#             print("🧠 Response:")
#             print(response)


### 9️⃣ Viewing Results in MLflow Dashboard

To visualize all experiment results in the MLflow dashboard, run the command below. This will start the MLflow UI server which you can access in your browser.

In [15]:
# Run this cell to start the MLflow dashboard server
# The dashboard will be available at http://127.0.0.1:5000
!mlflow ui --backend-store-uri file:./mlruns

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2025-04-20 16:18:29 -0400] [34586] [INFO] Starting gunicorn 23.0.0
[2025-04-20 16:18:29 -0400] [34586] [INFO] Listening at: http://127.0.0.1:5000 (34586)
[2025-04-20 16:18:29 -0400] [34586] [INFO] Using worker: sync
[2025-04-20 16:18:29 -0400] [34587] [INFO] Booting worker with pid: 34587
[2025-04-20 16:18:29 -0400] [34588] [INFO] Booting worker with pid: 34588
[2025-04-20 16:18:29 -0400] [34586] [INFO] Listening at: http://127.0.0.1:5000 (34586)
[2025-04-20 16:18:29 -0400] [34586] [INFO] Using worker: sync
[2025-04-20 16:18:29 -0400] [34587] [INFO] Booting worker with pid: 34587
[2025-04-20 16:18:29 -0400] [34588] [INFO] Booting worker with pid: 34588
[2025-04-20 16:18:29 -0400] [34589] [INFO] Booting worker with pid: 34589
[2025-04-20 16:18:29 -0400] [34590] [INFO] Booting worker with pid: 34590
[2025-04-20 16:18:29 -0400] [34589] [INFO] Booting worker with pid: 34589
[2025-04-20 16:18:29 -0400] [34590] [INFO] Booting worker with pid: 34590
[2025-04-20 16:19:31 -0400] [34586] [INFO]

### MLflow Dashboard Guide

After starting the MLflow server with the command above, you can access the dashboard by:

1. Opening your browser and navigating to http://127.0.0.1:5000
2. Clicking on the "AUIChat-Embedding-Experiments" experiment
3. Viewing all runs comparing different embedding models and chunk sizes
4. For each run, you can:
   - See parameters (embedding model, chunk size, overlap)
   - View the responses in the "Artifacts" section
   - Compare runs side-by-side by selecting multiple runs

This will help you analyze which combination of embedding model and chunking strategy produces the best results for your specific queries.

## 🧪 Prompt Engineering Experiments 🧪

In this section, we'll test different prompt styles with multiple LLM models and track performance metrics using MLflow.

In [16]:
import time
import json
import mlflow
from huggingface_hub import InferenceClient
from tqdm.notebook import tqdm

# Make sure we're still tracking with MLflow
mlflow.set_experiment("AUIChat-Prompt-Experiments")

2025/04/20 16:52:19 INFO mlflow.tracking.fluent: Experiment with name 'AUIChat-Prompt-Experiments' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///home/barneh/Rag-Based-LLM_AUIChat/rag_based_llm_auichat/notebooks/mlruns/974675111601560053', creation_time=1745182339008, experiment_id='974675111601560053', last_update_time=1745182339008, lifecycle_stage='active', name='AUIChat-Prompt-Experiments', tags={}>

### Define Models for Testing

We'll compare Mistral-7B-Instruct-v0.3 with SmolLM models

In [17]:
# Define the models to test
models = {
    "mistral": "mistralai/Mistral-7B-Instruct-v0.3",
    "smollm_135m": "HuggingFaceTB/SmolLM-135M-Instruct",
    "smollm_360m": "HuggingFaceTB/SmolLM-360M-Instruct"
}

# Initialize the HuggingFace Inference API clients for each model
clients = {}
for model_name, model_id in models.items():
    clients[model_name] = InferenceClient(
        model=model_id,
        token="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"  # using your existing token
    )

### Define Test Questions

We'll use questions related to AUI admissions and counseling since that's the domain of your RAG application.

In [18]:
test_questions = [
    "What are the requirements for the PiP program?",
    "How do I apply to AUI as a transfer student?",
    "What counseling services are available at AUI?",
    "What is the admission process for international students?",
    "Tell me about undergraduate admission for visiting students."
]

### Define Prompt Templates

We'll test 7 different prompt styles to evaluate their effectiveness.

In [19]:
# Define different prompt templates with various styles and approaches
prompt_templates = {
    "basic": "{question}",
    
    "polite": "Could you please help me with this question? {question} Thank you!",
    
    "role_based": "You are an AI assistant for Al Akhawayn University in Ifrane. Please answer the following question accurately: {question}",
    
    "structured": "Question: {question}\nAnswer: ",
    
    "contextual": "I'm a student looking for information about Al Akhawayn University in Morocco. {question} Please provide detailed information.",
    
    "chain_of_thought": "I need to answer this question: {question}\n\nLet me think step by step about how to provide the most accurate and helpful response.",
    
    "concise": "Answer this question briefly and directly: {question} Use no more than 3 sentences."
}

### Helper Function for Running Prompt Experiments

In [20]:
def run_prompt_experiment(model_name, client, prompt_style, template, question, max_new_tokens=512, temperature=0.7):
    """Run an experiment with a specific model, prompt style and question"""
    # Format the prompt using the template
    formatted_prompt = template.format(question=question)
    
    # Start the timer
    start_time = time.time()
    
    # Run the inference
    response = client.text_generation(
        prompt=formatted_prompt,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        return_full_text=False
    )
    
    # Calculate inference time
    inference_time = time.time() - start_time
    
    # Calculate response length metrics
    response_chars = len(response)
    response_words = len(response.split())
    response_sentences = len([s for s in response.split('.') if s.strip()])
    chars_per_second = response_chars / inference_time if inference_time > 0 else 0
    
    # Log with MLflow
    with mlflow.start_run(run_name=f"{model_name}_{prompt_style}"):
        # Log parameters
        mlflow.log_param("model", model_name)
        mlflow.log_param("prompt_style", prompt_style)
        mlflow.log_param("question", question)
        mlflow.log_param("template", template)
        mlflow.log_param("temperature", temperature)
        mlflow.log_param("max_new_tokens", max_new_tokens)
        
        # Log metrics
        mlflow.log_metric("inference_time_seconds", inference_time)
        mlflow.log_metric("response_length_chars", response_chars)
        mlflow.log_metric("response_length_words", response_words)
        mlflow.log_metric("response_sentences", response_sentences)
        mlflow.log_metric("chars_per_second", chars_per_second)
        
        # Log the prompt and response as artifacts
        mlflow.log_text(formatted_prompt, "prompt.txt")
        mlflow.log_text(response, "response.txt")
    
    # Return results for display
    return {
        "model": model_name,
        "prompt_style": prompt_style,
        "question": question,
        "response": response,
        "inference_time": inference_time,
        "chars_per_second": chars_per_second,
        "response_length": {
            "chars": response_chars,
            "words": response_words,
            "sentences": response_sentences
        }
    }

### Run Single Model Experiment

Let's run an experiment with one model and one prompt style to make sure everything works.

In [21]:
# Run a test with Mistral and the basic prompt style
test_result = run_prompt_experiment(
    model_name="mistral",
    client=clients["mistral"],
    prompt_style="basic",
    template=prompt_templates["basic"],
    question=test_questions[0]
)

print(f"Model: {test_result['model']}")
print(f"Prompt Style: {test_result['prompt_style']}")
print(f"Question: {test_result['question']}")
print(f"Inference Time: {test_result['inference_time']:.2f} seconds")
print(f"Chars/Second: {test_result['chars_per_second']:.2f}")
print(f"Response Length: {test_result['response_length']}")
print("\nResponse:")
print(test_result['response'])

HfHubHTTPError: 402 Client Error: Payment Required for url: https://router.huggingface.co/hf-inference/models/mistralai/Mistral-7B-Instruct-v0.3 (Request ID: Root=1-68055e99-3605cfdf5cc4478e19ce00dc;913be8de-1828-4999-80be-b7e0b2604d11)

You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits.

### Run Comprehensive Experiments

Now let's run experiments with all combinations of models, prompt styles, and questions.
We'll organize them by model to avoid overwhelming the notebook output.

#### Mistral-7B-Instruct-v0.3 Experiments

In [None]:
# Run experiments for Mistral model - using just the first question for brevity
model_name = "mistral"
question = test_questions[0]  # Using just the first question for all prompt styles

mistral_results = []
for prompt_style, template in tqdm(prompt_templates.items(), desc="Testing Mistral with different prompts"):
    result = run_prompt_experiment(
        model_name=model_name,
        client=clients[model_name],
        prompt_style=prompt_style,
        template=template,
        question=question
    )
    mistral_results.append(result)
    
    # Print brief summary
    print(f"\n{prompt_style.upper()} - Time: {result['inference_time']:.2f}s, Chars: {result['response_length']['chars']}")

#### SmolLM-135M-Instruct Experiments

In [None]:
# Run experiments for SmolLM-135M model
model_name = "smollm_135m"
question = test_questions[0]  # Using just the first question for all prompt styles

smollm_135m_results = []
for prompt_style, template in tqdm(prompt_templates.items(), desc="Testing SmolLM-135M with different prompts"):
    result = run_prompt_experiment(
        model_name=model_name,
        client=clients[model_name],
        prompt_style=prompt_style,
        template=template,
        question=question
    )
    smollm_135m_results.append(result)
    
    # Print brief summary
    print(f"\n{prompt_style.upper()} - Time: {result['inference_time']:.2f}s, Chars: {result['response_length']['chars']}")

#### SmolLM-360M-Instruct Experiments

In [None]:
# Run experiments for SmolLM-360M model
model_name = "smollm_360m"
question = test_questions[0]  # Using just the first question for all prompt styles

smollm_360m_results = []
for prompt_style, template in tqdm(prompt_templates.items(), desc="Testing SmolLM-360M with different prompts"):
    result = run_prompt_experiment(
        model_name=model_name,
        client=clients[model_name],
        prompt_style=prompt_style,
        template=template,
        question=question
    )
    smollm_360m_results.append(result)
    
    # Print brief summary
    print(f"\n{prompt_style.upper()} - Time: {result['inference_time']:.2f}s, Chars: {result['response_length']['chars']}")

### Extended Model Comparison with Best Prompt Style

Let's identify the best prompt style based on the experiments above and test it across all models and all questions.

In [None]:
# Function to determine the best prompt style based on response quality and speed
def analyze_prompt_results(results_list):
    # This is a simplified analysis - in a real scenario, you might want to evaluate response quality too
    best_style = None
    best_score = 0
    
    for result in results_list:
        # Create a score based on response length and speed
        # Higher words and sentences are good, faster response time is good
        score = (result['response_length']['words'] * 0.5 + 
                result['response_length']['sentences'] * 2 - 
                result['inference_time'] * 0.2)
        
        if best_style is None or score > best_score:
            best_style = result['prompt_style']
            best_score = score
            
    return best_style, best_score

# Analyze results from each model
mistral_best, mistral_score = analyze_prompt_results(mistral_results)
smollm_135m_best, smollm_135m_score = analyze_prompt_results(smollm_135m_results)
smollm_360m_best, smollm_360m_score = analyze_prompt_results(smollm_360m_results)

print(f"Best prompt style for Mistral: {mistral_best} (score: {mistral_score:.2f})")
print(f"Best prompt style for SmolLM-135M: {smollm_135m_best} (score: {smollm_135m_score:.2f})")
print(f"Best prompt style for SmolLM-360M: {smollm_360m_best} (score: {smollm_360m_score:.2f})")

In [None]:
# Now run comprehensive tests with all questions using the best prompt style for each model
comprehensive_results = []

# For each model, use its optimal prompt style
model_prompt_pairs = [
    ("mistral", mistral_best),
    ("smollm_135m", smollm_135m_best),
    ("smollm_360m", smollm_360m_best)
]

# Test each model with all questions
for model_name, prompt_style in model_prompt_pairs:
    print(f"\nTesting {model_name} with {prompt_style} prompt style across all questions:")
    for question in tqdm(test_questions):
        result = run_prompt_experiment(
            model_name=model_name,
            client=clients[model_name],
            prompt_style=prompt_style,
            template=prompt_templates[prompt_style],
            question=question
        )
        comprehensive_results.append(result)
        print(f"Question: {question[:30]}... - Time: {result['inference_time']:.2f}s")

### Analyze and Visualize Results

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Convert results to DataFrame for easier analysis
def results_to_df(results_list):
    data = []
    for r in results_list:
        data.append({
            'model': r['model'],
            'prompt_style': r['prompt_style'],
            'question': r['question'][:30] + '...',  # Truncate for readability
            'inference_time': r['inference_time'],
            'chars_per_second': r['chars_per_second'],
            'chars': r['response_length']['chars'],
            'words': r['response_length']['words'],
            'sentences': r['response_length']['sentences']
        })
    return pd.DataFrame(data)

# Create DataFrames from our experiment results
all_results_df = pd.concat([
    results_to_df(mistral_results),
    results_to_df(smollm_135m_results),
    results_to_df(smollm_360m_results)
])

comprehensive_df = results_to_df(comprehensive_results)

# Display basic statistics
print("Summary statistics by model:")
display(all_results_df.groupby('model').agg({
    'inference_time': ['mean', 'min', 'max'],
    'chars_per_second': ['mean', 'min', 'max'],
    'words': ['mean', 'min', 'max']
}))

In [None]:
# Create visualizations of the results
plt.figure(figsize=(10, 6))
sns.barplot(x='model', y='inference_time', data=all_results_df)
plt.title('Average Inference Time by Model')
plt.ylabel('Time (seconds)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 6))
sns.barplot(x='prompt_style', y='inference_time', hue='model', data=all_results_df)
plt.title('Inference Time by Prompt Style and Model')
plt.ylabel('Time (seconds)')
plt.xticks(rotation=45)
plt.legend(title='Model')
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 6))
sns.scatterplot(x='words', y='inference_time', hue='model', data=all_results_df)
plt.title('Inference Time vs. Response Length')
plt.xlabel('Response Length (words)')
plt.ylabel('Inference Time (seconds)')
plt.legend(title='Model')
plt.tight_layout()
plt.show()

### Comprehensive Model Comparison

In [None]:
# Analyze comprehensive test results
plt.figure(figsize=(12, 6))
sns.boxplot(x='model', y='inference_time', data=comprehensive_df)
plt.title('Inference Time Distribution by Model')
plt.ylabel('Time (seconds)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Calculate efficiency metrics (words per second)
comprehensive_df['words_per_second'] = comprehensive_df['words'] / comprehensive_df['inference_time']

plt.figure(figsize=(12, 6))
sns.barplot(x='model', y='words_per_second', data=comprehensive_df)
plt.title('Efficiency (Words per Second) by Model')
plt.ylabel('Words per Second')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Display a table with average metrics by model
model_metrics = comprehensive_df.groupby('model').agg({
    'inference_time': 'mean',
    'words_per_second': 'mean',
    'chars': 'mean',
    'words': 'mean',
    'sentences': 'mean'
}).reset_index()

display(model_metrics)

### Conclusions from Prompt Engineering Experiments

Based on our prompt engineering experiments, we can draw the following conclusions:

1. **Model Performance**:
   - SmolLM models demonstrate significantly faster inference times than Mistral-7B
   - The quality and detail of responses vary by model size (with Mistral generally providing more comprehensive answers)

2. **Prompt Engineering Impact**:
   - Different prompt styles affect both the response quality and inference time
   - Role-based and contextual prompts tend to produce more comprehensive responses
   - Concise prompts result in faster inference times but may sacrifice detail

3. **Efficiency Metrics**:
   - When considering words per second, SmolLM-135M is most efficient
   - Mistral produces longer, more detailed responses but at a higher time cost

4. **Recommendations**:
   - For speed-critical applications, SmolLM models are preferable
   - For applications requiring detailed, nuanced responses, Mistral-7B remains superior
   - The optimal prompt style depends on both the model and the specific use case

These insights can help optimize the RAG application based on specific requirements around response time, response quality, and computational resource constraints.

### View Prompt Engineering Experiments in MLflow

To view all the prompt engineering experiments in the MLflow dashboard, run the command below:

In [None]:
# Launch MLflow UI to view all experiments
!mlflow ui --backend-store-uri file:./mlruns