In [1]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
import re
import mlflow






In [2]:
mlflow.set_tracking_uri("file:./mlruns")  # local directory
mlflow.set_experiment("AUIChat-Embedding-Experiments")


2025/03/23 18:33:30 INFO mlflow.tracking.fluent: Experiment with name 'AUIChat-Embedding-Experiments' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///home/barneh/Rag-Based-LLM_AUIChat/rag_based_llm_auichat/notebooks/mlruns/180508001129329925', creation_time=1742769210427, experiment_id='180508001129329925', last_update_time=1742769210427, lifecycle_stage='active', name='AUIChat-Embedding-Experiments', tags={}>

2️⃣ Utility Functions 🧹
Clean Text

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[^a-zA-Z0-9,.!?;:\'\"()\[\]\s]', '', text)
    return text


3️⃣ Define Embedding Models to Compare 📐

In [4]:
embedding_models = {
    "msmarco": "sentence-transformers/msmarco-distilbert-base-v4",
    "miniLM": "sentence-transformers/all-MiniLM-L6-v2",
    "bge": "BAAI/bge-small-en-v1.5"
}

4️⃣ Define Chunking Strategies ✂️

In [5]:
chunk_configs = [
    {"chunk_size": 400, "overlap": 50},
    {"chunk_size": 250, "overlap": 25},
    {"chunk_size": 100, "overlap": 0}
]

5️⃣ Load & Preprocess Documents 📄

In [7]:
documents = SimpleDirectoryReader("/home/barneh/Rag-Based-LLM_AUIChat/rag_based_llm_auichat/data/raw").load_data()

6️⃣ Run Experiments Loop 🧪

In [10]:
for model_name, model_path in embedding_models.items():
    print(f"\n🔍 Testing with model: {model_name}")
    embed_model = HuggingFaceEmbedding(model_name=model_path)
    Settings.embed_model = embed_model
    
    for cfg in chunk_configs:
         with mlflow.start_run(run_name=f"{model_name}_{cfg['chunk_size']}"):

            # log parameters
            mlflow.log_param("embedding_model", model_name)
            mlflow.log_param("chunk_size", cfg["chunk_size"])
            mlflow.log_param("overlap", cfg["overlap"])

            print(f"\n📏 Chunk size: {cfg['chunk_size']}, Overlap: {cfg['overlap']}")
            splitter = SentenceSplitter(chunk_size=cfg["chunk_size"], chunk_overlap=cfg["overlap"])
            nodes = splitter.get_nodes_from_documents(documents)
        
            for node in nodes:
                node.text = clean_text(node.text)

            storage_context = StorageContext.from_defaults()
            index = VectorStoreIndex(nodes, storage_context=storage_context)
        
            query_engine = index.as_query_engine(
                llm=HuggingFaceInferenceAPI(
                    model_name="mistralai/Mistral-7B-Instruct-v0.3",
                    token="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"  # your token
            )
        )

            query = "What are the requirements for the PiP program?"
            response = query_engine.query(query)
             # log response as artifact or metric
            mlflow.log_text(str(response), "response.txt")

            print("🧠 Response:")
            print(response)



🔍 Testing with model: msmarco

📏 Chunk size: 400, Overlap: 50
🧠 Response:
The PiP program requires applicants to be recent graduates of an American-style liberal arts model. They must speak English fluently and have recently graduated with an undergraduate degree within the last two academic years. Proficiency in French or Arabic is encouraged but not necessary. Applicants should submit an application form, a short essay, an updated CV, a letter to the president of AUI, and two letters of recommendation from faculty members, administrators, or work supervisors. The essay should discuss the candidate's interest in the AUI internship program, any special skills, talents, or experiences that would help them contribute to the positions and the AUI community, and any special interest or experience the candidate has in Morocco and the broader MENA region. The essay should be limited to 750 words. The letters of recommendation should be submitted directly to oireaui.ma.

📏 Chunk size: 250, O

No sentence-transformers model found with name sentence-transformers/all-MiniLM-L6-v2. Creating a new one with mean pooling.


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]


📏 Chunk size: 400, Overlap: 50


ConnectionError: (ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 8c122c2f-e7ef-4f35-9087-1160e20d4016)')