# Mistral

previous model (pm)

1) Setting Up Libraries

Run this cell first, and only once.

In [6]:
!pip install -q numpy
!pip install -q torch
!pip install -q tqdm
!pip install -q PyPDF2
!pip install -q langchain_text_splitters
!pip install -q sentence_transformers
!pip install -q hnswlib
!pip install -q transformers
!pip install -q pytesseract

In [8]:
# 1.  Standard libraries 
import os
import re
import pickle
import warnings
from typing import List, Dict 

# 2. Third Party libraries 
import numpy as np 
import torch
from tqdm.auto import tqdm
from PyPDF2 import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import hnswlib
from transformers import AutoModelForCausalLM, AutoTokenizer
import pytesseract

# Point pytesseract at a fixed Tesseract-OCR executable path on this machine.
# NOTE(review): pytesseract is configured here but not called in any of the
# cells visible in this notebook — confirm whether OCR is still needed.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Sanity message: reaching this line means all imports above succeeded.
print("All libraries correctly installed and imported")

All libraries correctly installed and imported


2) Class Config

In [9]:
class Config:
    """Central settings for the PDF -> chunks -> embeddings -> retrieval pipeline.

    The two directory settings default to the original machine-specific
    locations but can be overridden without editing the notebook by setting
    the environment variables ``RAG_PDF_FOLDER`` and ``RAG_CACHE_DIR`` before
    this cell runs.
    """
    # Folder scanned for input PDFs, and folder holding the cached artifacts.
    PDF_FOLDER = os.environ.get(
        "RAG_PDF_FOLDER",
        r"C:\Users\gabri\Documents\Luiss\Luiss - Year Two\Advanced AI - LLM\Med M1\Medicinali Car",
    )
    CACHE_DIR = os.environ.get(
        "RAG_CACHE_DIR",
        r"C:\Users\gabri\Documents\Luiss\Luiss - Year Two\Advanced AI - LLM\Med M1\rag_cache",
    )

    # Cache file names (all stored under CACHE_DIR)
    EMBEDDINGS_FILE = os.path.join(CACHE_DIR, 'aifa_embeddings.npy')
    CHUNKS_FILE = os.path.join(CACHE_DIR, 'chunks.pkl')
    IDX_CACHE = os.path.join(CACHE_DIR, 'hnswlib_index.bin')

    # Embedding model and chunking settings
    EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
    CHUNK_SIZE = 500       # passed as chunk_size to the text splitter
    CHUNK_OVERLAP = 50     # passed as chunk_overlap to the text splitter

    # Retrieval settings
    TOP_K = 10                   # number of nearest neighbours requested per query
    SIMILARITY_THRESHOLD = 0.3   # minimum (1 - distance) for a chunk to be kept
    BATCH_SIZE = 32              # batch size used when encoding chunks
    VERBOSE = True

config = Config()

# Create the cache folder if it does not exist yet
os.makedirs(config.CACHE_DIR, exist_ok=True)

3) Text and PDF management


In [10]:
def clean_text(text: str) -> str:
    """Strip non-printable characters and normalise whitespace.

    Every whitespace character (tabs, carriage returns, non-breaking spaces,
    newlines, ...) is kept as a word separator and then runs of whitespace
    are collapsed into a single space, so words on either side of such a
    character are never fused together. Other non-printable characters
    (e.g. NUL or control codes) are removed.

    Args:
        text: Raw text, e.g. as extracted from a PDF page.

    Returns:
        The cleaned single-line string, or '' when `text` is empty or not a
        string.
    """
    if not text or not isinstance(text, str):
        return ''
    # Keep whitespace here: str.isprintable() is False for '\t', '\r' and
    # '\xa0', and dropping them outright would glue neighbouring words.
    kept = ''.join(c for c in text if c.isprintable() or c.isspace())
    # split()/join collapses whitespace runs and trims both ends.
    return ' '.join(kept.split())

def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract cleaned text from every page of a PDF.

    Pages whose extracted text is 20 characters or shorter (after stripping)
    are ignored. Each remaining page is passed through `clean_text` and the
    pages are joined with newlines.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page text, or '' if the file cannot be read at all
        (an error message is printed in that case).
    """
    try:
        reader = PdfReader(pdf_path)
        content = []
        for page in reader.pages:
            try:
                text = page.extract_text()
                if text and len(text.strip()) > 20:
                    cleaned = clean_text(text)
                    if cleaned:
                        content.append(cleaned)
            except Exception:
                # Best effort: a single unreadable page must not discard the
                # whole document. `Exception` (not a bare except) so that
                # KeyboardInterrupt / SystemExit still stop the run.
                continue
        return '\n'.join(content).strip()
    except Exception as e:
        print(f'‚ö†Ô∏è Error reading PDF {pdf_path}: {e}')
        return ''

def extract_and_chunk_all_pdfs(config) -> List[Dict]:
    """Extract and chunk every PDF found in `config.PDF_FOLDER`.

    Files are processed in sorted name order so that the position of each
    chunk in the returned list is stable between runs; the cached vector
    index identifies chunks by that position.

    Args:
        config: Object exposing PDF_FOLDER, CHUNK_SIZE and CHUNK_OVERLAP.

    Returns:
        A list of dicts with keys 'text', 'document' (PDF file name) and
        'chunk_id' ('<file name>_<chunk number>'). Empty if the folder does
        not exist or no PDF yields more than 100 characters of text.
    """
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=config.CHUNK_SIZE,
        chunk_overlap=config.CHUNK_OVERLAP
    )

    # isdir (not exists): a plain file at this path cannot be listed either.
    if not os.path.isdir(config.PDF_FOLDER):
        print(f"‚ùå Error: The folder {config.PDF_FOLDER} does not exist.")
        return []

    # os.listdir gives no ordering guarantee, so sort explicitly; match the
    # extension case-insensitively so '.PDF' files are not silently skipped.
    pdf_files = sorted(
        f for f in os.listdir(config.PDF_FOLDER) if f.lower().endswith('.pdf')
    )
    print(f'üìö Found {len(pdf_files)} PDFs to process')
    all_chunks = []

    for pdf_file in tqdm(pdf_files, desc='üìÑ Processing PDFs'):
        file_path = os.path.join(config.PDF_FOLDER, pdf_file)
        raw_text = extract_text_from_pdf(file_path)

        # Skip documents with (almost) no extractable text.
        if raw_text and len(raw_text.strip()) > 100:
            text_chunks = chunker.split_text(raw_text)
            for idx, chunk_text in enumerate(text_chunks):
                all_chunks.append({
                    'text': chunk_text,
                    'document': pdf_file,
                    'chunk_id': f'{pdf_file}_{idx}'
                })

    return all_chunks

3) Embeddings and Index Management

In [None]:
def load_or_create_index(chunks):
    """Load the cached HNSWlib index, or build and cache a new one.

    The cache (embeddings file + index file from the module-level `config`)
    is only reused when the number of cached vectors equals `len(chunks)`.
    Index labels are chunk positions, so a cache built from a different set
    of chunks would return wrong or out-of-range results; in that case the
    embeddings and index are rebuilt and the cache is overwritten.

    Args:
        chunks: List of chunk dicts, each with a 'text' key.

    Returns:
        Tuple `(index, embedder)`: the HNSWlib cosine index and the
        SentenceTransformer model used to encode the chunks.

    Raises:
        ValueError: If a new index must be built but `chunks` is empty.
    """

    # Load embedding model (on CPU for compatibility)
    print(f'üì• Loading embedding model: {config.EMBEDDING_MODEL}')
    embedder = SentenceTransformer(config.EMBEDDING_MODEL)
    embedder.to('cpu')

    index = None
    if os.path.exists(config.EMBEDDINGS_FILE) and os.path.exists(config.IDX_CACHE):
        print('üì¶ Loading index from cache...')
        embeddings = np.load(config.EMBEDDINGS_FILE)

        if embeddings.ndim == 2 and embeddings.shape[0] == len(chunks):
            # Initialize an empty index, then load the saved one into it
            dim = embeddings.shape[1]
            index = hnswlib.Index(space='cosine', dim=dim)
            index.load_index(config.IDX_CACHE, max_elements=embeddings.shape[0])
        else:
            print(f'Cached embeddings do not match the {len(chunks)} current chunks; rebuilding.')

    if index is None:
        if not chunks:
            raise ValueError('Cannot build an index from an empty list of chunks.')

        print('üîÑ Creating new embeddings...')
        documents = [chunk['text'] for chunk in chunks]
        embeddings = embedder.encode(documents, show_progress_bar=True, batch_size=config.BATCH_SIZE, device='cpu')

        # Create HNSWlib index; labels 0..N-1 are positions in `chunks`
        dim = embeddings.shape[1]
        num_elements = len(documents)
        index = hnswlib.Index(space='cosine', dim=dim)
        index.init_index(max_elements=num_elements, ef_construction=200, M=16)
        index.add_items(embeddings, np.arange(num_elements))

        # Save
        np.save(config.EMBEDDINGS_FILE, embeddings)
        index.save_index(config.IDX_CACHE)

    return index, embedder

4) LLM Management (Mistral 7B)

In [12]:
def load_llm():
    """Load the 4-bit quantized Mistral 7B Instruct model and its tokenizer.

    Returns:
        Tuple `(model, tokenizer)` for "mistralai/Mistral-7B-Instruct-v0.2".
    """
    # Imported here because the notebook's import cell does not bring
    # BitsAndBytesConfig into scope; without it this function raises NameError.
    from transformers import BitsAndBytesConfig

    print('Loading Mistral 7B...')

    # 4-bit NF4 quantization with double quantization, computing in float16.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    model_name = "mistralai/Mistral-7B-Instruct-v0.2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",
        low_cpu_mem_usage=True
    )

    return model, tokenizer

def generate_response(model, tokenizer, prompt, max_tokens=512):
    """Produce the model's reply to a single user prompt.

    The prompt is wrapped as one user message, rendered with the tokenizer's
    chat template, and sampled from the model (temperature 0.7). Only the
    newly generated tokens are decoded.

    Args:
        model: Causal LM exposing `.device` and `.generate(...)`.
        tokenizer: Matching tokenizer with a chat template.
        prompt: User message text.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded reply with surrounding whitespace removed.
    """
    chat = [{"role": "user", "content": prompt}]
    templated = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    encoded = tokenizer(templated, return_tensors="pt").to(model.device)

    generation_kwargs = dict(
        max_new_tokens=max_tokens,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_kwargs)

    # The output sequence starts with the prompt tokens; keep only what follows.
    prompt_len = encoded['input_ids'].shape[1]
    reply_tokens = generated[0][prompt_len:]
    return tokenizer.decode(reply_tokens, skip_special_tokens=True).strip()

5) RAG System and Main

In [14]:
def retrieve_context(query, index, chunks, embedder):
    """Retrieve the chunks most similar to the query.

    Encodes `query`, asks the index for up to `config.TOP_K` nearest
    neighbours and keeps those whose similarity (1 - distance, the index
    being built with the cosine space) is at least
    `config.SIMILARITY_THRESHOLD`.

    Args:
        query: User question text.
        index: HNSWlib index whose labels are positions in `chunks`.
        chunks: List of chunk dicts the index was built from.
        embedder: Model used to encode the query.

    Returns:
        List of matching chunk dicts, in the order returned by the index;
        empty when nothing passes the threshold or `chunks` is empty.
    """
    # Never ask for more neighbours than there are items: hnswlib raises a
    # RuntimeError when k exceeds the number of indexed elements.
    k = min(config.TOP_K, len(chunks))
    if k <= 0:
        return []

    query_vec = embedder.encode([query], device='cpu')
    indices, distances = index.knn_query(query_vec, k=k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        similarity = 1.0 - dist
        if similarity >= config.SIMILARITY_THRESHOLD:
            results.append(chunks[int(idx)])

    return results

def rag_chat_system():
    """Run the interactive retrieval loop.

    Extracts and chunks the PDFs, loads or builds the vector index, then
    repeatedly reads a question from standard input and prints the source
    and the first 300 characters of the best-matching chunk. This function
    only performs retrieval; it does not call the language model.

    The loop ends when the user types "exit" or "quit", or when input is
    closed or interrupted (Ctrl-D / Ctrl-C). Blank input is ignored.
    Returns None; returns immediately if no chunks could be extracted.
    """

    chunks = extract_and_chunk_all_pdfs(config)
    if not chunks:
        return

    index, embedder = load_or_create_index(chunks)

    print("System ready! (Retrieval-only mode for testing)")

    print('\n' + '='*50)
    print('üí¨ CHATBOT MEDICO AIFA - Type "exit" to quit')
    print('='*50)

    while True:
        try:
            query = input('\nüôã You: ').strip()
        except (EOFError, KeyboardInterrupt):
            # Closed or interrupted input ends the session without a traceback.
            print()
            break

        if query.lower() in ['exit', 'quit']:
            break

        # An empty question has nothing to search for; prompt again.
        if not query:
            continue

        # Retrieval
        context_chunks = retrieve_context(query, index, chunks, embedder)

        if not context_chunks:
            print("ü§ñ Assistant: I couldn't find relevant information in the documents.")
            continue

        print(f"\n Found {len(context_chunks)} relevant documents.")

        # Results come back in the index's ranking order; show the first one.
        best_match = context_chunks[0]
        print(f"üìÑ Source: {best_match['document']}")
        print(f"üìù Content: {best_match['text'][:300]}...")

# Start the interactive retrieval loop when this code is run as the main program.
if __name__ == "__main__":
    rag_chat_system()

üìö Found 47 PDFs to process


üìÑ Processing PDFs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 47/47 [00:48<00:00,  1.03s/it]


üì• Loading embedding model: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
üì¶ Loading index from cache...
System ready! (Retrieval-only mode for testing)

üí¨ CHATBOT MEDICO AIFA - Type "exit" to quit

 Found 10 relevant documents.
üìÑ Source: FI_000219_012745.pdf
üìù Content: indesiderati 5. Come conservare Tachipirina 6. Contenuto della confezione e a ltre informazioni 1. Che cos‚Äô √® Tachipirina e a cosa serve Tachipirina √® uno sciroppo per uso orale contenente il principio attivo paracetamolo che agisce r iducendo la febbre (antipiretico) e alleviando il dolore (analges...

 Found 4 relevant documents.
üìÑ Source: FI_000219_012745.pdf
üìù Content: sospensione √® dotata di un tappo di sicurezza. Le istruzioni per l‚Äôapertura e la chiusur a sono di seguito riportate: Per aprire : Per chiudere : premere avvitare a fondo contemporaneamente premendo girare ‚û¢ Dopo aver svitato il tappo, spingendolo verso il basso e contemporaneamente girando verso si...
