In [1]:
!pip install datasets
!pip install llama_index
!pip install sentence-transformers
!pip install datasets llama-index matplotlib numpy
!pip install --upgrade llama-index
!pip install datasets llama-index-core llama-index-embeddings-huggingface matplotlib numpy sentence-transformers

Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Using cached fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Using cached fsspec-2025.3.0-py3-none-any.whl (193 kB)
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.[0m[31m
[0mSuccessfully installed fsspec-2025.3.0


In [2]:
!pip install -U langchain-community
!pip install llama-index-llms-huggingface

Collecting packaging<25,>=23.2 (from langchain-core<1.0.0,>=0.3.56->langchain-community)
  Using cached packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Using cached packaging-24.2-py3-none-any.whl (65 kB)
Installing collected packages: packaging
  Attempting uninstall: packaging
    Found existing installation: packaging 25.0
    Uninstalling packaging-25.0:
      Successfully uninstalled packaging-25.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.5 which is incompatible.[0m[31m
[0mSuccessfully installed packaging-24.2


In [3]:
import datasets
from datasets import load_dataset
from llama_index.core import VectorStoreIndex, Document, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import matplotlib.pyplot as plt
import numpy as np
import os

In [4]:
# Configurer LlamaIndex pour utiliser un modèle d'embedding local
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Charger le jeu de données MNIST depuis Hugging Face
dataset = load_dataset("mnist")

# Préparer les données pour LlamaIndex
train_data = dataset['train']
test_data = dataset['test']



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# Créer des documents pour LlamaIndex basés sur les métadonnées
documents = []
for i, example in enumerate(train_data):
    label = example['label']
    # Créer un document avec le label comme contenu textuel
    doc = Document(
        text=f"Image of digit {label}",
        metadata={"index": i, "label": label}
    )
    documents.append(doc)

# Créer un index avec LlamaIndex
index = VectorStoreIndex.from_documents(documents)

# Explorer la structure des données
print(f"Nombre de documents indexés: {len(documents)}")
print(f"Nombre d'exemples d'entraînement: {len(train_data)}")
print(f"Nombre d'exemples de test: {len(test_data)}")



Nombre de documents indexés: 60000
Nombre d'exemples d'entraînement: 60000
Nombre d'exemples de test: 10000


In [6]:
# Fonction pour afficher une image MNIST
def display_image(example, index):
    image = np.array(example['image'])
    label = example['label']

    plt.figure(figsize=(4, 4))
    plt.imshow(image, cmap='gray')
    plt.title(f"Chiffre: {label}")
    plt.axis('off')
    plt.savefig(f'mnist_example_{index}.png')
    plt.close()

# Afficher les 5 premières images du jeu d'entraînement
for i in range(5):
    display_image(train_data[i], i)



In [7]:
# Prétraitement : normalisation des pixels
def normalize_images(example):
    example['image'] = np.array(example['image']) / 255.0
    return example

# Appliquer la normalisation au dataset
train_data_normalized = train_data.map(normalize_images)
test_data_normalized = test_data.map(normalize_images)



In [8]:
import os
import openai
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

openai.api_key = os.getenv("OPENAI_API_KEY")

In [13]:
# Vérifier les valeurs normalisées
from llama_index.llms.openai import OpenAI

llm = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

sample_image = np.array(train_data_normalized[0]['image'])
print(f"Valeurs min/max après normalisation: {sample_image.min():.2f}/{sample_image.max():.2f}")

# Exemple de filtrage avec LlamaIndex : rechercher les images du chiffre 5
query_engine = index.as_query_engine(llm=llm)
response = query_engine.query("Find images of digit 5")

Valeurs min/max après normalisation: 0.00/1.00


In [14]:
# Extraire les indices des documents correspondant au chiffre 5
fives_indices = []
for node in response.source_nodes:
    if node.metadata['label'] == 5:
        fives_indices.append(node.metadata['index'])

print(f"Nombre d'images du chiffre 5: {len(fives_indices)}")

# Sauvegarder les indices des images du chiffre 5 localement (optionnel)
with open("mnist_fives_only.txt", "w") as f:
    for idx in fives_indices:
        f.write(f"{idx}\n")

Nombre d'images du chiffre 5: 2
