#### Imports

In [27]:
from dotenv import load_dotenv
import os
from llama_index.llms.mistralai import MistralAI
from llama_index.embeddings.mistralai import MistralAIEmbedding
from pathlib import Path
from llama_index.core import SimpleDirectoryReader, DocumentSummaryIndex
from llama_index.core.node_parser import SentenceSplitter 
from llama_index.core.ingestion import IngestionPipeline
import pandas as pd
import numpy as np
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import asyncio
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio
import pickle

#### Keys

In [28]:
load_dotenv()
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")

#### Open-source models (Mistral)

In [45]:
# LLM 
llm = MistralAI(api_key=MISTRAL_API_KEY, model="mistral-large-latest")

# # Embedings model
embed = MistralAI(api_key=MISTRAL_API_KEY, model="mistral-embed")

# Instruct model
instruct = MistralAI(api_key=MISTRAL_API_KEY, model="codestral-latest")

#### Data ingestion

In [30]:
# Custom class for documents
# nodes creations expects get_content method

class Document:
    def __init__(self, content, metadata):
        self.content = content
        self.metadata = metadata

    def get_content(self, metadata_mode=None):
        content = self.content
        return str(content)
    
    def get_metadata(self):
        metadata = self.metadata
        return str(metadata)

In [31]:
reader = SimpleDirectoryReader(input_dir="data") # For text files

data_folder_path = Path("data")

all_documents = []

for csv_file in data_folder_path.glob("*.csv"):
    df = pd.read_csv(csv_file, low_memory=False)
    file_name = re.search(r'[^\\]+(?=\.\w+$)', str(csv_file)).group()


    documents = df.apply(lambda row: Document(
        content=row.to_dict(),
        metadata={
            "table_name": file_name,
            "table_shape": df.shape
        }
    ), axis=1).tolist()

    # Add documents to the list
    all_documents.extend(documents)

In [32]:
len(all_documents)

1728833

In [33]:
type(all_documents[0].get_content())

str

In [39]:
# pipeline = IngestionPipeline(
#     transformations=[
#         MistralAIEmbedding()
#     ]
# )

# nodes = await pipeline.arun(documents=all_documents) # Single threaded approximate 48 hours to run

In [40]:
# nest_asyncio.apply()

# # Define batch size
# batch_size = 250  # You may need to tune this based on your memory constraints and API rate limits

# # Create batches
# batches = [all_documents[i: i + batch_size] for i in range(0, len(all_documents), batch_size)]

# async def process_documents():
#     # Create pipeline
#     pipeline = IngestionPipeline(
#         transformations=[
#             MistralAIEmbedding()
#         ]
#     )
    
#     # Process batches concurrently with controlled concurrency
#     all_nodes = []
#     semaphore = asyncio.Semaphore(10)  # Limit concurrent requests to avoid API rate limits
    
#     async def process_batch_with_semaphore(batch_idx, batch):
#         async with semaphore:
#             try:
#                 result = await pipeline.arun(documents=batch)
#                 return result
#             except Exception as e:
#                 print(f"Error processing batch {batch_idx}: {e}")
#                 return []
    
#     # Create tasks for all batches
#     tasks = [process_batch_with_semaphore(i, batch) for i, batch in enumerate(batches)]
    
#     # Process results as they complete
#     for result in await tqdm_asyncio.gather(*tasks, desc="Processing batches"):
#         all_nodes.extend(result)
    
#     return all_nodes

# # Run the processing
# nodes = await process_documents()
# print(f"Processed {len(nodes)} nodes")

In [47]:
model = MistralAI(model=embed)

ValidationError: 1 validation error for MistralAI
model
  Input should be a valid string [type=string_type, input_value=MistralAI(callback_manage...e, additional_kwargs={}), input_type=MistralAI]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type

In [48]:
nest_asyncio.apply()

batch_size = 250

batches = [all_documents[i: i + batch_size] for i in range(0, len(all_documents), batch_size)]

save_dir = Path("intermediate_embeddings")
save_dir.mkdir(exist_ok=True)

async def process_batch(batch):
    
    # Create pipeline for this batch
    pipeline = IngestionPipeline(
        transformations=[
            MistralAI(model=embed)
        ]
    )
    return await pipeline.arun(documents=batch)

async def main():
    nodes = []
    
    # Create semaphore to limit concurrent API calls
    semaphore = asyncio.Semaphore(3)  # Adjust based on API rate limits
    
    async def bounded_process_batch(batch_idx, batch):
        async with semaphore:
            try:
                result = await process_batch(batch)
                print(f"Batch {batch_idx}/{len(batches)} completed with {len(result)} nodes")
                return result
            except Exception as e:
                print(f"Error processing batch {batch_idx}: {e}")
                return []
    
    # Create tasks
    tasks = [bounded_process_batch(i, batch) for i, batch in enumerate(batches)]
    
    # Use as_completed to process results as they finish
    for i, future in enumerate(tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Processing batches")):
        result = await future
        nodes.extend(result)
        
        # Save intermediate results every 10 batches (adjust as needed)
        if i > 0 and i % 10 == 0:
            print(f"Saving intermediate results ({i}/{len(tasks)} batches processed)...")
            with open(save_dir / f"nodes_checkpoint_{i}.pkl", "wb") as f:
                pickle.dump(nodes, f)
    
    # Save final results
    with open(save_dir / "nodes_final.pkl", "wb") as f:
        pickle.dump(nodes, f)
    
    print(f"Processed {len(nodes)} nodes")
    return nodes


In [49]:
nodes = await main()

Processing batches:   0%|          | 0/6916 [00:00<?, ?it/s]

Error processing batch 1079: 1 validation error for MistralAI
model
  Input should be a valid string [type=string_type, input_value=MistralAI(callback_manage...e, additional_kwargs={}), input_type=MistralAI]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
Error processing batch 1078: 1 validation error for MistralAI
model
  Input should be a valid string [type=string_type, input_value=MistralAI(callback_manage...e, additional_kwargs={}), input_type=MistralAI]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
Error processing batch 1080: 1 validation error for MistralAI
model
  Input should be a valid string [type=string_type, input_value=MistralAI(callback_manage...e, additional_kwargs={}), input_type=MistralAI]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
Error processing batch 1189: 1 validation error for MistralAI
model
  Input should be a valid string [type=string_type, input_value=M

Processing batches:  26%|██▌       | 1791/6916 [00:00<00:00, 5336.31it/s]

Saving intermediate results (10/6916 batches processed)...
Saving intermediate results (20/6916 batches processed)...
Saving intermediate results (30/6916 batches processed)...
Saving intermediate results (40/6916 batches processed)...
Saving intermediate results (50/6916 batches processed)...
Saving intermediate results (60/6916 batches processed)...
Saving intermediate results (70/6916 batches processed)...
Saving intermediate results (80/6916 batches processed)...
Saving intermediate results (90/6916 batches processed)...
Saving intermediate results (100/6916 batches processed)...
Saving intermediate results (110/6916 batches processed)...
Saving intermediate results (120/6916 batches processed)...
Saving intermediate results (130/6916 batches processed)...
Saving intermediate results (140/6916 batches processed)...
Saving intermediate results (150/6916 batches processed)...
Saving intermediate results (160/6916 batches processed)...
Saving intermediate results (170/6916 batches pro

Processing batches: 100%|██████████| 6916/6916 [00:00<00:00, 9757.35it/s] 


Saving intermediate results (3930/6916 batches processed)...
Saving intermediate results (3940/6916 batches processed)...
Saving intermediate results (3950/6916 batches processed)...
Saving intermediate results (3960/6916 batches processed)...
Saving intermediate results (3970/6916 batches processed)...
Saving intermediate results (3980/6916 batches processed)...
Saving intermediate results (3990/6916 batches processed)...
Saving intermediate results (4000/6916 batches processed)...
Saving intermediate results (4010/6916 batches processed)...
Saving intermediate results (4020/6916 batches processed)...
Saving intermediate results (4030/6916 batches processed)...
Saving intermediate results (4040/6916 batches processed)...
Saving intermediate results (4050/6916 batches processed)...
Saving intermediate results (4060/6916 batches processed)...
Saving intermediate results (4070/6916 batches processed)...
Saving intermediate results (4080/6916 batches processed)...
Saving intermediate resu

In [66]:
## Test with mistral documenations ressources

import nest_asyncio
import asyncio
from tqdm import tqdm
import pickle
from pathlib import Path
import os
from mistralai import Mistral
import concurrent.futures

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Load environment variables
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
mistral_client = Mistral(api_key=MISTRAL_API_KEY)
model = "mistral-embed"

# Define batch size
batch_size = 250  # Adjust based on API constraints
save_dir = Path("intermediate_embeddings")
save_dir.mkdir(exist_ok=True)

# Function to extract content from documents for embedding
def get_document_texts(docs):
    return [doc.get_content() for doc in docs]

# Function to create embeddings using Mistral API
def create_embeddings_batch(batch_docs):
    try:
        batch_texts = get_document_texts(batch_docs)
        response = mistral_client.embeddings.create(
            model=embed,
            inputs=batch_texts,
        )
        
        # Create dictionary with document and its embedding
        results = []
        for i, doc in enumerate(batch_docs):
            embedding = response.data[i].embedding
            results.append({
                "document": doc,
                "embedding": embedding,
                "metadata": doc.get_metadata()
            })
        return results
    except Exception as e:
        print(f"Error creating embeddings: {e}")
        return []

async def main():
    # Create batches
    batches = [all_documents[i: i + batch_size] for i in range(0, len(all_documents), batch_size)]
    all_results = []
    
    # Use ThreadPoolExecutor for parallel processing
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # Submit all batch processing tasks
        future_to_batch = {
            executor.submit(create_embeddings_batch, batch[i]): i 
            for i, batch in enumerate(batches)
        }
        
        # Process results as they complete
        for future in tqdm(concurrent.futures.as_completed(future_to_batch), total=len(batches), desc="Processing batches"):
            batch_idx = future_to_batch[future]
            try:
                results = future.result()
                all_results.extend(results)
                print(f"Batch {batch_idx}/{len(batches)} completed with {len(results)} embeddings")
                
                # Save intermediate results every 10 batches
                if batch_idx > 0 and batch_idx % 10 == 0:
                    print(f"Saving intermediate results ({batch_idx}/{len(batches)} batches processed)...")
                    with open(save_dir / f"embeddings_checkpoint_{batch_idx}.pkl", "wb") as f:
                        pickle.dump(all_results, f)
            except Exception as e:
                print(f"Batch {batch_idx} generated an exception: {e}")
    
    # Save final results
    with open(save_dir / "embeddings_final.pkl", "wb") as f:
        pickle.dump(all_results, f)
    
    print(f"Created embeddings for {len(all_results)} documents")
    return all_results

# To run the main function:
embedded_docs = await main()

Error creating embeddings: 'Document' object is not iterable
Error creating embeddings: 'Document' object is not iterable
Error creating embeddings: 'Document' object is not iterable
Error creating embeddings: 'Document' object is not iterable
Error creating embeddings: 'Document' object is not iterable
Error creating embeddings: 'Document' object is not iterable
Error creating embeddings: 'Document' object is not iterable
Error creating embeddings: 'Document' object is not iterable
Error creating embeddings: 'Document' object is not iterable
Error creating embeddings: 'Document' object is not iterable
Error creating embeddings: 'Document' object is not iterable
Error creating embeddings: 'Document' object is not iterable
Error creating embeddings: 'Document' object is not iterable
Error creating embeddings: 'Document' object is not iterable
Error creating embeddings: 'Document' object is not iterable
Error creating embeddings: 'Document' object is not iterable
Error creating embedding

IndexError: list index out of range

In [57]:
# Define the directory where the pickle files are saved
save_dir = Path("intermediate_embeddings")

# Read a specific checkpoint file
def read_checkpoint(checkpoint_number):
    checkpoint_path = save_dir / f"nodes_checkpoint_{checkpoint_number}.pkl"
    try:
        with open(checkpoint_path, "rb") as f:
            nodes = pickle.load(f)
        print(f"Loaded {len(nodes)} nodes from checkpoint {checkpoint_number}")
        return nodes
    except FileNotFoundError:
        print(f"Checkpoint file not found: {checkpoint_path}")
        return None
    except Exception as e:
        print(f"Error loading checkpoint {checkpoint_number}: {e}")
        return None

# Read the final results file
def read_final_results():
    final_path = save_dir / "nodes_final.pkl"
    try:
        with open(final_path, "rb") as f:
            nodes = pickle.load(f)
        print(f"Loaded {len(nodes)} nodes from final results")
        return nodes
    except FileNotFoundError:
        print(f"Final results file not found: {final_path}")
        return None
    except Exception as e:
        print(f"Error loading final results: {e}")
        return None

# Example usage:
# To read a specific checkpoint (e.g., checkpoint 20):
nodes_checkpoint_20 = read_checkpoint(20)

# To read the final results:
nodes_final = read_final_results()

Loaded 0 nodes from checkpoint 20
Loaded 0 nodes from final results


In [69]:

for batch, i in batches:
    print(batch[i].get_content())

ValueError: too many values to unpack (expected 2)