In [53]:
# Import necessary libraries
import os
from glob import glob
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAI
from langchain.chains import RetrievalQA
from langchain.schema import Document  # Import the Document class

# Set your OpenAI API key
os.environ["OPENAI_API_KEY"] = "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"


In [54]:
# Step 1: Load PDFs
def load_pdfs_from_directory(directory_path):
    raw_text = ""
    pdf_files = glob(os.path.join(directory_path, "*.pdf"))
    print(f"Found {len(pdf_files)} PDF files: {pdf_files}")
    
    for pdf_file in pdf_files:
        try:
            reader = PdfReader(pdf_file)
            for page in reader.pages:
                text = page.extract_text()
                if text:
                    raw_text += text
        except Exception as e:
            print(f"Error reading {pdf_file}: {e}")
    
    return raw_text, pdf_files

# Directory containing PDFs
pdf_directory = "/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs"

# Step 1: Load and process PDFs
raw_text, pdf_files = load_pdfs_from_directory(pdf_directory)  # Unpack raw_text and file paths
print("Raw text loaded from all PDFs.")


Found 57 PDF files: ['/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(read) A comprehensive survey of digital twins - Applications, technologies and security challenges.pdf', '/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(gpt) Industry application of digital twin - from concept to implementation.pdf', '/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(read) Autonomous, context-aware, adaptive Digital Twins—State of the art and roadmap.pdf', '/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(read) Digital twins - An analysis framework and open issues.pdf', '/Users/adnanedrissielbouzidi/Library/Cl

In [55]:
def split_text(raw_text, chunk_size=1000, chunk_overlap=200):
    """
    Split raw text into smaller chunks for efficient processing.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(raw_text)

# Step 2: Split the text into chunks
text_chunks = split_text(raw_text, chunk_size=1500, chunk_overlap=200)
print(f"Text split into {len(text_chunks)} chunks.")

# Display the first few chunks for review
for i, chunk in enumerate(text_chunks[:5]):  # Adjust the number to display more chunks if needed
    print(f"\nChunk {i+1}:\n{'-'*20}\n{chunk}\n")

Created a chunk of size 1507, which is longer than the specified 1500
Created a chunk of size 1698, which is longer than the specified 1500


Text split into 4776 chunks.

Chunk 1:
--------------------
Journal of Systems Architecture 151 (2024) 103120
Available online 27 March 2024
1383-7621/© 2024 Elsevier B.V. All rights reserved.A comprehensive survey of digital twins: Applications, technologies and 
security challenges 
Sekione Reward Jeremiaha, Abir El Azzaouib, Neal N. Xiongc, Jong Hyuk Parkb,* 
aDepartment of Electrical and Information Engineering, Seoul National University of Science and Technology, 232 Gongneung-ro, Nowon-gu, Seoul, 01811, South Korea 
bDepartment of Computer Science and Engineering, Seoul National University of Science and Technology, 232 Gongneung-ro, Nowon-gu, Seoul, 01811, South Korea 
cSchool of Resources Engineering, Xi’an University of Architecture and Technology, Xi’an, Shaanxi, China   
ARTICLE INFO  
Keywords: 
Digital twin 
Virtual twin 
Digital twin security 
Digital twin network 
Digital twin modeling 
DT enabling technologies ABSTRACT  
Alongside advancements in Artificial Intelligence

In [56]:
# Step 3: Create FAISS vector store
def create_faiss_vectorstore(text_chunks, pdf_files):
    """
    Create a FAISS vector store with metadata linking chunks to their source files.
    """
    embeddings = OpenAIEmbeddings()
    
    # Evenly distribute chunks across files
    total_chunks = len(text_chunks)
    num_files = len(pdf_files)
    chunks_per_file = total_chunks // num_files
    leftover_chunks = total_chunks % num_files

    chunk_to_file_map = []
    start = 0
    for i, pdf_file in enumerate(pdf_files):
        end = start + chunks_per_file + (1 if i < leftover_chunks else 0)
        chunk_to_file_map.extend([pdf_file] * (end - start))
        start = end

    documents = [
        Document(page_content=chunk, metadata={"source": chunk_to_file_map[i]})
        for i, chunk in enumerate(text_chunks)
    ]

    vectorstore = FAISS.from_documents(documents, embeddings)
    return vectorstore


vectorstore = create_faiss_vectorstore(text_chunks, pdf_files)
print("FAISS vector store created successfully.")

FAISS vector store created successfully.


In [57]:
# Step 4: Query the vector store
def answer_query(vectorstore, query, num_results=10):
    """
    Perform a similarity search on the vector store and answer the query.
    """
    retriever = vectorstore.as_retriever()
    retriever.search_kwargs['k'] = num_results  # Retrieve more results for broader context
    qa_chain = RetrievalQA.from_chain_type(
        llm=OpenAI(),
        retriever=retriever,
        return_source_documents=True
    )
    result = qa_chain.invoke({"query": query})
    
    # Log the PDFs contributing to the result
    sources = result["source_documents"]
    pdf_contributions = {doc.metadata['source'] for doc in sources}
    print(f"Contributing PDFs: {pdf_contributions}")
    
    return result["result"], sources

In its current form, your code successfully splits text into chunks, creates embeddings, and stores them in a FAISS vector store. However, the FAISS vector store constructed is based on the global text extracted from all the PDFs concatenated together, so it should theoretically have a "global" knowledge base of all the PDFs.

In [58]:
# Example queries
queries = [
    "Who wrote the paper : The Role of AI in Warehouse Digital Twins: Literature Review",
    "What are the requirements/conditions to say that a system is a digital twin?",
]

for query in queries:
    print(f"\nQuery: {query}")
    answer, sources = answer_query(vectorstore, query)
    print(f"Answer: {answer}")
    print("Source Documents:")
    for source in sources:
        print(f"- {source.metadata.get('source', 'Unknown')}")


Query: Who wrote the paper : The Role of AI in Warehouse Digital Twins: Literature Review
Contributing PDFs: {'/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(gpt) State of the Art and Future Directions of Digital Twins for Production Logistics - A Systematic Literature Review.pdf', '/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(read) The Role of AI in Warehouse Digital Twins.pdf', '/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(read) Digital Twin Enabling Technologies Challenges and Open Research.pdf'}
Answer:  Adnane Drissi Elbouzidi, Abdessamad Ait El Cadi, Robert Pellerin, Samir Lamouri, Estefania Tobon Valencia, and Marie-Jane Bélanger.
Source Documents:
- /Users/adnanedrissielbouzi

In [59]:
# Answer queries
queries = [
    "what are the components of a digital twin ?"
]

for query in queries:
    print(f"\nQuery: {query}")
    answer, sources = answer_query(vectorstore, query)
    print(f"Answer: {answer}")
    print("Source Documents:")
    for source in sources:
        print(f"- {source.metadata.get('source', 'Unknown')}")


Query: what are the components of a digital twin ?
Contributing PDFs: {'/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(gpt) Digital Twin applications toward Industry 4.0 A Review.pdf', '/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(gpt) Production logistics digital twins - Research profiling, application, challenges and opportunities.pdf', '/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(read) Digital twins - An analysis framework and open issues.pdf', '/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(read) A Comprehensive Review of Digital Twin from the Perspective of Total Process - Data,

In [60]:
# Answer queries
queries = [
    "what are the word the most associated with 'digital twin' in the articles with a score of the link probability?"
]

for query in queries:
    print(f"\nQuery: {query}")
    answer, sources = answer_query(vectorstore, query)
    print(f"Answer: {answer}")
    print("Source Documents:")
    for source in sources:
        print(f"- {source.metadata.get('source', 'Unknown')}")


Query: what are the word the most associated with 'digital twin' in the articles with a score of the link probability?
Contributing PDFs: {'/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(gpt) A Bibliometric Analysis of Digital Twin in the Supply Chain.pdf', '/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(gpt) Decision support in productive processes through DES and ABS in the Digital Twin era a systematic literature review.pdf', '/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(gpt) State of the Art and Future Directions of Digital Twins for Production Logistics - A Systematic Literature Review.pdf', '/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@squa

In [61]:
# Answer queries
queries = [
    "how can i update my data transmition capabilities in my warehouse DT? give examples"
]

for query in queries:
    print(f"\nQuery: {query}")
    answer, sources = answer_query(vectorstore, query)
    print(f"Answer: {answer}")
    print("Source Documents:")
    for source in sources:
        print(f"- {source.metadata.get('source', 'Unknown')}")


Query: how can i update my data transmition capabilities in my warehouse DT? give examples
Contributing PDFs: {'/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(gpt) Enabling technologies and tools for digital twin.pdf', '/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(gpt) State of the Art and Future Directions of Digital Twins for Production Logistics - A Systematic Literature Review.pdf', '/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(read) Digital Twins - A Maturity Model for Their Classification and Evaluation.pdf', '/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs/(read) The Role of AI i

In [62]:
# Answer queries
queries = [
    "hey girl hey"
]

for query in queries:
    print(f"\nQuery: {query}")
    answer, sources = answer_query(vectorstore, query)
    print(f"Answer: {answer}")
    print("Source Documents:")
    for source in sources:
        print(f"- {source.metadata.get('source', 'Unknown')}")


Query: hey girl hey


BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 4097 tokens, however you requested 5367 tokens (5111 in your prompt; 256 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}

In [50]:
import os
import openai
import dotenv
from glob import glob
from PyPDF2 import PdfReader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document

dotenv.load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

PDF_DIR = "/Users/adnanedrissielbouzidi/Library/CloudStorage/GoogleDrive-adnane.drissielbouzidi@square-management.com/Mon Drive/onboarding consultant/DT_PDFs" # Folder with your PDFs
faiss_path = "faiss_store"

all_docs = []

pdf_files = glob(os.path.join(PDF_DIR, "*.pdf"))
for pdf_file in pdf_files:
    reader = PdfReader(pdf_file)
    for page in reader.pages:
        text = page.extract_text()
        if text:
            # Treat each page as a Document. 
            all_docs.append(Document(page_content=text, metadata={"source": pdf_file}))

# Embed the documents just once
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(all_docs, embeddings)

# Save so you never re-embed docs again
vectorstore.save_local(faiss_path)
print("FAISS store built and saved.")


RateLimitError: Error code: 429 - {'error': {'message': 'Request too large for text-embedding-ada-002 in organization org-1n1fcXirpWeAsbAzsiLJbNJZ on tokens per min (TPM): Limit 1000000, Requested 1220938. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [30]:
FAISS_PATH = ".streamlit/faiss_store"
vectorstore.save_local(FAISS_PATH)


comment faire :
prendre chaqu article (df) demander une sythese a chatgpt selon unnabstracy cours par rapport auw reccomendation que j'ai envie de donner aux utilisateurs:
ex : pour 15 artciles ... je vais avoir 15* resumes 
donner ma base de connaissaices de 15 tests pas tres longs 
et puis en fonction des reponse de l'utilisateurs * les reponse de mon repondeurs.
donner 

chaque fois dans le nouveau prompt on ajoute l'hisotrique de la reponse précedente. 
extraire et synthetiser les bonnes informations avec les reccomnedation les plus interessantes.


construire une base de connaisanse JSON à base des articles et puis le transformer sous format de markdown.

Faut evider de donner les informations en vrac a mon gpt pour avoir des réponses plus pertinentes.
il y a un travail préparatoire à faire.

calculer le nombre de token (limiter le nombre).

deepseek pour la boite chinoise (moins cher pour les tokens)

tester claude (limide de token quotidienne)

In [16]:
import openai
client = openai.OpenAI(api_key="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")
models = client.models.list()
print([model.id for model in models.data])

['gpt-4o-audio-preview-2024-10-01', 'gpt-4o-mini-audio-preview', 'gpt-4o-2024-08-06', 'gpt-4o-mini-audio-preview-2024-12-17', 'gpt-4o-mini-realtime-preview', 'dall-e-2', 'gpt-3.5-turbo', 'o1-preview-2024-09-12', 'gpt-3.5-turbo-0125', 'o1-preview', 'gpt-3.5-turbo-instruct', 'babbage-002', 'o1-mini-2024-09-12', 'whisper-1', 'dall-e-3', 'gpt-4o-realtime-preview-2024-10-01', 'gpt-4-1106-preview', 'omni-moderation-latest', 'omni-moderation-2024-09-26', 'tts-1-hd-1106', 'gpt-4', 'gpt-4-0613', 'tts-1-hd', 'davinci-002', 'gpt-4o-2024-05-13', 'text-embedding-ada-002', 'gpt-4-turbo', 'tts-1', 'tts-1-1106', 'gpt-3.5-turbo-instruct-0914', 'gpt-4-turbo-preview', 'gpt-4o-mini-realtime-preview-2024-12-17', 'gpt-4o-audio-preview', 'text-embedding-3-small', 'gpt-4o-mini-2024-07-18', 'gpt-4o-mini', 'gpt-4-turbo-2024-04-09', 'gpt-3.5-turbo-1106', 'gpt-3.5-turbo-16k', 'gpt-4o-audio-preview-2024-12-17', 'gpt-4o-2024-11-20', 'gpt-4o-realtime-preview-2024-12-17', 'chatgpt-4o-latest', 'gpt-4o-realtime-preview