In [8]:
import os
from langchain.document_loaders import PyPDFLoader, CSVLoader


def load_pdfs_and_csvs(pdf_folder, csv_folder):
    documents = []

    # Load all PDF files
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())

# Load all CSV files
    for csv_file in os.listdir(csv_folder):
         if csv_file.endswith(".csv"):
             csv_path = os.path.join(csv_folder, csv_file)
             try:
                 loader = CSVLoader(file_path=csv_path, encoding="utf-8")
                 documents.extend(loader.load())
             except UnicodeDecodeError:
                 print(f"Error decoding file: {csv_path}. Trying with a different encoding.")
                 loader = CSVLoader(file_path=csv_path, encoding="ISO-8859-1")
                 documents.extend(loader.load())

    return documents

# Load documents from folders
pdf_folder = "pdfs_temp"
csv_folder = "csv" 
documents = load_pdfs_and_csvs(pdf_folder, csv_folder)
# documents

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 32 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 68 0 (offset 0)
Ignoring wrong pointing object 70 0 (offset 0)
Ignoring wrong pointing object 72 0 (offset 0)
Ignoring wrong pointing object 74 0 (offset 0)
Ignoring wrong pointing object 99 0 (offset 0)
Ignoring wrong pointing object 125 0 (offset 0)
Ignoring wrong pointing object 218 0 (offset 0)
Ignoring wrong pointing object 232 0 (offset 0)


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=100,
)
 
# Split the documents into chunks
split_docs = text_splitter.split_documents(documents)
 
# Display the chunks
for idx, chunk in enumerate(split_docs):
    print(f"Chunk {idx + 1}:\n{chunk.page_content}\n{'-' * 50}")

Chunk 1:
Kontakt 
Technische Hochschule Bingen
Berlinstraße 109 
55411 Bingen am Rhein 
T. +49 6721 409-0
Zentrale Studienberatung
zsb@th-bingen.de 
T. +49 6721 409-386
Technische Hochschule Bingen
Studieren mit Aussicht
th-bingen.de
Technische Hochschule 
Bingen
Studieren mit Aussicht
Anreise mit Bus und Bahn
Aktuelle Fahrpläne finden Sie auf der Webseite des Rhein-Nahe 
Nahverkehrsverbunds unter www.rnn.info. Die Bushaltestelle am 
Campus heißt „TH Neubau“.
Anreise mit dem Pkw
Die Hochschule ist an die A60 und A61 angebunden und hat   
an der B9 eine eigene Abfahrt.
Rhein
Abfahrt 
Bingen-Ost
Autobahndreieck 
Nahetal
Abfahrt 
Bingen-Mitte
TH Bingen
A60
A61
B50
B9
Mainz
Koblenz
28.06.2020
--------------------------------------------------
Chunk 2:
Technische Hochschule 
Bingen
Seit 1897 bildet die Technische Hochschule Bingen in Ingenieur- 
und Naturwissenschaften aus. An der TH Bingen bieten wir 
Ihnen eine kompetente Lehre, einen starken Praxisbezug und 
enge Kooperationen mit Untern

## Initial Testing

In [4]:
from collections import defaultdict

pdf_page_count = defaultdict(int)

for doc in documents:
    source = doc.metadata['source']
    if source.endswith(".pdf"):
        pdf_name = source.split('/')[-1].rsplit('.', 1)[0]
        pdf_page_count[pdf_name] += 1

result = [{key: count} for key, count in pdf_page_count.items()]
# result

In [15]:
print("No. of chunks created including Csvs and Pdfs:", len(split_docs))

No. of chunks created including Csvs and Pdfs: 3502


In [7]:
#Embeddings
from sentence_transformers import SentenceTransformer
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # we can use other Hugging Face models
embedding_model = SentenceTransformer(model_name)

# Embedding the text chunks
embedded_chunks = []
for chunk in split_docs:
    embedding = embedding_model.encode(chunk.page_content, show_progress_bar=False)
    embedded_chunks.append({
        "embedding": embedding,
        "content": chunk.page_content,
        "metadata": chunk.metadata  # Includes info like page number
    })

#  Save or use embeddings
for idx, chunk in enumerate(embedded_chunks):
    print(f"Chunk {idx + 1}:")
    print("Content:", chunk["content"])
    print("Embedding Shape:", len(chunk["embedding"]))  # Embedding vector length
    print()

Chunk 1:
Content: Kontakt 
Technische Hochschule Bingen
Berlinstraße 109 
55411 Bingen am Rhein 
T. +49 6721 409-0
Zentrale Studienberatung
zsb@th-bingen.de 
T. +49 6721 409-386
Technische Hochschule Bingen
Studieren mit Aussicht
th-bingen.de
Technische Hochschule 
Bingen
Studieren mit Aussicht
Anreise mit Bus und Bahn
Aktuelle Fahrpläne finden Sie auf der Webseite des Rhein-Nahe 
Nahverkehrsverbunds unter www.rnn.info. Die Bushaltestelle am 
Campus heißt „TH Neubau“.
Anreise mit dem Pkw
Die Hochschule ist an die A60 und A61 angebunden und hat   
an der B9 eine eigene Abfahrt.
Rhein
Abfahrt 
Bingen-Ost
Autobahndreieck 
Nahetal
Abfahrt 
Bingen-Mitte
TH Bingen
A60
A61
B50
B9
Mainz
Koblenz
28.06.2020
Embedding Shape: 384

Chunk 2:
Content: Technische Hochschule 
Bingen
Seit 1897 bildet die Technische Hochschule Bingen in Ingenieur- 
und Naturwissenschaften aus. An der TH Bingen bieten wir 
Ihnen eine kompetente Lehre, einen starken Praxisbezug und 
enge Kooperationen mit Unternehmen. Die 

In [8]:
# displaying raw embeddings
for idx, chunk in enumerate(embedded_chunks):
    print(f"Chunk {idx + 1} Embedding:")
    print(chunk["embedding"])  # Display raw embedding vector
    print()


Chunk 1 Embedding:
[-3.27273272e-02  1.74827091e-02  4.65556011e-02 -8.80788043e-02
 -1.29291356e-01  4.65208143e-02 -5.39758801e-02  2.75742672e-02
 -3.10975090e-02 -1.69658232e-02  8.57135840e-03 -2.58589555e-02
 -6.73875436e-02 -4.48711775e-02 -7.97423795e-02 -4.18984815e-02
 -2.63432302e-02 -1.36711612e-01  5.26967831e-02 -6.11194372e-02
 -5.00144484e-03 -1.24983080e-02 -1.10180210e-02 -5.42017706e-02
  1.10566076e-02  2.61865631e-02 -2.68597882e-02  7.53113115e-03
  6.65740436e-03 -2.57822499e-02  1.08408192e-02  1.01403736e-01
 -5.12998588e-02  6.84722280e-03  6.58711120e-02 -6.08377568e-02
  1.57603361e-02 -3.96194570e-02 -5.03773540e-02 -1.27663848e-03
 -7.09066261e-03  4.24756184e-02 -4.24257293e-02  5.42353019e-02
 -4.62025590e-02  3.43750566e-02  4.49949782e-03 -1.81428250e-02
 -5.61417006e-02 -6.24527447e-02 -5.44577800e-02 -1.79641135e-02
  2.48118155e-02  3.78334709e-02 -3.09958830e-02 -1.87996924e-02
  5.09968167e-03  6.34530708e-02 -2.29866151e-02  1.84236448e-02
  5.04

In [10]:
import chromadb

# Initialize persistent client with a path
persistent_client = chromadb.PersistentClient(path="./chroma_db")

# Delete if exists and create new collection
try:
    persistent_client.delete_collection("text_chunks_collection")
except:
    pass

# Create fresh collection
collection = persistent_client.create_collection(name="text_chunks_collection")

# Store the embeddings
for idx, chunk in enumerate(embedded_chunks):
    collection.add(
        embeddings=[chunk["embedding"]],
        documents=[chunk["content"]],
        metadatas=[chunk["metadata"]],
        ids=[f"chunk_{idx}"]
    )

print(f"Stored {len(embedded_chunks)} chunks in ChromaDB!")

Stored 250 chunks in ChromaDB!


# Retrieval Example

In [36]:
question = "What is the workload in hours of the course artificial intelligence?"
question_embedding = embedding_model.encode(question)


results = collection.query(
    query_embeddings=[question_embedding],
    n_results=4,  # get top n results (which are input to LLM)
    include=['documents', 'metadatas', 'distances']  
    # based on cosine similarity
)

# Printing results with similarity scores
for idx, (document, metadata, distance) in enumerate(zip(results['documents'][0], results['metadatas'][0], results['distances'][0])):
    similarity_score = 1 - distance  # distance to similarity score
    print(f"\nResult {idx + 1}:")
    print(f"Source: {metadata['source']}")
    if 'page' in metadata:
        print(f"Page: {metadata['page']}")
    print(f"Similarity Score: {similarity_score:.4f}")
    #Similarity score 
    print("Content:")
    print(document)
    print("-" * 50)



Result 1:
Source: pdfs\Msc-Computer Science-Module-Handbook.pdf
Page: 2
Similarity Score: 0.1476
Content:
1 Compulsory Modules 1.1 Artificial Intelligence (M-IN-IN06)  Artificial Intelligence, Künstliche Intelligenz (KINT) ID  Workload 180h ECTS 6 Term at study start ST (start): 2 WT (start): 1 Frequency winter term Duration 1 term  1 Course Lecture plus workshops Contact time lecture  30h Contact time other  30h Self-studies 120h Planned group size  25 students   2 Learning Outcomes The students know advanced methods of artificial intelligence. Especially deep learning and deep reinforcement
--------------------------------------------------

Result 2:
Source: pdfs\Msc-Computer Science-Module-Handbook.pdf
Page: 13
Similarity Score: 0.1390
Content:
2.2 Computer Vision (M-IN-WP-35) / AI Computer Vision, aktives Sehen (COVI) ID  Workload 180h ECTS 6 Term at study start ST (start): 2 WT (start): 1 Frequency winter term Duration 1 term  1 Course Lecture Workshop Tutorial Excursion (option

# Run Local LLM

In [12]:
!pip install langchain langchain-ollama ollama

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [19]:
from langchain_ollama import OllamaLLM
model = OllamaLLM(model="llama3.2")
res_for_llm

['Source: pdfs_temp/Msc-Computer Science-Module-Handbook.pdf; Content: 2.2 Computer Vision (M-IN-WP-35) / AI Computer Vision, aktives Sehen (COVI) ID  Workload 180h ECTS 6 Term at study start ST (start): 2 WT (start): 1 Frequency winter term Duration 1 term  1 Course Lecture Workshop Tutorial Excursion (optional)',
 'Source: pdfs_temp/Msc-Computer Science-Module-Handbook.pdf; Content: 1 Compulsory Modules 1.1 Artificial Intelligence (M-IN-IN06)  Artificial Intelligence, Künstliche Intelligenz (KINT) ID  Workload 180h ECTS 6 Term at study start ST (start): 2 WT (start): 1 Frequency winter term Duration 1 term  1 Course Lecture plus workshops Contact time lecture  30h Contact time other  30h Self-studies 120h Planned group size  25 students   2 Learning Outcomes The students know advanced methods of artificial intelligence. Especially deep learning and deep reinforcement learning algorithms are understood by the students and can be applied to new problems. The students know how to train,

In [18]:
# combine query findings and initial question
combined_query = f"Related information:\n{res_for_llm}\n\nQuestion:\n{question}"

result = model.invoke(input=combined_query)
result

'The workload for the course "Artificial Intelligence" (ID M-IN-IN06) is 180h.'