# 0. Install Dependencies

In [1]:
!pip install langchain
!pip install langchain.community
!pip install pypdf
!pip install sentence-transformers
!pip install chromadb

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting langchain
  Downloading langchain-0.3.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.29 (from langchain)
  Downloading langchain_core-0.3.30-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.3 (from langchain)
  Downloading langchain_text_splitters-0.3.5-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.3,>=0.1.17 (from langchain)
  Downloading langsmith-0.2.11-py3-none-any.whl.metadata (14 kB)
Collecting pydantic<3.0.0,>=2.7.4 (from langchain)
  Downloading pydantic-2.10.5-py3-none-any.whl.metadata (30 kB)
Collecting httpx<1,>=0.23.0 (from langsmith<0.3,>=0.1.17->langchain)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.3,>=0.1.17->langchain)
  Downloading orjson-3.10.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━

# 1. Load Data
- PDFs with general information
- CSV with website structure

In [3]:
import os
from langchain.document_loaders import PyPDFLoader, CSVLoader


def load_pdfs_and_csvs(pdf_folder, csv_folder):
    documents = []

    # Load all PDF files
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())

    # Load all CSV files
    for csv_file in os.listdir(csv_folder):
         if csv_file.endswith(".csv"):
             csv_path = os.path.join(csv_folder, csv_file)
             try:
                 loader = CSVLoader(file_path=csv_path, encoding="utf-8")
                 documents.extend(loader.load())
             except UnicodeDecodeError:
                 print(f"Error decoding file: {csv_path}. Trying with a different encoding.")
                 loader = CSVLoader(file_path=csv_path, encoding="ISO-8859-1")
                 documents.extend(loader.load())

    return documents

# Load documents from folders
pdf_folder = "res/PDFs"
csv_folder = "res/CSVs" 
raw_data = load_pdfs_and_csvs(pdf_folder, csv_folder)

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 32 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 68 0 (offset 0)
Ignoring wrong pointing object 70 0 (offset 0)
Ignoring wrong pointing object 72 0 (offset 0)
Ignoring wrong pointing object 74 0 (offset 0)
Ignoring wrong pointing object 99 0 (offset 0)
Ignoring wrong pointing object 125 0 (offset 0)
Ignoring wrong pointing object 218 0 (offset 0)
Ignoring wrong pointing object 232 0 (offset 0)


# 2. Text Splitting
- split into chunks

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=100,
)
 
split_data = text_splitter.split_documents(raw_data)

# 3. Data Embedding
- convert split data into an embedding, namely a numerical vector

In [7]:
from sentence_transformers import SentenceTransformer
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embedding_model = SentenceTransformer(model_name)

# Embedding the text chunks
embedded_chunks = []
for chunk in split_data:
    embedding = embedding_model.encode(chunk.page_content, show_progress_bar=False)
    embedded_chunks.append({
        "embedding": embedding,
        "content": chunk.page_content,
        "metadata": chunk.metadata  # Includes info like page number
    })

2025-01-15 16:45:36.343580: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-15 16:45:36.385428: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-15 16:45:36.385473: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-15 16:45:36.386745: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-15 16:45:36.394349: I tensorflow/core/platform/cpu_feature_guar

# 4. Store Embeddings in Database
- persistent storage in ChromaDB

In [9]:
import chromadb

# Initialize persistent client with a path
persistent_client = chromadb.PersistentClient(path="./chroma_db")

# Delete if exists and create new collection
try:
    persistent_client.delete_collection("th_bingen_collection")
except:
    pass

# Create fresh collection
collection = persistent_client.create_collection(name="th_bingen_collection")
# Store the embeddings
for idx, chunk in enumerate(embedded_chunks):
    collection.add(
        embeddings=[chunk["embedding"]],
        documents=[chunk["content"]],
        metadatas=[chunk["metadata"]],
        ids=[f"chunk_{idx}"]
    )

print(f"Stored {len(embedded_chunks)} chunks in ChromaDB!")

Stored 13509 chunks in ChromaDB!


# Retrieval Example
- optional to test results

In [12]:
question = "What is the workload in hours of the course artificial intelligence for Masteres Computer Sciences?"
question_embedding = embedding_model.encode(question)  # use same one as before


results = collection.query(
    query_embeddings=[question_embedding],
    n_results=4,  # get top n results (which are input to LLM)
    include=['documents', 'metadatas', 'distances']  
)

# Printing results with similarity scores
for idx, (document, metadata, distance) in enumerate(zip(results['documents'][0], results['metadatas'][0], results['distances'][0])):
    similarity_score = 1 - distance  # distance to similarity score
    print(f"\nResult {idx + 1}:")
    print(f"Source: {metadata['source']}")
    if 'page' in metadata:
        print(f"Page: {metadata['page']}")
    print(f"Similarity Score: {similarity_score:.4f}")
    #Similarity score 
    print("Content:")
    print(document)
    print("-" * 50)


Result 1:
Source: res/PDFs/BE-Industrial Engineering-Module handbook.pdf
Page: 87
Similarity Score: -9.8109
Content:
Selbststudium 
60 h 
Geplante 
Gruppengröße 
ca. 25 Studierende 
2 Lernergebnisse 
Am Ende des Moduls sind die Studierenden in der Lage: 
- Einen umfassenden Überblick der wichtigsten Begriffe und Techniken im Bereich der künstlichen 
Intelligenz zu geben 
- Grundlagen des Data-Science-Prozesses und verschiedene Ansätze im Bereich Machine Learning 
zu benennen 
- Die wichtigsten Modelle und Algorithmen künstlicher Intelligenz zu verstehen und zu beschreiben 
- Vor- und Nachteile unterschiedlicher Algorithmen bzw. Methoden zu bewerten 
- Einsatzpotential künstlicher Intelligenz im Produktionsumfeld zu identifizieren und zu bewerten 
3 Inhalte
--------------------------------------------------

Result 2:
Source: res/PDFs/BE-Mechanical Engineering-Module handbook.pdf
Page: 107
Similarity Score: -9.8324
Content:
1 SWS / 15 h 
Selbststudium 
60 h 
Geplante 
Gruppengröße 
ca.

# Install LLM dependencies

In [2]:
!pip install torch==2.1.2
!pip install transformers
!pip install auto-gptq
!pip install accelerate
!pip install optimum
!pip install optimum[auto-gptq]

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting auto-gptq
  Downloading auto_gptq-0.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting accelerate>=0.26.0 (from auto-gptq)
  Downloading accelerate-1.3.0-py3-none-any.whl.metadata (19 kB)
Collecting datasets (from auto-gptq)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting sentencepiece (from auto-gptq)
  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting rouge (from auto-gptq)
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting gekko (from auto-gptq)
  Downloading gekko-1.2.1-py3-none-any.whl.metadata (3.0 kB)
Collecting peft>=0.5.0 (from auto-gptq)
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)

In [5]:
!pip uninstall xformers mistral_inference bitsandbytes -y

Found existing installation: xformers 0.0.23
Uninstalling xformers-0.0.23:
  Successfully uninstalled xformers-0.0.23
[0mFound existing installation: bitsandbytes 0.45.0
Uninstalling bitsandbytes-0.45.0:
  Successfully uninstalled bitsandbytes-0.45.0


# Download LLM

In [3]:
from huggingface_hub import snapshot_download
from pathlib import Path
from huggingface_hub import login
login(token="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")
mistral_models_path = Path.home().joinpath('Final Structure', 'Mistral', 'CapybaraHermes-GPTQ')
mistral_models_path.mkdir(parents=True, exist_ok=True)

snapshot_download(
    repo_id="TheBloke/CapybaraHermes-2.5-Mistral-7B-GPTQ",
    local_dir=mistral_models_path
)

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

'/home/jovyan/Final Structure/Mistral/CapybaraHermes-GPTQ'