In [1]:
import json
import os
from bs4 import BeautifulSoup
from llama_index.core.node_parser.text import SentenceSplitter

EMBEDDING_MODEL = 'aari1995/German_Semantic_STS_V2'

def convert_html_to_text(html_string):
    """Return the text content of an HTML fragment with tidied line breaks.

    The markup is parsed with BeautifulSoup using the ``html.parser`` backend.
    The extracted text is stripped of leading/trailing whitespace, carriage
    returns are removed, and each pair of consecutive newlines is replaced by
    a single newline (one left-to-right pass, so longer runs are shortened
    but not necessarily reduced to one).

    Args:
        html_string: HTML markup to convert.

    Returns:
        The extracted plain text as a string.
    """
    parsed = BeautifulSoup(html_string, 'html.parser')
    plain_text = parsed.get_text().strip()
    without_carriage_returns = plain_text.replace('\r', '')
    return without_carriage_returns.replace('\n\n', '\n')

def save_file(file, text):
    """Write ``text`` to the path ``file``, replacing any existing content.

    The file is written as UTF-8 so that non-ASCII characters (e.g. German
    umlauts) are stored the same way regardless of the platform's default
    encoding, matching the UTF-8 encoding used when the source JSON is read.
    The context manager guarantees the handle is closed even if the write
    raises.

    Args:
        file: Path of the file to create or overwrite.
        text: String content to write.
    """
    with open(file, "w", encoding="utf-8") as handle:
        handle.write(text)

def token_length_function(tokenizer, text_input):
    """Count the tokens ``tokenizer`` produces for ``text_input``.

    The text is encoded with ``add_special_tokens=False`` so only tokens
    derived from the text itself are counted.

    Args:
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``
            that returns a sized sequence of tokens.
        text_input: The text to measure.

    Returns:
        The number of tokens in the encoded text.
    """
    token_ids = tokenizer.encode(text_input, add_special_tokens=False)
    return len(token_ids)

file_path = '../../scraper/forum_posts.json'

# Upper bound on the number of forum topics kept for this run.
MAX_TOPICS = 1000

with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# The full scrape is too large to process in one go, so only the first
# MAX_TOPICS forum topics are kept.
data = data[:MAX_TOPICS]

# Replace the HTML markup in every topic title and post body with plain text
# (convert_html_to_text also strips surrounding whitespace).
for item in data:
    item['title'] = convert_html_to_text(item['title'])
    for post in item['posts']:
        post['post'] = convert_html_to_text(post['post'])

# # Write the forum topics as cursive conversations for chunking.
# output_text = ""
# for item in data:
#     if output_text != "":
#         output_text += "\n\n" 
#     output_text += "Thema:" + item["title"] + "\n"
#     for post in item['posts']:
#         poster = post['poster'] if post['poster'] is not None else 'Unknown'
#         output_text += poster + ": " + post['post'] + "\n\n"
# print(output_text)
# Write the cursive text to file
# save_file("forum_posts_100.txt", output_text)

## Chunking

In [2]:
def get_chunks(data, tokenizer, chunk_size=512, chunk_overlap=50):
    """Split every forum topic into token-bounded text chunks.

    Each topic is rendered as one conversation string: a ``Thema:<title>``
    line followed by one ``<poster>: <post>`` paragraph per post (paragraphs
    separated by a blank line). That string is then split with a
    ``SentenceSplitter`` and the resulting chunks of all topics are returned
    as one flat list.

    Args:
        data: Iterable of topic dicts, each with a ``'title'`` string and a
            ``'posts'`` list of dicts carrying ``'poster'`` and ``'post'``.
        tokenizer: Tokenizer whose ``encode`` method is handed to the
            splitter to measure text length in tokens.
        chunk_size: Value passed to the splitter as ``chunk_size``.
            Defaults to 512.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 50.

    Returns:
        A list of chunk strings, in topic order.
    """
    # NOTE(review): `tokenizer.encode` is passed with its default arguments;
    # if the tokenizer adds special tokens by default they are included in
    # the splitter's token counts - confirm this is intended.
    text_splitter = SentenceSplitter(
        separator=" ",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        paragraph_separator="\n\n",
        tokenizer=tokenizer.encode,
    )
    chunks = []
    for item in data:
        # Collect the pieces and join once instead of repeated string +=.
        parts = ["Thema:" + item["title"] + "\n"]
        for post in item['posts']:
            # Posts without a known author are attributed to 'Unknown'.
            poster = post['poster'] if post['poster'] is not None else 'Unknown'
            parts.append(poster + ": " + post['post'] + "\n\n")
        chunks.extend(text_splitter.split_text("".join(parts)))
    return chunks


In [3]:
from transformers import AutoTokenizer
# Load the tokenizer published under the embedding model's name so that chunk
# lengths are measured with that model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
# One flat list of text chunks across all loaded forum topics.
chunks = get_chunks(data, tokenizer)
# Last expression of the cell: displays the number of chunks produced.
len(chunks)

Token indices sequence length is longer than the specified maximum sequence length for this model (1116 > 512). Running this sequence through the model will result in indexing errors


222

In [4]:
# Measure every chunk in tokens (special tokens excluded) and report the extremes.
token_lengths = [token_length_function(tokenizer, chunk_text) for chunk_text in chunks]
longest_chunk, shortest_chunk = max(token_lengths), min(token_lengths)
print(f"Max chunk size: {longest_chunk} and Min chunk size: {shortest_chunk}")

Max chunk size: 501 and Min chunk size: 14


In [5]:
# print(os.environ.get('OPENAI_API_KEY'))
# print(os.environ.get('ANTHROPIC_API_KEY'))

## Creating LlamaIndex Classes

In [6]:
from llama_index.core import Document

# Wrap each text chunk in a LlamaIndex Document (one Document per chunk).
documents = []
for chunk_text in chunks:
    documents.append(Document(text=chunk_text))

print(len(documents))

222


In [None]:
# Splitter configuration: split on spaces within paragraphs separated by blank
# lines, measuring length with the embedding model's tokenizer.
splitter_options = {
    "separator": " ",
    "chunk_size": 512,
    "chunk_overlap": 50,
    "paragraph_separator": "\n\n",
    "tokenizer": tokenizer.encode,
}
text_splitter = SentenceSplitter(**splitter_options)

# Turn the Documents into nodes and report how many were produced.
nodes = text_splitter.get_nodes_from_documents(documents)
node_count = len(nodes)
print(f"We have parsed {node_count} nodes.")

## Indexing

1. Start the Docker service.
2. Start the containers defined in `docker-compose.yaml` by running `docker compose up`.

In [None]:
import weaviate
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core import StorageContext, VectorStoreIndex, Settings
from llama_index.llms.anthropic import Anthropic
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [None]:
# Optional Anthropic LLM/tokenizer setup, currently disabled:
# tokenizer = Anthropic().tokenizer
# llm = Anthropic(model="claude-3-haiku-20240307")
# Create the Hugging Face embedding model named by EMBEDDING_MODEL.
embedding_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
# Settings.tokenizer = tokenizer
# Settings.llm = llm
# Register the embedding model on the LlamaIndex Settings object.
Settings.embed_model = embedding_model

In [None]:
# Name of the Weaviate class that holds the forum chunks.
INDEX_NAME = "BabyForum"

# Connect to the Weaviate instance listening on localhost:8080.
client = weaviate.Client("http://localhost:8080")

# Drop a class of the same name left over from an earlier run so indexing
# starts from an empty class.
index_already_present = client.schema.exists(INDEX_NAME)
if index_already_present:
    client.schema.delete_class(INDEX_NAME)

print(client.get_meta())

In [None]:
# Vector store backed by the Weaviate class named INDEX_NAME.
vector_store = WeaviateVectorStore(
    weaviate_client=client,
    embed_model=embedding_model,
    index_name=INDEX_NAME,
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index over the parsed nodes using the storage context above;
# show_progress=True requests progress output while it runs.
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    embed_model=embedding_model,
    show_progress=True,
)

In [None]:
# index.storage_context.persist(persist_dir='weaviate-index')

In [None]:
# from llama_index.core import get_response_synthesizer

# retriever = VectorIndexRetriever(
#     index=index,
#     similarity_top_k=10,
# )
# response_synthesizer = get_response_synthesizer()

In [None]:
# query_engine = RetrieverQueryEngine(
#     retriever=retriever,
#     response_synthesizer=response_synthesizer,
#     node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.8)],
# )

In [None]:
# answer1 = query_engine.query("Kind wacht stündlich auf")
# save_file('sources.txt', str(answer1.source_nodes))
# for n in answer1.source_nodes:
#     print(f"Node {n.node_id} with similarity score {n.score}:\n{n.text}\n\n\n")

In [None]:
# answer2 = query_engine.query("Wer kümmert sich um das Kind nachts?")
# for n in answer2.source_nodes:
#     print(f"Node {n.node_id} with similarity score {n.score}:\n{n.text}\n\n\n")

In [None]:
# answer3 = query_engine.query("Wann darf man mit kind ins Tropical Island?")
# for n in answer3.source_nodes:
#     print(f"Node {n.node_id} with similarity score {n.score}:\n{n.text}\n\n\n")

In [None]:
# print(os.environ.get('OPENAI_API_KEY'))
# print(os.environ.get('ANTHROPIC_API_KEY'))

In [None]:
# if client.collections.exists("ForumPost"):
#     client.collections.delete("ForumPost")
# client.collections.create(
#     "ForumPost",
#     vectorizer_config=Configure.Vectorizer.text2vec_transformers(),
#     generative_config=Configure.Generative.openai(model='gpt-3.5-turbo'),
#     properties=[
#         Property(name="body", data_type=DataType.TEXT),
#     ]
#     )

In [None]:
# post1 = {'body': "Das ist ein langes Post über weinenende Kinder"}
# post2 = {'body': "Mein Baby isst nichts mehr tagsüber"}
# post3 = {'body': "Kind wacht Nachts weinend auf und schreit"}
# posts = client.collections.get("ForumPost")
# posts.data.insert(post1)
# posts.data.insert_many([post2, post3])

In [None]:
# posts = client.collections.get("ForumPost")
# for item in posts.iterator(include_vector=True):
#     print(item.properties)
#     print(item.vector)


In [None]:
# response = posts.query.near_text(
#         query="Baby weint",
#         return_metadata=wvc.query.MetadataQuery(distance=True),
#         limit=2
#     )

# for o in response.objects:
#     print(o.properties)
#     print(o.metadata)

In [None]:
# response = posts.query.hybrid(
#         query="Baby weint",
#         return_metadata=wvc.query.MetadataQuery(distance=True, score=True, explain_score=True),
#         alpha=0.75,
#         limit=3
#     )

# for o in response.objects:
#     print(o.properties)
#     print(o.metadata)