# Index Data


**Imports**


In [1]:
import pandas as pd
from pandarallel import pandarallel
from sentence_transformers import SentenceTransformer
import weaviate
from weaviate.classes.config import Property, DataType
import weaviate.classes as wvc
import weaviate.classes.config as wc
import atexit
from utils import load_config
from utils import chunk_text
import tiktoken
import warnings

warnings.filterwarnings("ignore")
pandarallel.initialize(progress_bar=True)
enc = tiktoken.encoding_for_model("gpt-4o")

  from click.parser import split_arg_string
  from click.parser import split_arg_string


INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# Load Data


In [2]:
df = pd.read_parquet("_data/01_KRP_1994.parq")
df["token_count"] = df["text"].apply(lambda x: len(enc.encode(x)))
# For simplicity, we will filter out very long documents.
df = df[df["token_count"] <= 5_000]
cols = ["identifier", "date", "title", "ref", "text", "token_count"]
df = df[cols]
df.rename(columns={"ref": "link"}, inplace=True)
df.reset_index(drop=True, inplace=True)
df.info(memory_usage="deep")
df.to_parquet("_data/02_KRP_selec.parq", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   identifier   333 non-null    object        
 1   date         333 non-null    datetime64[ns]
 2   title        333 non-null    object        
 3   link         333 non-null    object        
 4   text         333 non-null    object        
 5   token_count  333 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 5.4 MB


# Chunk Documents


In [3]:
df = pd.read_parquet("_data/02_KRP_selec.parq")

In [4]:
# We shuffle the dataframe to make sure that parallel processing is more efficient.
results = df.sample(frac=1).parallel_apply(
    chunk_text, max_token_count=500, overlap_tokens=100, axis=1
)
df_chunks = pd.DataFrame(
    [y for x in results.tolist() for y in x], columns=["identifier", "chunk_text"]
)

df_chunks = pd.merge(
    df.drop(columns=["text"]), df_chunks, left_on="identifier", right_on="identifier"
)

df_chunks.info(memory_usage="deep")
df_chunks.to_parquet("_data/03_KRP_chunks.parq")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=34), Label(value='0 / 34'))), HBox…

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1671 entries, 0 to 1670
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   identifier   1671 non-null   object        
 1   date         1671 non-null   datetime64[ns]
 2   title        1671 non-null   object        
 3   link         1671 non-null   object        
 4   token_count  1671 non-null   int64         
 5   chunk_text   1671 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 4.7 MB


# Embed Documents


In [5]:
df = pd.read_parquet("_data/03_KRP_chunks.parq")

In [6]:
model_path = "intfloat/multilingual-e5-small"
model = SentenceTransformer(
    model_path,
    trust_remote_code=True,
    device="mps",  # Use "cuda" for CUDA GPUs, "mps" for Mac, "cpu" for CPU
)
print("Max Sequence Length:", model.max_seq_length)

Max Sequence Length: 512


In [7]:
embeddings = model.encode(
    df.chunk_text.values,
    batch_size=16,
    convert_to_tensor=False,
    normalize_embeddings=True,
    show_progress_bar=True,
)

Batches:   0%|          | 0/105 [00:00<?, ?it/s]

In [8]:
df["embeddings"] = list(embeddings)
df.to_parquet("_data/04_KRP_embed.parq")

# Index Data


In [9]:
config = load_config()

def initialize_weaviate(collection_name: str = None):
    """Initialize Weaviate client and collection"""
    if collection_name is None:
        collection_name = config["weaviate"]["collection_name"]

    try:
        client = weaviate.connect_to_local(
            port=config["weaviate"]["port"], grpc_port=config["weaviate"]["grpc_port"]
        )
    except:
        raise
    collection = client.collections.get(collection_name)

    # Register cleanup function
    def cleanup_weaviate():
        try:
            client.close()
        except Exception as e:
            raise

    atexit.register(cleanup_weaviate)

    return client, collection


client, collection = initialize_weaviate()

In [10]:
df = pd.read_parquet("_data/04_KRP_embed.parq")
df.date = pd.to_datetime(df.date, format="%Y-%m-%d")

In [11]:
# Get the meta endpoint description of weaviate.
display(client.get_meta())

# Ping Weaviate’s live and ready state.
print(client.is_live())
print(client.is_ready())

{'grpcMaxMessageSize': 104858000,
 'hostname': 'http://[::]:8080',
 'modules': {},
 'version': '1.31.5'}

True
True


In [16]:
# If the collection already exists, delete it.
try:
    client.collections.delete(config["weaviate"]["collection_name"])
except Exception as e:
    print(e)
    print(f"Collection '{config['weaviate']['collection_name']}' does not exist.")

# Create the collection with the specified properties.
client.collections.create(
    config["weaviate"]["collection_name"],
    vectorizer_config=wc.Configure.Vectorizer.none(),
    inverted_index_config=wvc.config.Configure.inverted_index(
        bm25_b=0.75,
        bm25_k1=1.2,
        # stopwords_additions=None,
        # stopwords_preset=None,
        # stopwords_removals=None,
    ),
    properties=[
        Property(name="identifier", data_type=DataType.TEXT),
        Property(name="title", data_type=DataType.TEXT),
        Property(name="text", data_type=DataType.TEXT),
    ],
)

<weaviate.collections.collection.sync.Collection at 0x175c1b5d0>

In [17]:
# List all collections.
for v in client.collections.list_all().values():
    print(v.name)

KRP_STAZH


In [18]:
# https://weaviate.io/developers/weaviate/client-libraries/python#batch-sizing
with collection.batch.fixed_size(batch_size=200, concurrent_requests=8) as batch:
    for idx, data in enumerate(df.to_dict(orient="records")):
        properties = {
            "identifier": data["identifier"],
            "title": data["title"],
            "text": data["chunk_text"],
        }
        batch.add_object(properties=properties, vector=data["embeddings"].tolist())

In [19]:
# Get total count of all items in the collection.
response = collection.aggregate.over_all(total_count=True)
print(response.total_count)

1671


## Test Lexical Search


In [20]:
query = "Was ist zu Steuerreformen entschieden worden?"

response = collection.query.bm25(
    query=query,
    # query_properties=["title"], # Define which fields to search over.
    offset=0,
    limit=10,
    auto_limit=2,
    return_metadata=wvc.query.MetadataQuery(score=True, distance=True, certainty=True),
    # filters=wvc.query.Filter.by_property("year").equal(2012),
    #  filters=wvc.query.Filter.by_property("year").less_than(2012),
    #  auto_limit=True,
)

seen = []
final_results = []

for item in response.objects:
    if item.properties["identifier"] in seen:
        continue
    final_results.append(item.properties["title"])
    seen.append(item.properties["identifier"])
for elem in final_results:
    print(elem)

Einzelinitiative Odile Wild, Zürich, vom 9. Juni 1994 betreffend Gleichstellung von Mann und Frau im Steuergesetz KR-Nr. 193/1994
Mitteilungen
Mitteilungen
Einzelinitiative Dr. Bernhard Wehrli, Feldbach, betreffend Änderung des PBG Bericht und Antrag des Regierungsrates vom 19. August 1992 und gleichlautender Antrag der Kommission vom 2. Dezember 1993 3249


## Test Hybrid Search


In [21]:
model_path = "intfloat/multilingual-e5-small"
model = SentenceTransformer(
    model_path,
    trust_remote_code=True,
    device="mps",  # Use "cuda" for GPU, "mps" for Mac, "cpu" for CPU
)

def embed_query(query):
    return model.encode(query, convert_to_tensor=False, normalize_embeddings=True)

In [22]:
query_embedding = embed_query(query)

response = collection.query.hybrid(
    query=query,
    vector=list(query_embedding),
    limit=10,
    auto_limit=2,
    alpha=0.7,
    fusion_type=wvc.query.HybridFusion.RELATIVE_SCORE,
)

seen = []
final_results = []

for item in response.objects:
    if item.properties["identifier"] in seen:
        continue
    final_results.append(item.properties["title"])
    seen.append(item.properties["identifier"])

for elem in final_results:
    print(elem)

Einzelinitiative Odile Wild, Zürich, vom 9. Juni 1994 betreffend Gleichstellung von Mann und Frau im Steuergesetz KR-Nr. 193/1994
Verordnung über die Quellensteuer für ausländische Arbeitnehmer Quellensteuerverordnung Antrag des Regierungsrates vom 2. Februar 1994 und gleichlautender Antrag der Kommission vom 5. Mai 1994 3374 Verordnung über die Quellensteuer für natürliche und juristische Personen ohne steuerrechtlichen Wohnsitz oder Aufenthalt in der Schweiz Quellensteuerverordnung Antrag des Regierungsrates vom 2. Februar 1994 und gleichlautender Antrag der Kommission vom 5. Mai 1994 3375
Einzelinitiative Beat Müller, Zürich, vom 5. Juli 1993 betreffend Änderung des Steuergesetzes KR-Nr. 227/1993
