In [None]:
%pip install langchain==0.0.245 weaviate-client --quiet --force-reinstall

In [None]:
import ast
import json
import os
import pandas as pd
import weaviate

In [None]:
from langchain.document_loaders.text import TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

<mark>Set the endpoint of the load balancer in front of the Weaviate instance (host name only; <code>http://</code> is added when the client is created)</mark>

In [None]:
# Host name of the load balancer in front of the Weaviate instance.
# Placeholder: fill it in before running the cells below. Give the bare host
# (no scheme), because the client cell builds the URL as f"http://{elb_endpoint}".
elb_endpoint = ''

In [None]:
# Instantiate the client.
# Fail fast when the endpoint placeholder was never filled in: otherwise the
# URL below would be just "http://" and the mistake would only show up later
# as a much less obvious connection failure.
if not elb_endpoint.strip():
    raise ValueError(
        "elb_endpoint is empty - set it to the Weaviate load balancer host name first."
    )

wv_client = weaviate.Client(url=f"http://{elb_endpoint}")

In [None]:
# Fetch the schema from the instance; as the last expression of the cell the
# result is shown as the cell output.
# NOTE(review): the import cells below write to the classes "Manual",
# "ManualContent" and "Query" - presumably they should already be listed here; confirm.
wv_client.schema.get()

In [None]:
# ===== Import data =====
# Configure the batch import with a batch size of 100. The setting is applied
# to wv_client.batch, which all import cells below use.
# NOTE(review): presumably the client then sends a batch automatically once
# 100 objects are queued - confirm against the weaviate-client documentation.
wv_client.batch.configure(batch_size=100)

<h1>Upload Manual Metadata</h1>

In [None]:
# Progress reporting: running total and how often it is printed
counter = 0
interval = 1000  # print progress every this many records

# Columns of each JSONL record that are copied verbatim into a "Manual" object
manual_fields = (
    "model_names",
    "key_features",
    "company_address",
    "document_summary",
    "stylus",
    "file",
)

# Read the JSONL file lazily, chunk by chunk, so that not all records
# have to be held in RAM at once.
json_iterator = pd.read_json(
    'manual_metadata.jsonl',
    orient='records',
    lines=True,
    chunksize=2,  # number of rows per chunk
    # nrows=350  # optionally limit the number of rows to import
)

for chunk in json_iterator:
    for row_label, row in chunk.iterrows():
        properties = {field: row[field] for field in manual_fields}

        # Queue the object for the "Manual" class (no explicit vector is passed)
        wv_client.batch.add_data_object(properties, "Manual")

        # Count the record and report every `interval` records
        counter += 1
        if counter % interval == 0:
            print(f"Imported {counter} manuals ...")

# Send whatever is still queued in the batch
wv_client.batch.flush()
print(f"Finished importing {counter} manuals.")

<h1>Upload Manual Content</h1>

In [None]:
# Splitter handed to the PDF loader in the manual-content import cell:
# chunk size 2500 with an overlap of 500 between neighbouring chunks.
# NOTE(review): the separators list starts with " " - presumably separators are
# tried in list order; confirm this ordering is intended.
recursive_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2500,
    chunk_overlap=500,
    separators=[" ", ",", "\n"],
)

In [None]:
def get_text_chunks_langchain(text, chunk_size=1000, chunk_overlap=100):
    """Split a raw string into overlapping chunks.

    Args:
        text: The string to split.
        chunk_size: Chunk size handed to the splitter (default 1000, the
            value that used to be hard-coded).
        chunk_overlap: Overlap between neighbouring chunks (default 100, the
            value that used to be hard-coded).

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text`` - the list of chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=[" ", ",", "\n", "\n\n"],
    )
    return text_splitter.split_text(text)

In [None]:
def get_model_names(file, client=None):
    """Look up the model names stored for a manual file.

    Queries the "Manual" class for objects whose ``file`` property equals
    ``file`` and returns the ``model_names`` property of the first match.

    Args:
        file: File name to match against the ``file`` property.
        client: Weaviate client to query. Defaults to the notebook-level
            ``wv_client`` when omitted.

    Returns:
        The ``model_names`` value of the first matching object, or ``[]``
        when the response has no usable match (no hit, missing keys, or an
        error response without data).
    """
    active_client = wv_client if client is None else client

    where_filter = {
        "path": ["file"],
        "operator": "Equal",
        "valueText": file
    }

    result = (
        active_client.query
        .get("Manual", ["model_names"])
        .with_where(where_filter)
        .do()
    )

    try:
        model_names = result['data']['Get']['Manual'][0]['model_names']
    except (KeyError, IndexError, TypeError):
        # Only the "response has no match" shapes are treated as "no model
        # names": a missing key, an empty hit list, or a None where a
        # container was expected. Anything else (e.g. KeyboardInterrupt) is
        # no longer swallowed.
        model_names = []

    return model_names

In [None]:
# Upload the Manual Content

# Settings for displaying the import progress
counter = 0
interval = 1000  # print progress every this many manuals

manuals_dir = 'manuals/'

# Collect the input files. os.listdir() returns every directory entry, and
# each one is handed to PyPDFLoader below, so anything that is not a PDF
# (hidden files, sub-directories, ...) is filtered out here. Sorting makes
# the import order reproducible between runs.
dir_list = sorted(
    name for name in os.listdir(manuals_dir)
    if name.lower().endswith('.pdf')
)

for file in dir_list:
    # load PDF
    loader = PyPDFLoader(os.path.join(manuals_dir, file))

    # Model names stored for this file in the "Manual" class
    model_names = get_model_names(file)

    # split into chunks
    docs = loader.load_and_split(text_splitter=recursive_text_splitter)

    for doc in docs:
        properties = {
            "file": file,
            "model_names": model_names,
            "content": doc.page_content,
        }

        # Queue one "ManualContent" object per chunk (no explicit vector is passed)
        wv_client.batch.add_data_object(properties, "ManualContent")

    # Progress is counted per manual (file), not per chunk
    counter += 1
    if counter % interval == 0:
        print(f"Imported {counter} manuals...")

# Send whatever is still queued in the batch
wv_client.batch.flush()
print(f"Finished importing {counter} manuals.")

<h1>Upload Queries</h1>

In [None]:
# Load the whole queries file into one DataFrame and preview the first rows
df = pd.read_json(
    'queries.jsonl',
    orient='records',
    lines=True,
)
df.head()

In [None]:
# Show the "question" value of the row labelled 0
first_question = df['question'][0]
print(first_question)

In [None]:
# Show the "query" value of the row labelled 0
first_query = df['query'][0]
print(first_query)

In [None]:
# Upload the Queries

# Progress reporting: running total and how often it is printed
counter = 0
interval = 1000  # print progress every this many records

# Columns of each JSONL record that are copied verbatim into a "Query" object
query_fields = ("question", "query")

# Read the JSONL file lazily, chunk by chunk, so that not all records
# have to be held in RAM at once.
json_iterator = pd.read_json(
    'queries.jsonl',
    orient='records',
    lines=True,
    chunksize=2,  # number of rows per chunk
    # nrows=350  # optionally limit the number of rows to import
)

for chunk in json_iterator:
    for row_label, row in chunk.iterrows():
        # Item access (row["query"]) keeps the column lookup explicit
        properties = {field: row[field] for field in query_fields}

        # Queue the object for the "Query" class (no explicit vector is passed)
        wv_client.batch.add_data_object(properties, "Query")

        # Count the record and report every `interval` records
        counter += 1
        if counter % interval == 0:
            print(f"Imported {counter} queries ...")

# Send whatever is still queued in the batch
wv_client.batch.flush()
print(f"Finished importing {counter} queries.")