In [None]:
!pip install weaviate-client --quiet

In [None]:
import ast
import json
import os

import pandas as pd
import weaviate
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

In [None]:
# Weaviate endpoint: taken from the WEAVIATE_URL environment variable when it is
# set, otherwise the original cluster address is used as the default.
wv_client = weaviate.Client(
    url=os.environ.get(
        "WEAVIATE_URL",
        "http://k8s-weaviate-weaviate-f1e22650f9-583768105cf15612.elb.us-west-2.amazonaws.com",
    )
)

In [None]:
# S3 bucket that holds the source CSV: the WEAVIATE_BUCKET environment variable
# overrides the default bucket name when it is set.
bucket = os.environ.get("WEAVIATE_BUCKET", "weaviate-20230718214239852900000002")

In [None]:
# ----- Batch import setup -----
# Use a batch size of 100 objects for the client's batch importer.
wv_client.batch.configure(batch_size=100)

# Progress reporting state: `counter` is the running number of records added,
# and a progress line is printed every `interval` records.
counter, interval = 0, 1000

In [None]:
def get_iterator(chunksize=100, nrows=None):
    """Open the embedded-articles CSV in S3 as a lazy, chunked pandas reader.

    The file is read from the bucket named by the module-level ``bucket``
    variable, and only the ``id``, ``url``, ``title``, ``text`` and
    ``content_vector`` columns are loaded.

    Args:
        chunksize: Number of rows per chunk yielded by the reader.
            Defaults to 100.
        nrows: Optional cap on the total number of rows to read. ``None``
            (the default) reads the whole file.

    Returns:
        The chunked reader produced by ``pd.read_csv``; iterating over it
        yields DataFrames of up to ``chunksize`` rows, so the full file is
        never held in memory at once.
    """
    return pd.read_csv(
        f's3://{bucket}/vector_database_wikipedia_articles_embedded.csv',
        usecols=['id', 'url', 'title', 'text', 'content_vector'],
        chunksize=chunksize,  # number of rows per chunk
        nrows=nrows,  # None means no limit
    )

In [None]:
# Peek at a single record to sanity-check the data.

# The reader is lazy and chunked, so all records are never loaded
# into RAM at once.
csv_iterator = get_iterator()

for chunk in csv_iterator:
    # head(1) limits the inner loop to the first row of the first chunk.
    for index, row in chunk.head(1).iterrows():
        title, content, url = row.title, row.text, row.url
        # The vector is stored in the CSV as a string literal; parse it back.
        vector = ast.literal_eval(row.content_vector)
    break  # later chunks are not needed for the preview

In [None]:
# Title of the sampled row (taken from its `title` column)
title

In [None]:
# First 1000 characters of the sampled row's `text` column
content[:1000]

In [None]:
# URL of the sampled row (taken from its `url` column)
url

In [None]:
# First 10 entries of the vector parsed from the row's `content_vector` column
vector[:10]

In [None]:
# Number of entries in the parsed vector, i.e. the embedding length
len(vector)

In [None]:
# Re-create the iterator: the preview cell above already consumed the first
# chunk, so start a fresh read of the CSV for the full import.
csv_iterator = get_iterator()

In [None]:
# Iterate through the dataframe chunks and add each CSV record to the batch,
# storing it in the "ArticleNoTransformer" class with the vector from the CSV.

# Start the progress count from zero so that re-running this cell does not add
# to a total left over from an earlier run.
counter = 0

# Terms looked for in each article's text. Matching is a case-insensitive
# substring test, so e.g. 'art' also matches inside longer words.
access_terms = ['france', 'art', 'car', 'football', 'sports', 'food', 'travel']

for chunk in csv_iterator:
    for index, row in chunk.iterrows():

        content = row.text

        # Lower-case once per record instead of once per term.
        content_lower = content.lower()

        # Every matching term is recorded; records that match none are tagged 'open'.
        c_access = [term for term in access_terms if term in content_lower] or ['open']

        properties = {
            "title": row.title,
            "content": content,
            "url": row.url,
            "c_access": c_access
        }

        # Convert the vector from CSV string back to array of floats
        vector = ast.literal_eval(row.content_vector)

        # Add the object to the batch, and set its vector embedding
        wv_client.batch.add_data_object(properties, "ArticleNoTransformer", vector=vector)

        # Calculate and display progress
        counter += 1
        if counter % interval == 0:
            print(f"Imported {counter} articles...")

# Send whatever is still queued in the batch.
wv_client.batch.flush()
print(f"Finished importing {counter} articles.")

In [None]:
# Re-create the iterator: the previous import loop read the earlier one to the
# end, so a fresh reader is needed for the second import pass.
csv_iterator = get_iterator()

In [None]:
# Second import pass: walk the CSV chunks again and load every record into the
# "Article" class, this time without supplying a vector.
counter = 0
access_terms = ['france', 'art', 'car', 'football', 'sports', 'food', 'travel']

for chunk in csv_iterator:
    for index, row in chunk.iterrows():
        content = row.text
        lowered = content.lower()

        # Collect each access term that occurs in the text; fall back to 'open'
        # when none of them does.
        matched = [term for term in access_terms if term in lowered]
        c_access = matched if matched else ['open']

        # No vector is passed, so Weaviate is left to create the embedding.
        # NOTE this import will be slower as the text needs to be embedded
        wv_client.batch.add_data_object(
            {
                "title": row.title,
                "content": content,
                "url": row.url,
                "c_access": c_access,
            },
            "Article",
        )

        # Progress report every `interval` records
        counter += 1
        if not counter % interval:
            print(f"Imported {counter} articles...")

# Push any objects still waiting in the batch.
wv_client.batch.flush()
print(f"Finished importing {counter} articles.")

In [None]:
# Sanity check: ask Weaviate for the object count of the "Article" class
article_count_query = wv_client.query.aggregate("Article").with_meta_count()
response = article_count_query.do()

print(json.dumps(response, indent=2))

In [None]:
# Sanity check: ask Weaviate for the object count of the "ArticleNoTransformer" class
no_transformer_count_query = wv_client.query.aggregate("ArticleNoTransformer").with_meta_count()
response = no_transformer_count_query.do()

print(json.dumps(response, indent=2))

In [None]:
# Check the embedding length: fetch one "Article" object together with its vector
article_query = wv_client.query.get("Article").with_additional("vector").with_limit(1)
result = article_query.do()

first_article = result['data']['Get']['Article'][0]
vector = first_article['_additional']['vector']
len(vector)

In [None]:
# validate vector length for the objects imported with the CSV's own vectors.
# The import loop in this notebook writes those objects to the
# "ArticleNoTransformer" class (and the count check queries the same name), so
# that class is queried here instead of "ArticleCustom", which nothing in this
# notebook creates or populates.
class_name = "ArticleNoTransformer"

result = (
    wv_client.query
    .get(class_name)
    .with_additional("vector")
    .with_limit(1)
    .do()
)

# The response nests the returned objects under data -> Get -> <class name>.
vector = result['data']['Get'][class_name][0]['_additional']['vector']
len(vector)