In [None]:
!pip install weaviate-client --quiet

<mark>Based on the Weaviate Wikipedia tutorial:<br></mark>
https://weaviate.io/developers/weaviate/tutorials/wikipedia

In [None]:
import ast
import boto3
import json
import pandas as pd
import weaviate

In [None]:
# Look up the caller's AWS account ID through STS and derive the
# per-account S3 bucket name from it.
sts = boto3.client('sts')
caller_identity = sts.get_caller_identity()
account_id = caller_identity.get('Account')
bucket = f'weaviate-{account_id}'

<mark>Confirm the ELB endpoint URL and set <code>elb_endpoint</code> below to its host name (without the <code>http://</code> prefix)<br></mark>

In [None]:
# Placeholder: fill in the ELB host name before running the cells below.
# The value is interpolated into "http://{elb_endpoint}" when the client is
# created, so it should not include the scheme.
elb_endpoint = ''

In [None]:
# Instantiate the client.
# Fail fast with a clear message when the endpoint placeholder was left empty,
# instead of trying to connect to the malformed URL "http://".
if not elb_endpoint.strip():
    raise ValueError(
        "elb_endpoint is empty; set it to the ELB host name before creating the client."
    )

wv_client = weaviate.Client(url=f"http://{elb_endpoint}")

In [None]:
# Display the schema currently returned by the Weaviate client
wv_client.schema.get()

In [None]:
# Names of the two Weaviate classes populated by this notebook:
# - article_class_name: objects are added without a vector (Weaviate creates the embedding)
# - article_no_vector_class_name: objects are added together with the vector read from the CSV
article_class_name = "Article"
article_no_vector_class_name = "ArticleNoTransformer"

In [None]:
# ===== Import data =====
# Batch import settings, applied to the shared client in one call
batch_settings = {"batch_size": 100, "dynamic": True}
wv_client.batch.configure(**batch_settings)

# Progress-reporting state used by the import cells
counter = 0       # number of records added so far
interval = 1000   # print progress every this many records

In [None]:
def get_iterator(chunksize=100, nrows=1000):
    """Return a lazy, chunked reader over the embedded Wikipedia articles CSV.

    The CSV is read from the ``bucket`` defined earlier in the notebook, and
    only the columns needed for the import are loaded.

    Args:
        chunksize: Number of rows per yielded DataFrame chunk.
        nrows: Maximum number of rows to read in total; pass ``None`` to
            read the whole file.

    Returns:
        The object returned by ``pd.read_csv`` when ``chunksize`` is set; it
        yields one DataFrame per chunk when iterated and can only be
        consumed once.
    """
    csv_iterator = pd.read_csv(
        f's3://{bucket}/articles/vector_database_wikipedia_articles_embedded.csv',
        usecols=['id', 'url', 'title', 'text', 'content_vector'],
        chunksize=chunksize,  # number of rows per chunk
        nrows=nrows  # optionally limit the number of rows to import
    )

    return csv_iterator

In [None]:
# Review one row of data.

# Create a pandas dataframe iterator with lazy-loading,
# so we don't load all records in RAM at once.
csv_iterator = get_iterator()

# Only the first chunk is needed. A missing or empty chunk is reported here
# rather than surfacing later as a NameError in the inspection cells below.
chunk = next(iter(csv_iterator), None)
if chunk is None or chunk.empty:
    raise ValueError("No rows were read from the articles CSV; check the bucket and object key.")

row = chunk.iloc[0]
title = row.title
content = row.text
url = row.url
# Convert the vector from its CSV string form back to a Python object
vector = ast.literal_eval(row.content_vector)

In [None]:
# Title of the sampled row
title

In [None]:
# First 1000 characters of the sampled row's text
content[:1000]

In [None]:
# URL of the sampled row
url

In [None]:
# First 10 elements of the vector parsed from the sampled row
vector[:10]

In [None]:
# Number of elements in the sampled row's vector
len(vector)

In [None]:
# Re-create the iterator: the one used for the sample row above has already
# been partially consumed, and the import below should start from the first row.
csv_iterator = get_iterator()

In [None]:
# Iterate through the dataframe chunks and add each CSV record, together with
# its vector from the CSV, to the batch for `article_no_vector_class_name`.

# Reset the progress counter so that re-running this cell reports only this
# run's total (the second import cell resets it the same way).
counter = 0

# Terms used to derive "custom_tags". Matching is a case-insensitive substring
# test, so e.g. 'art' also matches inside longer words such as "part".
access_terms = ['france','art','car','football','sports','food','travel']

for chunk in csv_iterator:
    for index, row in chunk.iterrows():

        content = row.text

        # Lower-case the text once per record instead of once per term
        content_lower = content.lower()
        custom_tags = [term for term in access_terms if term in content_lower]

        # Records that match no term are tagged 'open'
        if not custom_tags:
            custom_tags.append('open')

        properties = {
            "title": row.title,
            "content": content,
            "url": row.url,
            "custom_tags": custom_tags
        }

        # Convert the vector from CSV string back to array of floats
        vector = ast.literal_eval(row.content_vector)

        # Add the object to the batch, and set its vector embedding
        wv_client.batch.add_data_object(properties, article_no_vector_class_name, vector=vector)

        # Calculate and display progress
        counter += 1
        if counter % interval == 0:
            print(f"Imported {counter} articles...")

# Flush the batch after the last record has been added
wv_client.batch.flush()
print(f"Finished importing {counter} articles.")

In [None]:
# Re-create the iterator: the previous import loop iterated over all of its
# chunks, so a fresh reader is needed for the second import.
csv_iterator = get_iterator()

In [None]:
# Second pass over the CSV: add every record to the batch for
# `article_class_name` without supplying a vector.
counter = 0
access_terms = ['france','art','car','football','sports','food','travel']

for chunk in csv_iterator:
    for index, row in chunk.iterrows():
        content = row.text
        lowered = content.lower()

        # Tag with every access term found in the text; fall back to 'open'
        custom_tags = [term for term in access_terms if term in lowered] or ['open']

        properties = {
            "title": row.title,
            "content": content,
            "url": row.url,
            "custom_tags": custom_tags
        }

        # No vector is passed here, so Weaviate creates the embedding itself.
        # NOTE: this import is slower because the text needs to be embedded.
        wv_client.batch.add_data_object(properties, article_class_name)

        # Progress report every `interval` records
        counter += 1
        if not counter % interval:
            print(f"Imported {counter} articles...")

wv_client.batch.flush()
print(f"Finished importing {counter} articles.")

In [None]:
# Validate counts: aggregate the object count for `article_class_name`
count_query = wv_client.query.aggregate(article_class_name).with_meta_count()
response = count_query.do()

print(json.dumps(response, indent=2))

In [None]:
# Validate counts: aggregate the object count for `article_no_vector_class_name`
count_query = wv_client.query.aggregate(article_no_vector_class_name).with_meta_count()
response = count_query.do()

print(json.dumps(response, indent=2))

In [None]:
# Validate vector length: fetch one object from `article_class_name`
# together with its vector, then show how many elements the vector has.
vector_query = wv_client.query.get(article_class_name).with_additional("vector").with_limit(1)
result = vector_query.do()

first_object = result['data']['Get'][article_class_name][0]
vector = first_object['_additional']['vector']
len(vector)

In [None]:
# Validate vector length: fetch one object from `article_no_vector_class_name`
# together with its vector, then show how many elements the vector has.
vector_query = wv_client.query.get(article_no_vector_class_name).with_additional("vector").with_limit(1)
result = vector_query.do()

first_object = result['data']['Get'][article_no_vector_class_name][0]
vector = first_object['_additional']['vector']
len(vector)