In [1]:
# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the "License")

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License

In [2]:
#Imports Required for the notebook
import pandas as pd
import numpy as np
import apache_beam as beam
from apache_beam.ml.transforms.base import MLTransform
from apache_beam.transforms.enrichment import Enrichment
from apache_beam.ml.transforms.embeddings.huggingface import SentenceTransformerEmbeddings
from apache_beam.dataframe.io import read_json
from apache_beam.io.gcp.gcsio import GcsIO
import tempfile

import redis
import redis_connector
import redis_enrichment
from redis_connector import *
from redis_enrichment import *
from redis.commands.search.indexDefinition import (IndexDefinition,IndexType)
from redis.commands.search.query import Query
from redis.commands.search.field import (TextField,VectorField)
from chunks_generation import *

from google.cloud import storage

In [3]:
#To check beam version installed 
beam.__version__

'2.56.0'

In [4]:
# Create a Google Cloud Storage client, bound to the 'apache-beam-testing' project, for reading data from GCS.
client = storage.Client(project = 'apache-beam-testing')

In [5]:
# Wrap the storage client in a Beam GcsIO object; its instance methods (open, get_bucket) are called in the cells below.
gcs = GcsIO(storage_client = client)

# TODO: Next Step
Load data from Google Cloud Storage

In [6]:
# Open the Wikipedia JSON blob on GCS in read mode, with a 16 MiB read buffer.
data = gcs.open(
    filename='gs://hf_wikipedia_dataset/hf_wikipedia.json',
    mode='r',
    read_buffer_size=16 * 1024 * 1024,
    mime_type='application/json',
)

In [7]:
data

<apache_beam.io.gcp.gcsio.BeamBlobReader at 0x143adbbe0>

In [8]:
# NOTE(review): this prints the bound `read` method object itself, not the blob contents.
# If the intent is to actually read the file, it should call `data.read()` — confirm.
print(data.read)

<bound method BlobReader.read of <apache_beam.io.gcp.gcsio.BeamBlobReader object at 0x143adbbe0>>


In [9]:
# Fetch the bucket object for 'hf_wikipedia_dataset' through GcsIO and print it.
bucket = gcs.get_bucket('hf_wikipedia_dataset')
print(bucket)

<Bucket: hf_wikipedia_dataset>


# For now, read the JSON data locally

In [10]:
# Read data locally 
# output = read_json(path ='hf_small_wikipedia.json', orient = 'records')

In [11]:
import json

# Load the local sample of the Wikipedia dataset into memory.
# The encoding is passed explicitly because the article text contains non-ASCII
# characters and the platform's default encoding is not guaranteed to be UTF-8.
# json.load parses straight from the file handle, so no intermediate string is needed.
with open('hf_small_wikipedia.json', 'r', encoding='utf-8') as json_file:
    contents = json.load(json_file)


print(contents)




In [12]:
type(contents)

list

# Create Redis Client for connecting to Redis Vector Database

In [13]:
# Connection settings for the Redis server
REDIS_HOST = "localhost"
REDIS_PORT = 6379
REDIS_PASSWORD = ""  # default for passwordless Redis

# Build the client from the settings above, then ping the server to verify the connection
redis_connection_kwargs = dict(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)
redis_client = redis.Redis(**redis_connection_kwargs)
redis_client.ping()

True

# Creating a Search Index
The cells below show how to specify and create a search index in the Redis vector database. The steps are:

1) Set some constants for defining our index like the distance metric and the index name
2) Define the index schema with RediSearch fields
3) Create the index

In [14]:
#Constants
EMBEDDING_MODEL = 'all-MiniLM-L6-v2' # Embedding model name to be use with ML Transform
VECTOR_DIM = 384                     # length of the vector for above embedding model
VECTOR_NUMBER = 2                    # initial number of vectors
INDEX_NAME = "embeddings-index"      # name of the search index      
PREFIX = "doc"                       # prefix for the document keys           
DISTANCE_METRIC = "COSINE"           # distance metric for the vectors (ex. COSINE, IP, L2)

In [15]:
# Define RediSearch fields for each of the columns in the dataset
url = TextField(name="url")
title = TextField(name="title")
title_embedding = VectorField("title_vector",
    "FLAT", {
        "TYPE": "FLOAT32",
        "DIM": VECTOR_DIM,
        "DISTANCE_METRIC": DISTANCE_METRIC,
        "INITIAL_CAP": VECTOR_NUMBER,
    }
)

text = TextField(name="text")
text_embedding = VectorField("text_vector",
    "FLAT", {
        "TYPE": "FLOAT32",
        "DIM": VECTOR_DIM,
        "DISTANCE_METRIC": DISTANCE_METRIC,
        "INITIAL_CAP": VECTOR_NUMBER,
    }
)
fields = [url, title, title_embedding, text, text_embedding]


In [16]:
# Check if index exists
try:
    redis_client.ft(INDEX_NAME).info()
    print("Index already exists")
except:
    # Create RediSearch Index
    redis_client.ft(INDEX_NAME).create_index(
        fields = fields,
        definition = IndexDefinition(prefix=[PREFIX], index_type=IndexType.HASH)
)

Index already exists


# Creating Knowledge Base in Redis Vector Database
After creating a search index, we can load documents into it. We will use the documents loaded earlier from the local JSON file.

In [17]:
#TODO: Currently getting error while using read_json() 
#Insertion Pipeline using read_json ()



# artifact_location = tempfile.mkdtemp()
# generate_embedding_fn = SentenceTransformerEmbeddings(model_name= EMBEDDING_MODEL,
#                                                                 columns=['text'])
# with beam.Pipeline() as p:
#     embeddings = (
#         p  
#         | "Read JSON data" >> read_json(path ='hf_small_wikipedia.json', orient = 'records') 
#         | "Insert document in Redis" >> InsertDocInRedis(host='127.0.0.1',port=6379, batch_size=10)
#         | "Print" >> beam.Map(print)
#         # | "Generate Embeddings" >> MLTransform(write_artifact_location=artifact_location).with_transform(generate_embedding_fn) 
#         # | "Insert Embedding in Redis" >> WriteToRedis(host='127.0.0.1',port=6379, batch_size=10)
#     )


In [18]:
# Insertion pipeline: read the loaded documents, split them into chunks, insert the
# chunked documents in Redis, generate embeddings, and insert the embeddings in Redis.
# The model name and the Redis host/port come from the notebook constants so this cell
# targets the same server and model as the client and index defined above.

artifact_location = tempfile.mkdtemp()  # MLTransform writes its artifacts here
generate_embedding_fn = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL,
                                                      columns=['title', 'text'])
with beam.Pipeline() as p:
    embeddings = (
        p
        | "Read data" >> beam.Create(contents)
        | "Generate text chunks" >> ChunksGeneration(chunk_size=500, chunk_overlap=0, chunking_strategy=ChunkingStrategy.SPLIT_BY_TOKENS)
        | "Insert document in Redis" >> InsertDocInRedis(host=REDIS_HOST, port=REDIS_PORT, batch_size=10)
        | "Generate Embeddings" >> MLTransform(write_artifact_location=artifact_location).with_transform(generate_embedding_fn)
        | "Insert Embedding in Redis" >> InsertEmbeddingInRedis(host=REDIS_HOST, port=REDIS_PORT, batch_size=10, embedded_columns=['title', 'text'])
    )

INFO:root:Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.


INFO:apache_beam.runners.worker.statecache:Creating state cache with size 104857600
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
2024-07-15 16:27:53.444091: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTr

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserting doc_key=doc_12_section_1, key=id, value=12
Inserting doc_key=doc_12_section_1, key=url, value=https://en.wikipedia.org/wiki/Anarchism
Inserting doc_key=doc_12_section_1, key=title, value=Anarchism
Inserting doc_key=doc_12_section_1, key=text, value=anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation - states, and capitalism. anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. as a historically left - wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as the libertarian wing of the socialist movement ( libertarian socialism ). humans have lived in societies without formal hierarchies long before the establishment of states, realms, or empires. with the rise of organised hierarchical bod

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserting doc_key=doc_12_section_11, key=id, value=12
Inserting doc_key=doc_12_section_11, key=url, value=https://en.wikipedia.org/wiki/Anarchism
Inserting doc_key=doc_12_section_11, key=title, value=Anarchism
Inserting doc_key=doc_12_section_11, key=text, value=- capitalist, anti - war and anti - globalisation movements. anarchists became known for their involvement in protests against the world trade organization ( wto ), the group of eight and the world economic forum. during the protests, ad hoc leaderless anonymous cadres known as black blocs engaged in rioting, property destruction and violent confrontations with the police. other organisational tactics pioneered at this time include affinity groups, security culture and the use of decentralised technologies such as the internet. a significant event of this period was the confrontations at the 1999 seattle wto conference. anarchist ideas have been influential in the development of the zapatistas in mexico and the democratic feder

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserting doc_key=doc_12_section_21, key=id, value=12
Inserting doc_key=doc_12_section_21, key=url, value=https://en.wikipedia.org/wiki/Anarchism
Inserting doc_key=doc_12_section_21, key=title, value=Anarchism
Inserting doc_key=doc_12_section_21, key=text, value=state operators, they also engage in the struggle against fascists and racists, taking anti - fascist action and mobilizing to prevent hate rallies from happening. evolutionary anarchists commonly employ direct action. this can take the form of disrupting and protesting against unjust hierarchy, or the form of self - managing their lives through the creation of counter - institutions such as communes and non - hierarchical collectives. decision - making is often handled in an anti - authoritarian way, with everyone having equal say in each decision, an approach known as horizontalism. contemporary - era anarchists have been engaging with various grassroots movements that are more or less based on horizontalism, although not exp

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserting doc_key=doc_12_section_31, key=id, value=12
Inserting doc_key=doc_12_section_31, key=url, value=https://en.wikipedia.org/wiki/Anarchism
Inserting doc_key=doc_12_section_31, key=title, value=Anarchism
Inserting doc_key=doc_12_section_31, key=text, value=, among others, are functions which could hardly be performed in a community in which there was no central government. " another common criticism of anarchism is that it fits a world of isolation in which only the small enough entities can be self - governing ; a response would be that major anarchist thinkers advocated anarchist federalism. another criticism of anarchism is the belief that it is inherently unstable : that an anarchist society would inevitably evolve back into a state. thomas hobbes and other early social contract theorists argued that the state emerges in response to natural anarchy in order to protect the people's interests and keep order. philosopher robert nozick argued that a " night - watchman state ", or

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserting doc_key=doc_39_section_16, key=id, value=39
Inserting doc_key=doc_39_section_16, key=url, value=https://en.wikipedia.org/wiki/Albedo
Inserting doc_key=doc_39_section_16, key=title, value=Albedo
Inserting doc_key=doc_39_section_16, key=text, value=leads to reduced precipitation efficiency and increased lifetime of the cloud ( second indirect effect ). in extremely polluted cities like delhi, aerosol pollutants influence local weather and induce an urban cool island effect during the day. black carbon another albedo - related effect on the climate is from black carbon particles. the size of this effect is difficult to quantify : the intergovernmental panel on climate change estimates that the global mean radiative forcing for black carbon aerosols from fossil fuels is + 0. 2 w m−2, with a range + 0. 1 to + 0. 4 w m−2. black carbon is a bigger cause of the melting of the polar ice cap in the arctic than carbon dioxide due to its effect on the albedo. astronomical albedo in astro

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserting doc_key=doc_303_section_5, key=id, value=303
Inserting doc_key=doc_303_section_5, key=url, value=https://en.wikipedia.org/wiki/Alabama
Inserting doc_key=doc_303_section_5, key=title, value=Alabama
Inserting doc_key=doc_303_section_5, key=text, value=the second - largest complex of the classic middle mississippian era, after cahokia in present - day illinois, which was the center of the culture. analysis of artifacts from archaeological excavations at moundville were the basis of scholars'formulating the characteristics of the southeastern ceremonial complex ( secc ). contrary to popular belief, the secc appears to have no direct links to mesoamerican culture but developed independently. the ceremonial complex represents a major component of the religion of the mississippian peoples ; it is one of the primary means by which their religion is understood. among the historical tribes of native american people living in present - day alabama at the time of european contact were th

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserting doc_key=doc_303_section_15, key=id, value=303
Inserting doc_key=doc_303_section_15, key=url, value=https://en.wikipedia.org/wiki/Alabama
Inserting doc_key=doc_303_section_15, key=title, value=Alabama
Inserting doc_key=doc_303_section_15, key=text, value=1962 ) and reynolds v. sims ( 1964 ), the court ruled that the principle of " one man, one vote " needed to be the basis of both houses of state legislatures, and that their districts had to be based on population rather than geographic counties. african americans continued to press in the 1950s and 1960s to end disenfranchisement and segregation in the state through the civil rights movement, including legal challenges. in 1954, the u. s. supreme court ruled in brown v. board of education that public schools had to be desegregated, but alabama was slow to comply. during the 1960s, under governor george wallace, alabama resisted compliance with federal demands for desegregation. the civil rights movement had notable events in 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserting doc_key=doc_303_section_35, key=id, value=303
Inserting doc_key=doc_303_section_35, key=url, value=https://en.wikipedia.org/wiki/Alabama
Inserting doc_key=doc_303_section_35, key=title, value=Alabama
Inserting doc_key=doc_303_section_35, key=text, value=iron and steel products ( including cast - iron and steel pipe ) ; paper, lumber, and wood products ; mining ( mostly coal ) ; plastic products ; cars and trucks ; and apparel. in addition, alabama produces aerospace and electronic products, mostly in the huntsville area, the location of nasa's george c. marshall space flight center and the u. s. army materiel command, headquartered at redstone arsenal. a great deal of alabama's economic growth since the 1990s has been due to the state's expanding automotive manufacturing industry. located in the state are honda manufacturing of alabama, hyundai motor manufacturing alabama, mercedes - benz u. s. international, and toyota motor manufacturing alabama, as well as their various su

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserting doc_key=doc_303_section_55, key=id, value=303
Inserting doc_key=doc_303_section_55, key=url, value=https://en.wikipedia.org/wiki/Alabama
Inserting doc_key=doc_303_section_55, key=title, value=Alabama
Inserting doc_key=doc_303_section_55, key=text, value=, both founded in 1830. accreditation of academic programs is through the southern association of colleges and schools ( sacs ) as well as other subject - focused national and international accreditation agencies such as the association for biblical higher education ( abhe ), the council on occupational education ( coe ), and the accrediting council for independent colleges and schools ( acics ). according to the 2011 u. s. news & world report, alabama had three universities ranked in the top 100 public schools in america ( university of alabama at 31, auburn university at 36, and university of alabama at birmingham at 73 ). according to the 2012 u. s. news & world report, alabama had four tier one universities ( university of

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserting doc_key=doc_305_section_23, key=id, value=305
Inserting doc_key=doc_305_section_23, key=url, value=https://en.wikipedia.org/wiki/Achilles
Inserting doc_key=doc_305_section_23, key=title, value=Achilles
Inserting doc_key=doc_305_section_23, key=text, value=the city was visited in 333 bc by alexander the great, who envisioned himself as the new achilles and carried the iliad with him, but his court biographers do not mention the spear ; however, it was shown in the time of pausanias in the 2nd century ce. achilles, ajax and a game of petteia numerous paintings on pottery have suggested a tale not mentioned in the literary traditions. at some point in the war, achilles and ajax were playing a board game ( petteia ). they were absorbed in the game and oblivious to the surrounding battle. the trojans attacked and reached the heroes, who were saved only by an intervention of athena. worship and heroic cult the tomb of achilles, extant throughout antiquity in troad, was venerated by

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserting doc_key=doc_307_section_10, key=id, value=307
Inserting doc_key=doc_307_section_10, key=url, value=https://en.wikipedia.org/wiki/Abraham%20Lincoln
Inserting doc_key=doc_307_section_10, key=title, value=Abraham Lincoln
Inserting doc_key=doc_307_section_10, key=text, value=lincoln i kept my mouth shut. lincoln did not note what his children were doing or had done. " the deaths of their sons, eddie and willie, had profound effects on both parents. lincoln suffered from " melancholy ", a condition now thought to be clinical depression. later in life, mary struggled with the stresses of losing her husband and sons, and robert committed her for a time to an asylum in 1875. early career and militia service during 1831 and 1832, lincoln worked at a general store in new salem, illinois. in 1832, he declared his candidacy for the illinois house of representatives, but interrupted his campaign to serve as a captain in the illinois militia during the black hawk war. when lincoln returned

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserting doc_key=doc_307_section_30, key=id, value=307
Inserting doc_key=doc_307_section_30, key=url, value=https://en.wikipedia.org/wiki/Abraham%20Lincoln
Inserting doc_key=doc_307_section_30, key=title, value=Abraham Lincoln
Inserting doc_key=doc_307_section_30, key=text, value=##vocable ". a few weeks before the war, lincoln sent a letter to every governor informing them congress had passed a joint resolution to amend the constitution. on february 11, 1861, lincoln gave a particularly emotional farewell address upon leaving springfield ; he would never again return to springfield alive. lincoln traveled east in a special train. due to secessionist plots, a then - unprecedented attention to security was given to him and his train. en route to his inauguration, lincoln addressed crowds and legislatures across the north. the president - elect evaded suspected assassins in baltimore. on february 23, 1861, he arrived in disguise in washington, d. c., which was placed under substantial m

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserting doc_key=doc_307_section_60, key=id, value=307
Inserting doc_key=doc_307_section_60, key=url, value=https://en.wikipedia.org/wiki/Abraham%20Lincoln
Inserting doc_key=doc_307_section_60, key=title, value=Abraham Lincoln
Inserting doc_key=doc_307_section_60, key=text, value=have resulted in mercury poisoning. several claims have been made that lincoln's health was declining before the assassination. these are often based on photographs of lincoln appearing to show weight loss and muscle wasting. it is also suspected that he might have had a rare genetic disease such as marfan syndrome or multiple endocrine neoplasia type 2b. legacy republican values lincoln's redefinition of republican values has been stressed by historians such as john patrick diggins, harry v. jaffa, vernon burton, eric foner, and herman j. belz. lincoln called the declaration of independence — which emphasized freedom and equality for all — the " sheet anchor " of republicanism beginning in the 1850s. he did 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserting doc_key=doc_308_section_30, key=id, value=308
Inserting doc_key=doc_308_section_30, key=url, value=https://en.wikipedia.org/wiki/Aristotle
Inserting doc_key=doc_308_section_30, key=title, value=Aristotle
Inserting doc_key=doc_308_section_30, key=text, value=to the faculty of imagination, phantasia. one component of aristotle's theory of dreams disagrees with previously held beliefs. he claimed that dreams are not foretelling and not sent by a divine being. aristotle reasoned naturalistically that instances in which dreams do resemble future events are simply coincidences. aristotle claimed that a dream is first established by the fact that the person is asleep when they experience it. if a person had an image appear for a moment after waking up or if they see something in the dark it is not considered a dream because they were awake when it occurred. secondly, any sensory experience that is perceived while a person is asleep does not qualify as part of a dream. for example, i

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Inserting doc_key=doc_309_section_7, key=id, value=309
Inserting doc_key=doc_309_section_7, key=url, value=https://en.wikipedia.org/wiki/An%20American%20in%20Paris
Inserting doc_key=doc_309_section_7, key=title, value=An American in Paris
Inserting doc_key=doc_309_section_7, key=text, value=, were working to make scores available to the public that represent gershwin's true intent. it was unknown whether the critical score would include the four minutes of material gershwin later deleted from the work ( such as the restatement of the blues theme after the faster 12 bar blues section ), or if the score would document changes in the orchestration during gershwin's composition process. the score to an american in paris was scheduled to be issued first in a series of scores to be released. the entire project was expected to take 30 to 40 years to complete, but an american in paris was planned to be an early volume in the series. two urtext editions of the work were published by the german 

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Inserting doc_key=doc_324_section_35, key=id, value=324
Inserting doc_key=doc_324_section_35, key=url, value=https://en.wikipedia.org/wiki/Academy%20Awards
Inserting doc_key=doc_324_section_35, key=title, value=Academy Awards
Inserting doc_key=doc_324_section_35, key=text, value=8, 2022, the academy made an announcement via a letter sent by president david rubin and ceo dawn hudson informing the public that will smith had received a ten - year ban from attending the oscars as a result of the incident. refusals of the award some winners critical of the academy awards have boycotted the ceremonies and refused to accept their oscars. the first to do so was screenwriter dudley nichols ( best writing in 1935 for the informer ). nichols boycotted the 8th academy awards ceremony because of conflicts between the academy and the writers'guild. nichols eventually accepted the 1935 award three years later, at the 1938 ceremony. nichols was nominated for three further academy awards during his car

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:root:BatchElements statistics: element_count=349 batch_count=23 next_batch_size=16 timings=[(1, 0.20804810523986816), (2, 0.5161137580871582), (1, 0.18439197540283203), (2, 0.24113106727600098), (1, 0.1717827320098877), (2, 0.20722103118896484), (4, 0.6056740283966064), (6, 0.7338650226593018), (12, 1.0712800025939941), (24, 1.638286828994751), (14, 1.171525001525879), (14, 0.9513740539550781), (19, 1.4254930019378662), (25, 1.769416093826294), (27, 1.8191680908203125), (27, 1.711655855178833), (26, 1.8267521858215332), (31, 2.0932071208953857), (31, 1.7969589233398438), (36, 2.6091420650482178), (35, 2.787160873413086), (8, 1.0164883136749268)]


## Pipeline Steps:
Create an embeddings transform, which is used for storing the text and its embedding in the Redis vector database.


# Running Search Queries/ Perform Enrichment

## Pipeline Steps:
Create a search transform, which emits the document ID and vector score along with the matching text from the knowledge base.


In [19]:
# Enrichment pipeline: generate an embedding for the query text, pass it through the
# Redis enrichment handler, and print each resulting element.
# The model name and the Redis host/port come from the notebook constants so the query
# uses the same model and server as the insertion pipeline.

data = [{'text': 'What is Anarchy ?'}]

artifact_location = tempfile.mkdtemp()  # MLTransform writes its artifacts here
generate_embedding_fn = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL,
                                                      columns=['text'])

redis_handler = RedisEnrichmentHandler(redis_host=REDIS_HOST, redis_port=REDIS_PORT)

with beam.Pipeline() as p:
    _ = (
        p
        | "Create" >> beam.Create(data)
        | "Generate Embedding" >> MLTransform(write_artifact_location=artifact_location).with_transform(generate_embedding_fn)
        | "Enrich W/ Redis" >> Enrichment(redis_handler)
        | "Print" >> beam.Map(print)
    )

INFO:root:Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.
INFO:apache_beam.runners.worker.statecache:Creating state cache with size 104857600
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:root:BatchElements statistics: element_count=1 batch_count=1 next_batch_size=1 timings=[]


Row(text=[0.004666077438741922, 0.05869913473725319, -0.07399024069309235, 0.022393187507987022, 0.039686284959316254, -0.034507881850004196, 0.064857617020607, -0.047807302325963974, -0.03489216789603233, 0.06350446492433548, 0.0360037162899971, 0.03880435600876808, 0.0589592307806015, -0.0789710283279419, -0.032882459461688995, -0.045789338648319244, -0.021030493080615997, -0.05721370130777359, -0.01570642925798893, 0.06773950159549713, 0.0477975532412529, 0.02080758847296238, -0.07664106041193008, 0.04821384325623512, -0.052367933094501495, 0.07436149567365646, -0.024946363642811775, -0.03843500837683678, -0.05065334215760231, -0.008651865646243095, 0.016191929578781128, -0.05380123108625412, 0.04309113323688507, 0.0409851111471653, -0.01066699717193842, 0.021276379004120827, 0.06583339720964432, -0.05280669033527374, -0.01774919219315052, -0.061985645443201065, -0.02252737060189247, -0.012123598717153072, -0.011422254145145416, -0.029680127277970314, -0.05887051299214363, 0.0390109