In [42]:
#Imports Required for the notebook
import pandas as pd
import numpy as np
import apache_beam as beam
from apache_beam.ml.transforms.base import MLTransform
from apache_beam.transforms.enrichment import Enrichment
from apache_beam.ml.transforms.embeddings.huggingface import SentenceTransformerEmbeddings
from apache_beam.dataframe.io import read_json
from apache_beam.io.gcp.gcsio import GcsIO
import tempfile

import redis
import redis_connector
import redis_enrichment
from redis_connector import *
from redis_enrichment import *
from redis.commands.search.indexDefinition import (IndexDefinition,IndexType)
from redis.commands.search.query import Query
from redis.commands.search.field import (TextField,VectorField)


from google.cloud import storage

In [18]:
#To check beam version installed 
beam.__version__

'2.56.0'

In [19]:
#create google cloud storage client for reading data from google cloud
client = storage.Client(project = 'apache-beam-testing')

In [20]:
#creating GSCIO class object for calling different instance method
gcs = GcsIO(storage_client = client)

# TODO: Next Step
Load data from Google CLoud Storage

In [21]:
# open the json file in read mode for reading data
data = gcs.open(filename='gs://hf_wikipedia_dataset/hf_wikipedia.json',mode='r',read_buffer_size=16777216, mime_type='application/json')

In [22]:
data

<apache_beam.io.gcp.gcsio.BeamBlobReader at 0x144451c60>

In [23]:
print(data.read)

<bound method BlobReader.read of <apache_beam.io.gcp.gcsio.BeamBlobReader object at 0x144451c60>>


In [24]:
bucket = gcs.get_bucket('hf_wikipedia_dataset')
print(bucket)

<Bucket: hf_wikipedia_dataset>


# For now Reading json data locally

In [25]:
# Read data locally 
output = read_json(path ='hf_small_wikipedia.json', orient = 'records')

In [37]:
import json

with open('hf_small_wikipedia.json', 'r') as j:
     contents = json.loads(j.read())


print(contents)




JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [40]:
type(contents)

list

# Create Redis Client for connecting to Redis Vector Database

In [43]:
REDIS_HOST =  "localhost"
REDIS_PORT = 6379
REDIS_PASSWORD = "" # default for passwordless Redis

# Connect to Redis
redis_client = redis.Redis(
    host=REDIS_HOST,
    port=REDIS_PORT,
    password=REDIS_PASSWORD
)
redis_client.ping()

True

# Creating a Search Index
Below cells will show how to specify and create a search index in Redis vector DB. Below are the following steps:

1) Set some constants for defining our index like the distance metric and the index name
2) Define the index schema with RediSearch fields
3) Create the index

In [44]:
#Constants
EMBEDDING_MODEL = 'all-MiniLM-L6-v2' # Embedding model name to be use with ML Transform
VECTOR_DIM = 384                     # length of the vector for above embedding model
VECTOR_NUMBER = 2                    # initial number of vectors
INDEX_NAME = "embeddings-index"      # name of the search index      
PREFIX = "doc"                       # prefix for the document keys           
DISTANCE_METRIC = "COSINE"           # distance metric for the vectors (ex. COSINE, IP, L2)

In [45]:
# Define RediSearch fields for each of the columns in the dataset
text = TextField(name="text")
text_embedding = VectorField("text_vector",
    "FLAT", {
        "TYPE": "FLOAT32",
        "DIM": VECTOR_DIM,
        "DISTANCE_METRIC": DISTANCE_METRIC,
        "INITIAL_CAP": VECTOR_NUMBER,
    }
)
fields = [text,text_embedding]

In [47]:
# Check if index exists
try:
    redis_client.ft(INDEX_NAME).info()
    print("Index already exists")
except:
    # Create RediSearch Index
    redis_client.ft(INDEX_NAME).create_index(
        fields = fields,
        definition = IndexDefinition(prefix=[PREFIX], index_type=IndexType.HASH)
)

Index already exists


# Creating Knowledge Base in Redis Vector Database
After creating a search index, we can load documents into it. We will use the same documents we used in the previous cell.

In [48]:
#Insertion Pipeline

# data = [
#     {   
#         'text': 'I love Beam'
#     },
#     {
#         'text': 'Beam is awesome'
#     },
# ]

# data = contents

artifact_location = tempfile.mkdtemp()
generate_embedding_fn = SentenceTransformerEmbeddings(model_name= EMBEDDING_MODEL,
                                                                columns=['text'])
with beam.Pipeline() as p:
    embeddings = (
        p  
        | "Read JSON data" >> read_json(path ='hf_small_wikipedia.json', orient = 'records') 
        | "Insert document in Redis" >> InsertDocInRedis(host='127.0.0.1',port=6379, batch_size=10)
        | "Print" >> beam.Map(print)
        # | "Generate Embeddings" >> MLTransform(write_artifact_location=artifact_location).with_transform(generate_embedding_fn) 
        # | "Insert Embedding in Redis" >> WriteToRedis(host='127.0.0.1',port=6379, batch_size=10)
    )


INFO:root:Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.


  | "Insert document in Redis" >> InsertDocInRedis(host='127.0.0.1',port=6379, batch_size=10)


RuntimeError: A transform with label "[48]: ToPCollection(self)" already exists in the pipeline. To apply a transform with a specified label write pvalue | "label" >> transform

## Pipeline Steps:
Create a embeddings transform, which is used for storing the text and its embedding in redis vector db


# Running Search Queries/ Perform Enrichment

## Pipeline Steps:
Create a search transform, which emits the document Id, vector score along with the matching text from knowledge base


In [38]:
#  Enchriment Pipeline currently work on test data
data = [
    {
        'text':"I love football"
    },
    {
        'text':"I am Invincible"
    },
]

# TODO: Need to test this on wikipedia dataset
artifact_location = tempfile.mkdtemp()
generate_embedding_fn = SentenceTransformerEmbeddings(model_name='jinaai/jina-embeddings-v2-base-en',
                                                                columns=['text'])

redis_handler = RedisEnrichmentHandler(redis_host='127.0.0.1', redis_port=6379)
                                       

with beam.Pipeline() as p:
  _ = (
      p
      | "Create" >> beam.Create(data)
      | "Generate Embedding" >> MLTransform(write_artifact_location=artifact_location).with_transform(generate_embedding_fn)
      | "Enrich W/ Redis" >> Enrichment(redis_handler)
      | "Print" >> beam.Map(print)
  )

INFO:root:Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.
INFO:apache_beam.runners.worker.statecache:Creating state cache with size 104857600
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: jinaai/jina-embeddings-v2-base-en
Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-base-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.10.in

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Row(text=[-0.5017814636230469, -1.1453269720077515, -2.022660493850708, 0.5617949366569519, -0.07594253122806549, -1.0716463327407837, 1.3645594120025635, -0.9534835815429688, 1.8269987106323242, 1.0323950052261353, 1.625878095626831, -0.5546425580978394, -0.5275714993476868, -0.01012003980576992, -0.2621341645717621, -1.709763765335083, -1.5618873834609985, -0.8101499676704407, 1.4576984643936157, 0.882830798625946, -1.587726354598999, -0.3200444281101227, 0.6059542894363403, 0.1171104684472084, -0.26086029410362244, 0.02267112396657467, 0.33404427766799927, 2.525123119354248, 0.2581425607204437, 3.4028115272521973, -1.7783797979354858, 0.8062160611152649, 1.1194671392440796, 1.2561248540878296, -1.1265093088150024, 0.20649032294750214, -0.2410050630569458, -2.3491196632385254, 0.9520756602287292, -1.8244831562042236, 0.8811019659042358, -1.14892578125, 0.25642699003219604, -1.4633350372314453, 0.42478179931640625, 0.5400516390800476, 1.818000078201294, 1.2350475788116455, 0.384159982

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:root:BatchElements statistics: element_count=2 batch_count=2 next_batch_size=2 timings=[(1, 0.09332108497619629)]


Row(text=[-0.07600141316652298, -1.3932890892028809, -2.0614399909973145, -0.8714038729667664, -1.2364329099655151, 0.3478805124759674, 0.7627274394035339, -2.122854232788086, 0.7977946400642395, 1.7402817010879517, -0.06269895285367966, 1.7256431579589844, -0.7279189825057983, -0.57814621925354, -0.9372726678848267, 0.08565990626811981, 1.2608075141906738, -0.029358232393860817, -0.9640644788742065, 2.4012093544006348, 0.16517102718353271, 0.20934347808361053, -0.7179783582687378, 0.39256319403648376, 0.33998140692710876, -0.5393403768539429, 1.3174787759780884, -0.5689770579338074, -1.1864912509918213, 2.6048786640167236, -0.13977886736392975, -0.781396746635437, -0.8316944241523743, 0.6976686716079712, 0.5793715119361877, 0.3874542713165283, -0.2611779272556305, -1.0188400745391846, 0.3273008167743683, -1.2607438564300537, 1.2402080297470093, -0.9908939599990845, 0.90080726146698, -1.6596912145614624, -1.7075554132461548, -1.2598621845245361, 0.19112332165241241, 0.5801099538803101,