In [1]:
#Imports Required for the notebook
import json
import tempfile

import pandas as pd
import numpy as np
import apache_beam as beam
from apache_beam.ml.transforms.base import MLTransform
from apache_beam.transforms.enrichment import Enrichment
from apache_beam.ml.transforms.embeddings.huggingface import SentenceTransformerEmbeddings
from apache_beam.dataframe.io import read_json
from apache_beam.io.gcp.gcsio import GcsIO

import redis
import redis_connector
import redis_enrichment
from redis_connector import *
from redis_enrichment import *
from redis.commands.search.indexDefinition import (IndexDefinition,IndexType)
from redis.commands.search.query import Query
from redis.commands.search.field import (TextField,VectorField)


from google.cloud import storage

In [2]:
# Show the installed Apache Beam version
beam.__version__

'2.56.0'

In [3]:
# Create a Google Cloud Storage client bound to the 'apache-beam-testing' project;
# it is handed to GcsIO below for reading data from GCS.
client = storage.Client(project = 'apache-beam-testing')

In [4]:
# Create a GcsIO object (backed by the storage client above) whose instance
# methods are used for the GCS calls that follow.
gcs = GcsIO(storage_client = client)

# TODO: Next Step
Load data from Google Cloud Storage

In [5]:
# Open the JSON blob on GCS in read mode.
# read_buffer_size is 16 MiB (16 * 1024 * 1024 = 16777216 bytes).
data = gcs.open(
    filename='gs://hf_wikipedia_dataset/hf_wikipedia.json',
    mode='r',
    read_buffer_size=16777216,
    mime_type='application/json',
)

In [6]:
# Display the reader object returned by gcs.open (its repr only; nothing is read here)
data

<apache_beam.io.gcp.gcsio.BeamBlobReader at 0x13a5f5060>

In [7]:
# NOTE(review): this prints the bound `read` method itself; the method is never
# called, so no bytes are read from the blob. If the content was intended,
# call data.read(...) instead — confirm the intent.
print(data.read)

<bound method BlobReader.read of <apache_beam.io.gcp.gcsio.BeamBlobReader object at 0x13a5f5060>>


In [8]:
# Look up the dataset bucket through GcsIO and print the returned bucket object
bucket = gcs.get_bucket('hf_wikipedia_dataset')
print(bucket)

<Bucket: hf_wikipedia_dataset>


# For now, read the JSON data locally

In [9]:
# Read data locally 
# output = read_json(path ='hf_small_wikipedia.json', orient = 'records')

In [10]:
# Load the small Wikipedia sample from the local JSON file.
# The encoding is stated explicitly because the articles contain non-ASCII
# characters and the platform default encoding is not guaranteed to be UTF-8.
with open('hf_small_wikipedia.json', 'r', encoding='utf-8') as json_file:
    contents = json.load(json_file)

# Report only the size: printing every record floods the cell output.
print(f"Loaded {len(contents)} records")




In [11]:
# Inspect the Python type the JSON payload was parsed into
type(contents)

list

# Create Redis Client for connecting to Redis Vector Database

In [12]:
# Connection settings for the Redis instance
REDIS_HOST = "localhost"
REDIS_PORT = 6379
REDIS_PASSWORD = ""  # default for passwordless Redis

# Build the client, then ping once so the cell output confirms connectivity
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)
redis_client.ping()

True

# Creating a Search Index
The cells below show how to define and create a search index in the Redis vector DB. The steps are:

1) Set some constants for defining our index like the distance metric and the index name
2) Define the index schema with RediSearch fields
3) Create the index

In [13]:
# Constants used to define the embeddings and the search index

# Embedding model name to be used with MLTransform
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
# Length of the vectors for the embedding model above
VECTOR_DIM = 384
# Initial number of vectors
VECTOR_NUMBER = 2
# Name of the search index
INDEX_NAME = "embeddings-index"
# Prefix for the document keys
PREFIX = "doc"
# Distance metric for the vectors (e.g. COSINE, IP, L2)
DISTANCE_METRIC = "COSINE"

In [14]:
# RediSearch schema: one field per dataset column, plus a vector field for
# each embedded column.

def _flat_float32_vector(field_name):
    """Build a FLAT / FLOAT32 vector field that uses the notebook's
    VECTOR_DIM, DISTANCE_METRIC and VECTOR_NUMBER settings."""
    return VectorField(
        field_name,
        "FLAT",
        {
            "TYPE": "FLOAT32",
            "DIM": VECTOR_DIM,
            "DISTANCE_METRIC": DISTANCE_METRIC,
            "INITIAL_CAP": VECTOR_NUMBER,
        },
    )


url = TextField(name="url")
title = TextField(name="title")
title_embedding = _flat_float32_vector("title_vector")

text = TextField(name="text")
text_embedding = _flat_float32_vector("text_vector")

# Order matches the original schema definition
fields = [url, title, title_embedding, text, text_embedding]


In [15]:
# Create the search index only when it does not exist yet.
try:
    redis_client.ft(INDEX_NAME).info()
    print("Index already exists")
except redis.exceptions.ResponseError:
    # The server rejected the info() call, which is how a missing index is
    # reported. Only this error type is caught so that connection failures
    # and interrupts surface instead of being mistaken for "index missing".
    redis_client.ft(INDEX_NAME).create_index(
        fields=fields,
        definition=IndexDefinition(prefix=[PREFIX], index_type=IndexType.HASH),
    )

Index already exists


# Creating Knowledge Base in Redis Vector Database
After creating a search index, we can load documents into it. We will use the same documents we used in the previous cell.

In [16]:
# TODO: read_json() currently raises an error here, so this draft stays commented out.
# Insertion pipeline using read_json()



# artifact_location = tempfile.mkdtemp()
# generate_embedding_fn = SentenceTransformerEmbeddings(model_name= EMBEDDING_MODEL,
#                                                                 columns=['text'])
# with beam.Pipeline() as p:
#     embeddings = (
#         p  
#         | "Read JSON data" >> read_json(path ='hf_small_wikipedia.json', orient = 'records') 
#         | "Insert document in Redis" >> InsertDocInRedis(host='127.0.0.1',port=6379, batch_size=10)
#         | "Print" >> beam.Map(print)
#         # | "Generate Embeddings" >> MLTransform(write_artifact_location=artifact_location).with_transform(generate_embedding_fn) 
#         # | "Insert Embedding in Redis" >> WriteToRedis(host='127.0.0.1',port=6379, batch_size=10)
#     )


In [17]:
# Insertion pipeline: write each document to Redis, generate embeddings for
# the selected columns, then write the embeddings to Redis.

# Columns to embed. Defined once so the embedding transform and the Redis
# writer are always given the same list.
embedded_columns = ['title', 'text']

# Scratch directory handed to MLTransform as its artifact location
artifact_location = tempfile.mkdtemp()

# Use the configured model name rather than repeating the literal
generate_embedding_fn = SentenceTransformerEmbeddings(
    model_name=EMBEDDING_MODEL,
    columns=embedded_columns,
)

# Host and port come from the same settings as redis_client, so the pipeline
# writes to the instance on which the index was created.
with beam.Pipeline() as p:
    embeddings = (
        p
        | "Read data" >> beam.Create(contents)
        | "Insert document in Redis" >> InsertDocInRedis(
            host=REDIS_HOST, port=REDIS_PORT, batch_size=10)
        | "Generate Embeddings" >> MLTransform(
            write_artifact_location=artifact_location
        ).with_transform(generate_embedding_fn)
        | "Insert Embedding in Redis" >> InsertEmbeddingInRedis(
            host=REDIS_HOST, port=REDIS_PORT, batch_size=10,
            embedded_columns=embedded_columns)
    )

INFO:root:Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.


  | "Insert document in Redis" >> InsertDocInRedis(host='127.0.0.1',port=6379, batch_size=10)
  | "Insert Embedding in Redis" >> InsertEmbeddingInRedis(host='127.0.0.1',port=6379, batch_size=10,embedded_columns=['title','text'])
INFO:apache_beam.runners.worker.statecache:Creating state cache with size 104857600
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserting doc_key=doc_12, key=id, value=12
Inserting doc_key=doc_12, key=url, value=https://en.wikipedia.org/wiki/Anarchism
Inserting doc_key=doc_12, key=title, value=Anarchism
Inserting doc_key=doc_12, key=text, value=Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism. Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. As a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as the libertarian wing of the socialist movement (libertarian socialism).

Humans have lived in societies without formal hierarchies long before the establishment of states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. A

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:root:BatchElements statistics: element_count=11 batch_count=8 next_batch_size=2 timings=[(1, 0.7207460403442383), (2, 2.8558189868927), (1, 0.48631715774536133), (2, 0.49552202224731445), (1, 0.3313918113708496), (2, 0.6222660541534424), (1, 0.1517958641052246)]


Inserting doc_key=doc_330, key=id, value=330
Inserting doc_key=doc_330, key=url, value=https://en.wikipedia.org/wiki/Actrius
Inserting doc_key=doc_330, key=title, value=Actrius
Inserting doc_key=doc_330, key=text, value=Actresses (Catalan: Actrius) is a 1997 Catalan language Spanish drama film produced and directed by Ventura Pons and based on the award-winning stage play E.R. by Josep Maria Benet i Jornet. The film has no male actors, with all roles played by females. The film was produced in 1996.

Synopsis 
In order to prepare herself to play a role commemorating the life of legendary actress Empar Ribera, young actress (Mercè Pons) interviews three established actresses who had been the Ribera's pupils: the international diva Glòria Marc (Núria Espert), the television star Assumpta Roca (Rosa Maria Sardà), and dubbing director Maria Caminal (Anna Lizaran).

Cast 
 Núria Espert as Glòria Marc
 Rosa Maria Sardà as Assumpta Roca
 Anna Lizaran as Maria Caminal
 Mercè Pons as Estudiant


## Pipeline Steps:
Create an embeddings transform that stores the text and its embedding in the Redis vector DB.


# Running Search Queries/ Perform Enrichment

## Pipeline Steps:
Create a search transform that emits the document ID and vector score, along with the matching text from the knowledge base.


In [18]:
# Enrichment pipeline: embed a query, pass it through the Redis enrichment
# handler, and print each resulting element.

data = [{'text': 'What is Anarchy ?'}]

# Scratch directory handed to MLTransform as its artifact location
artifact_location = tempfile.mkdtemp()

# Use the configured model name rather than repeating the literal, so the
# query is embedded with the same model as the stored documents.
generate_embedding_fn = SentenceTransformerEmbeddings(
    model_name=EMBEDDING_MODEL,
    columns=['text'],
)

# Host and port come from the same settings as redis_client
redis_handler = RedisEnrichmentHandler(redis_host=REDIS_HOST, redis_port=REDIS_PORT)

with beam.Pipeline() as p:
    _ = (
        p
        | "Create" >> beam.Create(data)
        | "Generate Embedding" >> MLTransform(
            write_artifact_location=artifact_location
        ).with_transform(generate_embedding_fn)
        | "Enrich W/ Redis" >> Enrichment(redis_handler)
        | "Print" >> beam.Map(print)
    )

INFO:root:Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.
INFO:apache_beam.runners.worker.statecache:Creating state cache with size 104857600
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:root:BatchElements statistics: element_count=1 batch_count=1 next_batch_size=1 timings=[]


Row(text=[0.004666077438741922, 0.05869913473725319, -0.07399024069309235, 0.022393187507987022, 0.039686284959316254, -0.034507881850004196, 0.064857617020607, -0.047807302325963974, -0.03489216789603233, 0.06350446492433548, 0.0360037162899971, 0.03880435600876808, 0.0589592307806015, -0.0789710283279419, -0.032882459461688995, -0.045789338648319244, -0.021030493080615997, -0.05721370130777359, -0.01570642925798893, 0.06773950159549713, 0.0477975532412529, 0.02080758847296238, -0.07664106041193008, 0.04821384325623512, -0.052367933094501495, 0.07436149567365646, -0.024946363642811775, -0.03843500837683678, -0.05065334215760231, -0.008651865646243095, 0.016191929578781128, -0.05380123108625412, 0.04309113323688507, 0.0409851111471653, -0.01066699717193842, 0.021276379004120827, 0.06583339720964432, -0.05280669033527374, -0.01774919219315052, -0.061985645443201065, -0.02252737060189247, -0.012123598717153072, -0.011422254145145416, -0.029680127277970314, -0.05887051299214363, 0.0390109