In [58]:
import pandas as pd
import warnings

warnings.filterwarnings(action="ignore", message="unclosed", category=ImportWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning) 
pd.set_option('display.max_colwidth', 0)

In [59]:
import openai
from helpers import get_env

# Fetch the Azure OpenAI key and endpoint through the project helper
# (presumably read from environment configuration -- see helpers.get_env).
API_KEY, RESOURCE_ENDPOINT = get_env("azure-openai")

# Configure the openai module-level client settings for the Azure resource.
openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
# API version string is pinned here; update it deliberately, not implicitly.
openai.api_version = "2022-12-01"

In [60]:
import os
from config import TEXT_EMBEDDING_CHUNK_SIZE
data_dir = '../data/raw/real_estate_txts'
# List the corpus files in a stable order, skipping any DS_Store entries.
txt_files = sorted(name for name in os.listdir(data_dir) if 'DS_Store' not in name)
print(f"Number of files: {len(txt_files)}")

# Each file contributes (line count // TEXT_EMBEDDING_CHUNK_SIZE) docs;
# a trailing partial chunk is not counted.
docs_count = 0
for file in txt_files:
    path = os.path.join(data_dir, file)
    with open(path, 'r') as handle:
        line_total = sum(1 for _ in handle)
    docs_count += line_total // TEXT_EMBEDDING_CHUNK_SIZE

print(f"Number of docs: {docs_count}")

Number of files: 9
Number of docs: 698


### Connect to Redis

In [61]:
from database import get_redis_connection

# Open the Redis connection through the project helper.
redis_client = get_redis_connection()
# Last expression of the cell, so the ping result is shown as the cell output.
redis_client.ping()

True

### Check that the index exists and has the right number of documents

In [62]:
from config import INDEX_NAME
# Check whether the search index exists and report how many documents it holds.
try:
    # Fetch the index metadata once and reuse it; a second info() call would
    # cost another round trip and could disagree with the first.
    index_info = redis_client.ft(INDEX_NAME).info()
    print("Index already exists")
    print(f"Number of docs in index: {index_info['num_docs']}")
except Exception as e:
    # Best-effort check: show the error (e.g. an unknown index name) and let
    # the notebook continue.
    print(e)

Unknown Index name


# Ingestion

Ingestion is handled in the `embeddings.py` file, not in this notebook.

# Query

In [39]:
from database import get_redis_results

In [40]:
%%time

# Sample question passed to get_redis_results.
f1_query='how big is the due diligence team?'

# Query the index and preview the first two rows of the result.
# NOTE(review): the saved run of this cell ended in a ValueError (length
# mismatch: 0 elements vs 3 column names) -- presumably get_redis_results
# does not handle an empty result set; confirm in database.py.
result_df = get_redis_results(redis_client,f1_query,index_name=INDEX_NAME)
result_df.head(2)

ValueError: Length mismatch: Expected axis has 0 elements, new values have 3 elements

In [43]:
from database import query_redis
def get_redis_results2(redis_conn, query, index_name, verbose=False):
    """Query a Redis index and return the hits as a DataFrame.

    Args:
        redis_conn: Redis connection, passed through to ``query_redis``.
        query: Query text, passed through to ``query_redis``.
        index_name: Name of the index to search.
        verbose: When True, print each raw result document (debugging aid).

    Returns:
        pandas.DataFrame with columns ``id`` (position in the result list),
        ``result`` (the document's ``text_chunk``), ``certainty`` (its
        ``vector_score``) and ``filename`` (its ``id``). When nothing
        matched, the frame has these columns and zero rows.
    """
    columns = ['id', 'result', 'certainty', 'filename']

    # Get most relevant documents from Redis
    query_result = query_redis(redis_conn, query, index_name)
    # No hits: return an empty frame that still carries the expected columns,
    # so callers can select columns without special-casing.
    if query_result.total == 0:
        return pd.DataFrame(columns=columns)

    # Flatten each document into one (rank, text, score, id) row.
    rows = []
    for rank, doc in enumerate(query_result.docs):
        if verbose:
            print(doc)
        rows.append((rank, doc.text_chunk, doc.vector_score, doc.id))

    # Passing columns= to the constructor (instead of assigning .columns
    # afterwards) avoids a length-mismatch ValueError when docs is empty.
    return pd.DataFrame(rows, columns=columns)

# Try the helper on a sample question and preview the first two rows.
# NOTE(review): the query string carries literal single quotes inside the
# double quotes, so they are sent as part of the query -- confirm intended.
result = get_redis_results2(redis_client,"'what does tesla do?'",index_name=INDEX_NAME)
result.head(2)
