In [58]:
import pandas as pd
import warnings

warnings.filterwarnings(action="ignore", message="unclosed", category=ImportWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning) 
pd.set_option('display.max_colwidth', 0)

In [1]:
import openai
from helpers import get_env

API_KEY, RESOURCE_ENDPOINT = get_env("azure-openai")

openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = "2022-12-01"

In [60]:
import os
from config import TEXT_EMBEDDING_CHUNK_SIZE
data_dir = '../data/raw/real_estate_txts'
txt_files = sorted([x for x in os.listdir(data_dir) if 'DS_Store' not in x])
print(f"Number of files: {len(txt_files)}")
docs_count = 0
for file in txt_files:
    with open(os.path.join(data_dir,file), 'r') as f:
        docs_count += len(f.readlines()) // TEXT_EMBEDDING_CHUNK_SIZE

print(f"Number of docs: {docs_count}")

Number of files: 9
Number of docs: 698


### Get redis

In [2]:
from database import get_redis_connection

redis_client = get_redis_connection(password="weak")
redis_client.ping()

True

In [5]:
import pandas as pd
keys = redis_client.keys(f"real_estate_index:*")
keys = [key.decode('utf-8') for key in keys]
docs = []
for key in keys:
    element_data = {'id': key}
    element_metadata = redis_client.hgetall(key)
    for key, value in element_metadata.items():
        try:
            element_data[key.decode('utf-8')] = value.decode('utf-8')
        except UnicodeDecodeError:
            element_data[key.decode('utf-8')] = value.decode('ISO-8859-1')
    docs.append(element_data)
print(f"Number of docs: {len(docs)}")
df = pd.DataFrame(docs)


Number of docs: 502


In [7]:
print(df.head())

                                                  id  \
0  real_estate_index:Emerging-Trends_USCanada-202...   
1  real_estate_index:Emerging-Trends_USCanada-202...   
2  real_estate_index:Emerging-Trends_USCanada-202...   
3  real_estate_index:Emerging-Trends_USCanada-202...   
4  real_estate_index:2022-09-21_Immobilienmarkt_D...   

                                          text_chunk page  \
0   Nelson Authors, Chapter 2 Garrick Brown, Reta...    3   
1   Note: Based on U. S. respondents only. Source...   15   
2   That search will favor some of these smaller ...   20   
3   In many communities, fees, charges, and taxes...  100   
4   However, the impact on housing is difficult t...   44   

                                      content_vector  \
0  "Ñ<A\nÂ¼^\t¼Mö_¼/Ê\r;õ¸=ø®ó»¿?c¼T ¼0l½ø»Î...   
1  @¼Ã?½àDb;hz'¼è@<½.8<|½T»;Hhó¼ ø¼ªÜé<;X...   
2  ]d­ºÀ¼R];¦¼|»ft\t<TTQ»Ë^;Ú¼¤¹½îÒ<?...   
3  å<4=¼ *C¼c´¼b;9¢ I»ðj¼ø_»½Záî¼|Å<î...   
4  qü;¼ñÀ¼@

In [None]:
# save the df to a csv file
df.to_csv('real_estate_index.csv', index=False)

In [4]:
indices = redis_client.execute_command("FT._LIST")
print(indices)

[b'real_estate_index', b'real_estate2_index']


### Check that the index exists and has the right number of documents

In [62]:
from config import INDEX_NAME
# Check if index exists
try:
    redis_client.ft(INDEX_NAME).info()
    print("Index already exists")
    print(f"Number of docs in index: {redis_client.ft(INDEX_NAME).info()['num_docs']}")
except Exception as e:
    print(e)

Unknown Index name


# Ingestion

Handled in the embeddings.py file

# Query

In [39]:
from database import get_redis_results

In [40]:
%%time

f1_query='how big is the due diligence team?'

result_df = get_redis_results(redis_client,f1_query,index_name=INDEX_NAME)
result_df.head(2)

ValueError: Length mismatch: Expected axis has 0 elements, new values have 3 elements

In [43]:
from database import query_redis
def get_redis_results2(redis_conn,query,index_name):
    
    # Get most relevant documents from Redis
    query_result = query_redis(redis_conn,query,index_name)
    # if the result is empty, return an empty dataframe
    if query_result.total == 0:
        return pd.DataFrame()
    
    # Extract info into a list
    query_result_list = []
    for i, result in enumerate(query_result.docs):
        print(result)
        result_order = i
        text = result.text_chunk
        score = result.vector_score
        filename = result.id
        query_result_list.append((result_order,text,score, filename))
        
    # Display result as a DataFrame for ease of us
    result_df = pd.DataFrame(query_result_list)
    result_df.columns = ['id','result','certainty', 'filename']
    return result_df

result = get_redis_results2(redis_client,"'what does tesla do?'",index_name=INDEX_NAME)
result.head(2)


In [2]:
from openai.embeddings_utils import get_embedding
import numpy as np
embedded_query = np.array(get_embedding("What does tesla do?", engine = 'text-embedding-ada-002'), dtype=np.float32).tobytes()

In [3]:
embedded_query

b'q\nJ\xbb\xc4\xda\x94\xbc\x8cG\x8f;\xd7V \xbc\xa7\x1e!\xbc\x99\xa9|\xba\xaa\x03\x03\xbd#\xf7s9.7\xdc\xbbD\xdf\x99\xbcj\x87V;^\xb6\xab<\x062\xf6\xbb\x80/\x94\xbb#|\x8a<\xd5@\xc7\xbcC\xe8\xa3<\x7f8\x1e\xbb?+\xaf<\x1d\xd1)\xbc\x91\xdc\x96\xbc\x1c\xda\xb3\xbb4\xe2<\xbc|r\x1f;\x0c\xddV\xbcL\x12\xb3;\xa6\xb8\xed<\xcd\xc6\xdd\xbb\xcd\r.:i\xf6\x13<\xc5\xb2\'=\x19\x145\xbcY!\xa4\xbc\x1c\xda\xb3\xbb\xdd\x01\x01\xbd\x83\xae\xc2\xba2Z\x84<\x8f\xee*<]xe<c\x04\xe3\xbb\xcc\xeeJ<\xcc\xee\xca<s\xd9R\xbc\x93\x1d\xff\xba\xa4\x11R\xbc\xfbW\xc1;\xb6\x8d\xdd\xba\xc2\x86u\xbc\xcd\r.<*z\xe7\xbb\xe9\xfa\x98<0M5;M\xeaE\xbc\x9c\x97\xe8\xbaTN\xd6\xbc\xfa\x19{\xbci\xb8\xcd\xbc\xe0\xa8\x1c<\xd1\xca\xa2<"f\xb1\xbb\x17\xdfx<,\xce\x86\xbb\xc4\xda\x14\xbdr\xe2\xdc;%s\x80;`\x854\xbc\x1a*\x8e<M\xea\xc5<\xe7\xc5\\;\xd4\x90\xa1\xb9\xbc\x19\xdb\xba!o;<Q`j\xbb=4\xb9\xbb;\xac\x80<\xe1\x80\xaf\xbc\xf5\xac\xe0\xbcj\x87V\xbc\xde\xd9\x93;\x1a*\x8e;\x84\xcd%\xbbQ\xa7\xba\xbc\xe8\x03\xa3\xbc\x84gr\xbb\x9b%\x89\xbc\xc7\x81\xb0\xbb\