In [58]:
import pandas as pd
import warnings

warnings.filterwarnings(action="ignore", message="unclosed", category=ImportWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning) 
pd.set_option('display.max_colwidth', 0)

In [1]:
import openai
from helpers import get_env

# Load the credentials for the "azure-openai" entry through the project helper;
# it is unpacked as a (key, endpoint) pair.
API_KEY, RESOURCE_ENDPOINT = get_env("azure-openai")

# Configure the module-level openai settings for the Azure backend.
openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
# API version is pinned here; update it deliberately when the service changes.
openai.api_version = "2022-12-01"

In [60]:
import os
from config import TEXT_EMBEDDING_CHUNK_SIZE
# Directory scanned for the input text files.
data_dir = '../data/raw/real_estate_txts'

# Every directory entry whose name does not contain 'DS_Store', sorted by name.
txt_files = sorted(name for name in os.listdir(data_dir) if 'DS_Store' not in name)
print(f"Number of files: {len(txt_files)}")

# Each file contributes one doc per full TEXT_EMBEDDING_CHUNK_SIZE lines;
# floor division drops any trailing partial chunk.
docs_count = 0
for file in txt_files:
    path = os.path.join(data_dir, file)
    with open(path, 'r') as f:
        line_total = sum(1 for _ in f)
    docs_count += line_total // TEXT_EMBEDDING_CHUNK_SIZE

print(f"Number of docs: {docs_count}")

Number of files: 9
Number of docs: 698


### Connect to Redis

In [2]:
from database import get_redis_connection

# Open a client through the project's connection helper and ping the server
# to confirm it is reachable before running the cells below.
redis_client = get_redis_connection()
redis_client.ping()

True

In [3]:
import pandas as pd
# Fetch every key under the `index_300t_chunks:` prefix and decode the names
# (the client returns bytes).
keys = [raw_key.decode('utf-8') for raw_key in redis_client.keys("index_300t_chunks:*")]

# Build one plain-dict record per Redis hash, keeping the key as its id.
docs = []
for key in keys:
    element_data = {'id': key}
    element_metadata = redis_client.hgetall(key)
    # Distinct names for the hash fields so the outer loop variable `key`
    # is not shadowed by the inner loop.
    for field, raw_value in element_metadata.items():
        try:
            decoded_value = raw_value.decode('utf-8')
        except UnicodeDecodeError:
            # Values that are not valid UTF-8 fall back to ISO-8859-1,
            # which maps every byte and therefore cannot fail.
            decoded_value = raw_value.decode('ISO-8859-1')
        element_data[field.decode('utf-8')] = decoded_value
    docs.append(element_data)
print(f"Number of docs: {len(docs)}")
# Build the frame once from the list of records.
df = pd.DataFrame(docs)


Number of docs: 764


In [4]:
# Preview the first rows; a bare last expression uses the notebook's rich
# DataFrame display instead of the plain-text repr produced by print().
df.head()

                                                  id page  \
0  index_300t_chunks:Emerging-Trends_USCanada-202...    3   
1          index_300t_chunks:isa-outlook-2023.pdf-!6    2   
2         index_300t_chunks:isa-outlook-2023.pdf-!63   17   
3  index_300t_chunks:2022-09-21_Immobilienmarkt_D...   48   
4  index_300t_chunks:outlook-real-estatet-market-...    3   

                                            filename  \
0                  Emerging-Trends_USCanada-2023.pdf   
1                               isa-outlook-2023.pdf   
2                               isa-outlook-2023.pdf   
3  2022-09-21_Immobilienmarkt_Deutschland_2022_20...   
4   outlook-real-estatet-market-germany-dec-2022.pdf   

                                          text_chunk  \
0   No part of this publication may be reproduced...   
1   The eerily weird Lang-2525 juxtaposition impl...   
2   Supply is projected to increase in major logi...   
3   No sharp rent rises likely in spite of tight ...   
4   Such an effe

In [5]:
# Drop the content_vector column before export. Reassigning (rather than
# inplace=True) together with errors='ignore' keeps this cell idempotent:
# re-running it no longer raises KeyError once the column is already gone.
df = df.drop(columns=['content_vector'], errors='ignore')
# Save the remaining columns to a CSV file, without the index column.
df.to_csv('index_300t_chunks.csv', index=False)

In [6]:
# Send the raw FT._LIST command to get the search index names known to the
# server; the names come back as bytes.
indices = redis_client.execute_command("FT._LIST")
print(indices)

[b'index_400t_chunks', b'index_e5_embeddings', b'index_150t_chunks', b'index_300t_chunks']


### Check that the index exists and has the right number of documents

In [8]:
from config import INDEX_NAME
# Check whether the configured index exists and, if so, report its size.
try:
    # Fetch the index info once and reuse it, instead of issuing a second
    # info() call just to read num_docs.
    index_info = redis_client.ft(INDEX_NAME).info()
    print("Index already exists")
    print(f"Number of docs in index: {index_info['num_docs']}")
except Exception as e:
    # Best-effort check: show the reason (e.g. "Unknown Index name") and let
    # the notebook continue rather than stopping on a missing index.
    print(e)

Unknown Index name


# COMPLETELY FLUSH THE DATABASE

In [12]:
from database import get_redis_connection

# WARNING: destructive. flushdb() empties the database this connection points
# at, so run this cell only when a full reset is intended (it will also fire
# on "Run All").
redis_client = get_redis_connection()
redis_client.flushdb()

True

# Ingestion

Handled in the main.py file