# General insights into the dataset

### Answers to the following questions:
- How many documents are in the dataset?
- How many pages are there in total?
- How many pages are there per document on average?
- How many tokens are there (using the GPT `tiktoken` tokenizer)?
- How many tokens are there per file on average?
- How many text chunks are there ?

In [None]:
import os
# Directory whose files are read as text for the chunk and token counts.
txt_dir = 'data/raw/real_estate_txts'
# Directory whose files are opened with PyPDF2 for the page counts.
pdf_dir = 'data/raw/real_estate_pdfs'
# Divisor applied to each text file's line count to estimate its chunk count.
# NOTE(review): the name suggests an embedding chunk size; confirm whether the
# intended unit is lines or tokens.
TEXT_EMBEDDING_CHUNK_SIZE = 300

In [7]:
# Count the PDFs in pdf_dir and the pages in each one, then report the
# totals and the average number of pages per PDF.
import PyPDF2

# Skip entries whose name contains 'DS_Store' (macOS folder metadata) and
# sort so the files are processed in a deterministic order.
pdf_files = sorted(name for name in os.listdir(pdf_dir) if 'DS_Store' not in name)

pages_list = []
for file in pdf_files:
    # The context manager closes each handle as soon as its page count has
    # been read, instead of leaving every PDF open for the whole session.
    with open(os.path.join(pdf_dir, file), 'rb') as pdf_handle:
        pages_list.append(len(PyPDF2.PdfReader(pdf_handle).pages))

pdf_count = len(pages_list)
total_pages = sum(pages_list)
# Guard the average so an empty directory reports 0.0 rather than raising
# ZeroDivisionError.
average_pages = total_pages / pdf_count if pdf_count else 0.0

print(f"Number of pdfs: {pdf_count}")
print(f"Number of pages: {total_pages}")
print(f"Average number of pages per pdf: {average_pages}")

Number of pages: 1414
Number of pdfs: 10
Average number of pages per pdf: 141.4


In [4]:
# List the text files in txt_dir (ignoring names containing 'DS_Store') and
# estimate how many chunks they yield in total.
txt_files = sorted(name for name in os.listdir(txt_dir) if 'DS_Store' not in name)
print(f"Number of text files: {len(txt_files)}")

chunks_count = 0
for txt_name in txt_files:
    with open(os.path.join(txt_dir, txt_name), 'r') as handle:
        # Iterating the handle counts lines without building the whole list.
        line_total = sum(1 for _ in handle)
    # NOTE(review): floor division drops any trailing partial chunk, so a
    # file with fewer lines than the chunk size contributes 0 — confirm this
    # matches how chunks are actually produced.
    chunks_count += line_total // TEXT_EMBEDDING_CHUNK_SIZE

print(f"Number of chunks: {chunks_count}")

Number of files: 9
Number of chunks: 698


In [12]:
# Tokenize every text file with the cl100k_base encoding and report the
# total token count and the average per file.
import tiktoken
from prep_data.clean_text import clean_text

tokenizer = tiktoken.get_encoding("cl100k_base")

token_count_list = []
for txt_name in txt_files:
    txt_path = os.path.join(txt_dir, txt_name)
    with open(txt_path, 'r') as handle:
        raw_text = handle.read()
    # Each file is passed through clean_text before it is encoded.
    encoded = tokenizer.encode(clean_text(raw_text))
    token_count_list.append(len(encoded))

total_tokens = sum(token_count_list)
print(f"Number of tokens: {total_tokens}")
print(f"Average tokens per file: {total_tokens / len(txt_files)}")


Number of tokens: 473002
Average tokens per file: 52555.77777777778
