# General insights into the dataset

### Answers to the following questions:
- How many documents are in the dataset?
- How many pages are there in total?
- How many pages does a document have on average?
- How many tokens are there in total (counted with the tiktoken tokenizer, `cl100k_base` encoding)?
- How many tokens does a document have on average?
- How many text chunks are there ?

In [1]:
import os
# Directory holding the text files of the dataset.
txt_dir = 'data/raw/txt2'
# Directory holding the PDF files of the dataset.
pdf_dir = 'data/raw/pdfs2'
# CSV file holding the text chunks.
chunk_df_path = 'evaluation/data/chunks.csv'
# NOTE(review): not referenced by any cell visible in this notebook; presumably
# the chunk size used when the chunks were created - confirm unit and usage.
TEXT_EMBEDDING_CHUNK_SIZE = 300

In [16]:
# Count the PDFs in `pdf_dir` and the number of pages in each of them.
import PyPDF2

# Ignore any directory entry whose name contains 'DS_Store'
# (presumably macOS Finder metadata).
pdf_files = sorted([x for x in os.listdir(pdf_dir) if 'DS_Store' not in x])

pages_list = []
for file in pdf_files:
    # The context manager closes the handle once the page count has been read,
    # so file descriptors are not leaked across the loop.
    with open(os.path.join(pdf_dir, file), 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfReader(pdfFileObj)
        pages_list.append(len(pdfReader.pages))

pdf_count = len(pages_list)
# Guard the average so an empty directory does not raise ZeroDivisionError.
average_pages = sum(pages_list) / pdf_count if pdf_count else 0.0

print(f"Number of pdfs: {pdf_count}")
print(f"Number of pages: {sum(pages_list)}")
print(f"Average number of pages per pdf: {average_pages}")

Number of pdfs: 9
Number of pages: 304
Average number of pages per pdf: 33.77777777777778


In [15]:
# Sorted names of the entries in `txt_dir`, leaving out anything whose name
# contains 'DS_Store'.
txt_files = sorted(
    name for name in os.listdir(txt_dir) if 'DS_Store' not in name
)
print(f"Number of text files: {len(txt_files)}")

Number of text files: 9


In [8]:
import pandas as pd
import numpy as np

def str_to_array_converter(string):
    """
    Parse a bracketed, comma-separated list of numbers into a float32 array.

    Args:
        string: Text such as "[0.1, 0.2, 0.3]". The first and last characters
            are assumed to be the enclosing brackets and are discarded.

    Returns:
        A 1-D numpy array of dtype float32. An empty list ("[]") yields an
        empty array instead of raising.

    Raises:
        ValueError: If an element cannot be converted to a float.
    """
    # Drop the enclosing brackets; strip so "[ ]" is treated like "[]".
    inner = string[1:-1].strip()
    if not inner:
        # "".split(',') would give [''], which numpy cannot convert to float.
        return np.array([], dtype=np.float32)
    # numpy converts each string element to float32.
    return np.array(inner.split(','), dtype=np.float32)

# Load the chunk table from the path configured in `chunk_df_path` (same file
# as the previously duplicated literal) and parse each 'embedding' cell into
# a float32 array.
textchunks_df = pd.read_csv(chunk_df_path, converters={"embedding": str_to_array_converter})
# Keep only the rows that have a non-missing 'text' value.
textchunks_df = textchunks_df[textchunks_df["text"].notna()]

print(f"Number of text chunks: {len(textchunks_df)}")

Number of text chunks: 455


In [10]:
# Show the distinct values of the chunk table's 'filename' column.
print(textchunks_df['filename'].unique())

['2022-09-21_Immobilienmarkt_Deutschland_2022_2023_EN.pdf'
 '2022-12-12_ABG_Statements_Hoeller_2022-2023_en.pdf'
 '2023_Housing_market_outlook__Price_dip_and_interes.pdf'
 'Deloitte 2023 Commercial Real Estate Outlook.pdf'
 'Emerging-Trends_USCanada-2023.pdf'
 'bouwinvest_international-market-outlook_2023-2025-1.pdf'
 'global-real-estate-markets-2023.pdf' 'isa-outlook-2023.pdf'
 'outlook-real-estatet-market-germany-dec-2022.pdf']


In [5]:
import os
import pandas as pd
import re

def replace_semicolon(text, threshold=10):
    '''
    Get rid of semicolons.

    The text is split on ';' and every fragment is stripped of surrounding
    whitespace. A fragment with more than `threshold` whitespace-separated
    words is treated as a sentence of its own: its first character is
    upper-cased and it is attached to the preceding text with ". ".
    Any other fragment is attached with ", ".

    The first fragment is emitted without a leading separator, so text that
    contains no semicolon no longer comes back prefixed with ", " or ". ".
    The capitalisation rule still applies to the first fragment as well.

    Args:
        text: Input string.
        threshold: Word count above which a fragment becomes a new sentence.

    Returns new text
    '''
    pieces = []
    for index, fragment in enumerate(text.split(';')):
        fragment = fragment.strip()  # Clear off spaces
        if len(fragment.split()) > threshold:
            # Long fragment: start a new sentence. It cannot be empty here,
            # so indexing the first character is safe.
            separator = ". "
            fragment = fragment[0].upper() + fragment[1:]
        else:
            # Short fragment: treat the semicolon as a comma.
            separator = ", "
        # Only put a separator *between* fragments, never before the first.
        if index > 0:
            pieces.append(separator)
        pieces.append(fragment)

    return "".join(pieces)

# Pre-compiled patterns for text cleaning. They are written as raw strings so
# that regex escapes such as \. \( \s are not parsed as (invalid) string
# escapes, which recent Python versions warn about. The compiled patterns are
# the same as before.

# NOTE(review): the trailing `\.]+` matches a literal '.' followed by one or
# more ']' characters, which looks like a typo - confirm the intended pattern.
# Left unchanged because no visible cell uses it.
USC_re = re.compile(r'[Uu]\.*[Ss]\.*[Cc]\.]+')
# A parenthesised span that contains at least one space.
PAREN_re = re.compile(r'\([^(]+\ [^\(]+\)')
# Any single character from the listed punctuation set (includes backslash).
BAD_PUNCT_RE = re.compile(r'([%s])' % re.escape(r'"#%&\*\+/<=>@[\]^{|}~_'), re.UNICODE)
# A newline, optional spaces/tabs and backticks, then "(<alphanumerics>)".
BULLET_RE = re.compile(r'\n[\ \t]*`*\([a-zA-Z0-9]*\)')
# Two or more consecutive dashes.
DASH_RE = re.compile(r'--+')
# Any run of whitespace.
WHITESPACE_RE = re.compile(r'\s+')
# A comma/period, optional spaces, then another period/comma.
EMPTY_SENT_RE = re.compile(r'[,\.]\ *[\.,]')
# Leading run of non-letter characters at the start of the text.
FIX_START_RE = re.compile(r'^[^A-Za-z]*')
# A period immediately followed by a letter (the letter is captured).
FIX_PERIOD = re.compile(r'\.([A-Za-z])')


def clean_text(text):
    """
    Normalise raw document text before tokenisation.

    Borrowed from the FNDS text processing with additional logic added in.
    Note: we do not take care of token breaking - assume a tokenizer
    will handle this for us.

    The steps run in a fixed order: drop parentheticals and bullet markers,
    remove unwanted punctuation and dash runs, collapse whitespace, tidy up
    empty sentences, rewrite semicolons, repair the start of the text and
    period spacing, normalise quotes, and finally drop non-ASCII characters.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, ASCII-only string.
    """

    # Remove parentheticals
    text = PAREN_re.sub('', text)

    # Get rid of enums as bullets or ` as bullets
    text = BULLET_RE.sub(' ', text)

    # Clean html: drop the HTML-escaped "<all>" marker
    text = text.replace('&lt;all&gt;', '')

    # Remove annoying punctuation that's not relevant
    text = BAD_PUNCT_RE.sub('', text)

    # Get rid of long sequences of dashes - these are formatting
    text = DASH_RE.sub(' ', text)

    # Remove newlines, tabs, and extra spaces.
    text = WHITESPACE_RE.sub(' ', text)

    # If we ended up with "empty" sentences - get rid of them.
    text = EMPTY_SENT_RE.sub('.', text)

    # Attempt to create sentences from bullets
    text = replace_semicolon(text)

    # Get rid of anything that's not a letter at the start of the text
    text = FIX_START_RE.sub('', text)
    # Sometimes periods get formatted weirdly; make sure there is a space
    # between a period and the start of the next sentence. The replacement is
    # a raw string so that \g<1> is not parsed as an invalid string escape.
    text = FIX_PERIOD.sub(r". \g<1>", text)

    # Fix quotes
    text = text.replace('``', '"')
    text = text.replace('\'\'', '"')

    # Remove ambiguous unicode characters: anything outside ASCII is dropped
    text = text.encode('ascii', 'ignore').decode('ascii')

    return text

In [19]:
# Count the tokens of every cleaned text file with tiktoken's "cl100k_base"
# encoding, then report the total and the per-file average.
import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")

token_count_list = []
for file in txt_files:
    # NOTE(review): no explicit encoding is passed, so the platform default
    # is used - confirm the encoding of the text files.
    with open(os.path.join(txt_dir,file), 'r') as f:
        text = f.read()
    # The file is already closed here; cleaning and tokenising need only the string.
    text = clean_text(text)
    tokens = tokenizer.encode(text)
    token_count_list.append(len(tokens))

# Guard the average so an empty file list does not raise ZeroDivisionError.
average_tokens = sum(token_count_list) / len(txt_files) if txt_files else 0.0

print(f"Number of tokens: {sum(token_count_list)}")
print(f"Average tokens per file: {average_tokens}")


Number of tokens: 204023
Average tokens per file: 22669.222222222223
