In [32]:
import gzip
import json
import multiprocessing as mp
import os
import re
import unicodedata

from tqdm import tqdm

# Download BeanCounter

The analysis in the paper is based on the train + validation splits. \
Replace `<YOUR_HUGGINGFACE_USER>` with your own Hugging Face username; an account can be created at https://huggingface.co/join \
Only run the following commands if you have sufficient storage: \
-- train split: 142 GB \
-- val split: 25 MB \
-- sample split: 1.5 GB \
-- deduped split: 60 GB \
-- fraud split: 411 MB \
You can download a specific split by passing `--include "split_name/*"` to `git lfs pull`.

In [None]:
!GIT_LFS_SKIP_SMUDGE=1 git clone https://<YOUR_HUGGINGFACE_USER>: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@huggingface.co/datasets/blevy41/BeanCounter
!cd BeanCounter
!git lfs pull --include "train/*"

# Set up directories & pronoun dictionary

In [39]:
# Filesystem layout used by the rest of the notebook.
# ROOT_DIR is the prefix that the dataset path and the results path are joined onto.
ROOT_DIR = ""
# NOTE(review): the clone command earlier in this notebook creates a directory named
# `BeanCounter`; on a case-sensitive filesystem this lowercase name will not match it
# unless the directory was renamed -- confirm.
BEANCOUNTER_DATASET_PATH = "beancounter"
# Names of the split sub-directories inside the dataset directory.
TRAIN_SPLIT = "train"
VAL_SPLIT = "validation"
SAMPLE_PATH = "sample"
RESULTS_DIR = ""  # specify dir for saving results

In [27]:
# Surface forms for each pronoun group. The gendered / unknown groups are spelled out
# once and reused to assemble the third-person list, so the two cannot drift apart.
# Entries are kept in their written form ('I' is capitalised); any consumer that
# compares them against lowercased text has to lowercase them first.
_she_forms = ["she", "her", "hers", "herself"]
_he_forms = ["he", "him", "his", "himself"]
_they_forms = ["they", "them", "their", "theirs", "theirself", "themself", "themselves"]

broadpronoun2variation = {
    "She": list(_she_forms),
    "He": list(_he_forms),
    "Unknown": list(_they_forms),
    "1st_person": ["I", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves"],
    "2nd_person": ["you", "your", "yours", "yourself", "yourselves"],
    "3rd_person": ["it", "its", "itself"] + _she_forms + _he_forms + _they_forms,
}

# Define text normalization and per-instance pronoun counting

In [28]:
# Any run of whitespace characters.
ONE_OR_MORE_SPACE = re.compile(r"\s+")
# First alternative (captured): a word, optionally continued by '.', ',' or '-' joined
# word parts (e.g. "well-known", "3.5"). Second alternative: a run of ASCII punctuation.
ALL_PUNCTUATION = re.compile(r'''(\w+(?:[.,-]\w+)*)|[!\"#\$%&\'\(\)\*\+,-\.\/:;<=>\?@\[\\\]\^_`{\|}~]+''')
# A run of ASCII punctuation that has no non-space character directly on either side.
PUNC_SURROUNDED_BY_SPACE = re.compile(r'''(?<!\S)[!\"#\$%&\'\(\)\*\+,-\.\/:;<=>\?@\[\\\]\^_`{\|}~]+(?!\S)''')

def text_normalization(text):
    '''
    Normalize raw text before tokenizing it on whitespace.

    parameters: text - str
    return: lowercased str in which compatibility characters (e.g. non-breaking spaces)
            have been decomposed, whitespace runs collapsed to one space, and
            punctuation outside of words removed
    '''
    # NFKD maps compatibility characters such as the non-breaking space to plain ones.
    collapsed = ONE_OR_MORE_SPACE.sub(' ', unicodedata.normalize('NFKD', text))
    # Word matches are re-inserted through group 1; punctuation matches leave group 1
    # unset and are therefore replaced by the empty string.
    without_punctuation = ALL_PUNCTUATION.sub(r'\1', collapsed)
    return PUNC_SURROUNDED_BY_SPACE.sub('', without_punctuation).lower()

In [29]:
# Flatten the per-category variation lists into one set of distinct pronoun strings.
all_pronouns = {
    pronoun
    for variations in broadpronoun2variation.values()
    for pronoun in variations
}

In [30]:
def get_pronouns_count(cleaned_text, all_pronouns = all_pronouns):
    """
    check what kind of pronouns a single filing has
    parameters: cleaned_text - str, all_pronouns - iterable of strs (pronouns to look for)
    return: tuple (type2count, broad_pronoun2flag)
            type2count - dict with keys 'grammatical' and 'gender'
            broad_pronoun2flag - dict with one key per category of broadpronoun2variation
            every value is a flag (0 or 1, 0 means no pronouns of this category/type in the
            attachment and 1 means at least 1 instance)
    """
    # text_normalization() lowercases the text, so the vocabulary has to be lowercased
    # as well; otherwise capitalised entries such as 'I' can never match a token.
    known_pronouns = {pronoun.lower() for pronoun in all_pronouns}
    pronouns_found = {
        word for word in text_normalization(cleaned_text).split() if word in known_pronouns
    }

    broad_pronoun2flag = {'She': 0, 'He': 0, 'Unknown': 0,'1st_person': 0, '2nd_person': 0, '3rd_person': 0}
    for category in broad_pronoun2flag:
        variations = {variation.lower() for variation in broadpronoun2variation[category]}
        if pronouns_found & variations:
            broad_pronoun2flag[category] = 1

    # 'gender' is set by the She / He / Unknown groups, 'grammatical' by the person groups.
    type2count = {'grammatical': 0, 'gender': 0}
    if any(broad_pronoun2flag[category] for category in ('She', 'He', 'Unknown')):
        type2count['gender'] = 1
    if any(broad_pronoun2flag[category] for category in ('1st_person', '2nd_person', '3rd_person')):
        type2count['grammatical'] = 1

    return type2count, broad_pronoun2flag

# Get pronouns count

In [40]:
# Partition of the dataset on which to perform the pronoun analysis.
# To analyze the whole dataset, include both TRAIN_SPLIT and VAL_SPLIT; to analyze only the sample, use SAMPLE_PATH.

def get_files_to_process(train_path, validation_path = None):
    """
    collect every file ending in '.gz' below the given split directories
    parameters: train_path - str, directory that is walked recursively first
                validation_path - str or None, optional second directory; leave it out
                to process a single partition only (e.g. just the sample split)
    return: list of file paths (str); files from train_path come before files from
            validation_path
    """
    files_to_process = []
    for split_path in (train_path, validation_path):
        if split_path is None:
            continue
        # os.walk ignores directories it cannot list, so a wrong path contributes
        # nothing here instead of raising.
        for path, _dirs, fns in os.walk(split_path):
            files_to_process.extend(
                os.path.join(path, fn) for fn in fns if os.path.splitext(fn)[-1] == '.gz'
            )
    return files_to_process

In [None]:
def work(path, out_dir = os.path.join(ROOT_DIR, RESULTS_DIR)):
    """
    compute pronoun flags for every filing in one gzipped JSON-lines file and write them
    to a gzipped JSON-lines file with the same file name below out_dir
    parameters: path - str, '/'-separated path of the input .gz file
                out_dir - str, directory that receives the results
    return: False if the output file already exists (nothing is done), True otherwise
    """
    force = False  # set to True to recompute outputs that already exist

    # Outputs are grouped by the 5th '/'-separated component of the input path when the
    # path is deep enough; for shallower paths fall back to the name of the file's parent
    # directory instead of raising IndexError.
    parts = path.split('/')
    group = parts[4] if len(parts) > 4 else os.path.basename(os.path.dirname(path))
    path_out = os.path.join(out_dir, group)
    fn = os.path.basename(path)
    out_path = os.path.join(path_out, fn)

    if os.path.exists(out_path) and not force:
        return False

    os.makedirs(path_out, exist_ok = True)

    # write .lock file
    lock_file = os.path.join(path_out, fn.rsplit('.', 1)[0])+'.lock'
    open(lock_file, 'w').close()

    # Results go to a temporary file that is moved into place only once it is complete,
    # so an interrupted run never leaves a truncated output that the existence check
    # above would treat as finished.
    tmp_path = out_path + '.tmp'
    try:
        with gzip.open(path, 'r') as filings, gzip.open(tmp_path, 'w') as out_file:
            # Stream line by line instead of loading the whole shard into memory.
            for line in filings:
                if not line.strip():
                    continue  # skip blank lines rather than failing in json.loads
                info = json.loads(line)
                desc_dict = {
                    key: info[key]
                    for key in ('accession', 'filename', 'type_attachment',
                                'ts_accept', 'form_type', 'type_filing')
                }
                # keep only the part of the timestamp before the 'T' separator
                desc_dict['date'] = info['ts_accept'].split('T')[0]
                type_flags, category_flags = get_pronouns_count(info['cleaned_text'])
                desc_dict['grammatical'] = type_flags['grammatical']
                desc_dict['gender'] = type_flags['gender']
                for category in ('She', 'He', 'Unknown', '1st_person', '2nd_person', '3rd_person'):
                    desc_dict[category] = category_flags[category]
                out_file.write((json.dumps(desc_dict) + '\n').encode())
        os.replace(tmp_path, out_path)
    finally:
        # Clean up on success and on failure alike.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        os.remove(lock_file)

    return True

In [None]:
# Collect the input shards of the train and validation splits.
train_path = os.path.join(ROOT_DIR, BEANCOUNTER_DATASET_PATH, TRAIN_SPLIT)
val_path = os.path.join(ROOT_DIR, BEANCOUNTER_DATASET_PATH, VAL_SPLIT)
files_to_process = get_files_to_process(train_path, val_path)

# os.makedirs('') raises, so only create the results directory when one is configured.
results_root = os.path.join(ROOT_DIR, RESULTS_DIR)
if results_root:
    os.makedirs(results_root, exist_ok = True)

n_workers = 16 # change to desired number of workers
# Each worker handles one shard; `results` holds work()'s True/False return per shard,
# in completion order.
with mp.Pool(n_workers) as p:
    results = list(tqdm(p.imap_unordered(work, files_to_process), total=len(files_to_process), miniters=1))