In [1]:
import glob
from tqdm import tqdm
raw_sources = glob.glob('../coca/text/text*.txt')

In [2]:
def preproc_general(fpath):

    with open(fpath, 'r', encoding='utf-8') as file:
        print('loading text from: ', fpath)
        text = file.read()

    texts = []
    year = fpath.split('/')[-1].split('_')[-1].split('.')[0]
    text = text.replace('<p>', ' ')
    text = text.replace('<h>', ' ')
    text = text.replace('@!', ' ')
    text = text.replace('@ @ @ @ @ @ @ @ @ @', '[MASK_NOLOSS]')
    text = text.split('@@')
    for e in text:
        if e.strip():
            # Find the first whitespace and split into 2 parts
            first_space_index = e.find(' ')
            if first_space_index != -1:
                docid = e[:first_space_index]
                content = '[YEAR:{year}] '.format(year=year) + e[first_space_index + 1:]
                texts.append(content)
    return texts

In [4]:
filenum_to_year = {}
with open('../coca/sources.txt', 'r', encoding='utf-8', errors='ignore') as file:
    for line in file:
        parts = line.split('\t')
        try:
            filenum_to_year[parts[0]] = int(parts[1])
        except Exception as e:
            print(parts)

In [13]:
from collections import Counter

year_counts = Counter(filenum_to_year.values())
for year, count in sorted(year_counts.items()):
    print(f"Year: {year}, Count: {count}")


Year: 1990, Count: 7109
Year: 1991, Count: 7307
Year: 1992, Count: 7696
Year: 1993, Count: 8023
Year: 1994, Count: 8251
Year: 1995, Count: 8412
Year: 1996, Count: 8111
Year: 1997, Count: 8769
Year: 1998, Count: 8841
Year: 1999, Count: 9009
Year: 2000, Count: 10142
Year: 2001, Count: 9395
Year: 2002, Count: 10055
Year: 2003, Count: 10595
Year: 2004, Count: 10069
Year: 2005, Count: 9762
Year: 2006, Count: 9937
Year: 2007, Count: 9796
Year: 2008, Count: 9361
Year: 2009, Count: 9587
Year: 2010, Count: 9814
Year: 2011, Count: 11052
Year: 2012, Count: 198053
Year: 2013, Count: 9177
Year: 2014, Count: 9490
Year: 2015, Count: 9497
Year: 2016, Count: 14800
Year: 2017, Count: 15033
Year: 2018, Count: 14032
Year: 2019, Count: 14004


In [15]:
filenum_to_year['5028898']

2012

In [66]:
def preproc_blog_web(fpath):

    with open(fpath, 'r', encoding='utf-8') as file:
        print('loading text from: ', fpath)
        text = file.read()

    texts = []
    text = text.replace('<p>', ' ')
    text = text.replace('<h>', ' ')
    text = text.replace('&', ' ')
    text = text.replace('@ @ @ @ @ @ @ @ @ @', '[MASK_NOLOSS]')
    text = text.split('@@')
    for e in text:
        if e.strip():
            # Find the first whitespace and split into 2 parts
            first_space_index = e.find(' ')
            if first_space_index != -1:
                docid = e[:first_space_index]
                text = e[first_space_index + 1:]
                if docid in filenum_to_year:
                    year = filenum_to_year[docid]
                    content = '[YEAR:{year}] '.format(year=year) + e[first_space_index + 1:]
                    texts.append(content)
                else:
                    print(f"docid {docid} not found in filenum_to_year")
    return texts

In [67]:
def preproc_file(fpath):


    if 'acad' in fpath:
        texts = preproc_general(fpath)
    elif 'blog' in fpath:
        texts = preproc_blog_web(fpath)
    elif 'fic' in fpath:
        texts = preproc_general(fpath)
    elif 'mag' in fpath:
        texts = preproc_general(fpath)
    elif 'news' in fpath:
        texts = preproc_general(fpath)
    elif 'spok' in fpath:
        texts = preproc_general(fpath)
    elif 'tvm' in fpath:
        texts = preproc_general(fpath)
    elif 'web' in fpath:
        texts = preproc_blog_web(fpath)


    else:                
        raise NotImplementedError(f"Genre type not supported: {fpath}")

    return texts


In [76]:
from transformers import BertTokenizerFast

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')



In [77]:
# Add new special tokens
additional_special_tokens = ['[MASK_NOLOSS]'] + ['[YEAR:{i}]'.format(i=i) for i in range(1900, 2025)]
special_tokens_dict = {'additional_special_tokens': additional_special_tokens}
tokenizer.add_special_tokens(special_tokens_dict)

126

In [79]:
def tokenize_source(source):
    sequence_length = 512
    overlap = 128
    step_size = sequence_length - overlap
    for text in preproc_file(source):
        tokens = tokenizer.encode(text, add_special_tokens=False)
        sequences = [tokens[i:i + sequence_length] for i in range(0, len(tokens), step_size)]
    return sequences

In [86]:
test_sequences = tokenize_source(raw_sources[0])

loading text from:  ./helivan-project-generation/bstadt/tlm/coca/text/text_acad_1990.txt


In [122]:
def get_file_idx(cumulative_lengths, idx):
    return int(np.searchsorted(cumulative_lengths, idx))

import numpy as np
file_lenghts = [100, 110, 220, 330]
cumulative_lengths = np.cumsum(file_lenghts)
print(cumulative_lengths)


[100 210 430 760]


In [123]:
get_file_idx(cumulative_lengths, 100) 

0