# COCA Full Text -- handlign offline versus the web gui

- goals of this notebook include 
  - understanding how to use the COCA full text data
  - staging a dictionary of dictionaries of dictionaries {'genre': {'year': {'id': text}}}

In [1]:
import os
import re
for item in [item for item in os.listdir('../coca-text/') if not item.endswith('.zip')]:
    print(os.listdir(f'../coca-text/{item}'))

['text_mag_1993.txt', 'text_mag_1992.txt', 'text_mag_1990.txt', 'text_mag_1991.txt', 'text_mag_1995.txt', 'text_mag_1994.txt', 'text_mag_1996.txt', 'text_mag_1997.txt', 'text_mag_2008.txt', 'text_mag_2009.txt', 'text_mag_2019.txt', 'text_mag_2018.txt', 'text_mag_2002.txt', 'text_mag_2016.txt', 'text_mag_2017.txt', 'text_mag_2003.txt', 'text_mag_2015.txt', 'text_mag_2001.txt', 'text_mag_2000.txt', 'text_mag_2014.txt', 'text_mag_2010.txt', 'text_mag_2004.txt', 'text_mag_2005.txt', 'text_mag_2011.txt', 'text_mag_2007.txt', 'text_mag_2013.txt', 'text_mag_2012.txt', 'text_mag_2006.txt', 'text_mag_1999.txt', 'text_mag_1998.txt']
['text_web_13.txt', 'text_web_07.txt', 'text_web_06.txt', 'text_web_12.txt', 'text_web_04.txt', 'text_web_10.txt', 'text_web_11.txt', 'text_web_05.txt', 'text_web_29.txt', 'text_web_01.txt', 'text_web_15.txt', 'text_web_14.txt', 'text_web_28.txt', 'text_web_16.txt', 'text_web_02.txt', 'text_web_03.txt', 'text_web_17.txt', 'text_web_32.txt', 'text_web_26.txt', 'text_w

In [2]:
def build_coca_dict(coca_dir='../coca-text/'):
    coca_dict = {}
    # Loop through genre folders
    for genre_folder in [f for f in os.listdir(coca_dir) if f.startswith('text_')]:
        genre = genre_folder.split('_')[1] # Extract genre from folder name like 'text_acad_isi'
        print(f'Processing genre: {genre}')
        genre_path = os.path.join(coca_dir, genre_folder)
        # Loop through files in genre folder
        for filename in os.listdir(genre_path):
            if filename.startswith('text_') and filename.endswith('.txt'):
                #print(f'  Processing file: {filename}')
                year_match = re.search(r'_(\d{4})\.txt$', filename)
                
                with open(os.path.join(genre_path, filename), 'r', encoding='utf-8', errors='ignore') as f:
                    #print(f'    Reading file: {filename}')
                    for line in f:
                        line = line.strip()
                        if line.startswith('@@'):
                            parts = line.split(' ', 1)
                            if len(parts) == 2:
                                id_part = parts[0][3:]  # Remove '#@@'
                                text_part = parts[1]
                                
                                if year_match:
                                    year = int(year_match.group(1))
                                    #print(f'    Year: {year}')
                                    # Build nested dict: genre -> year -> id -> text
                                    coca_dict.setdefault(genre, {}).setdefault(year, {})[id_part] = text_part
                                elif genre in ['web', 'blog']:
                                    file_num_match = re.search(r'_(\d+)\.txt$', filename)
                                    if file_num_match:
                                        file_num = file_num_match.group(1)
                                        # Build nested dict: genre -> file_num -> id -> text
                                        coca_dict.setdefault(genre, {}).setdefault(file_num, {})[id_part] = text_part
    return coca_dict

In [3]:
#coca_dir='../coca-text/'
#coca_dict = build_coca_dict(coca_dir)

## Trying for got3 read of full corpora

In [4]:
import importlib
import getout_of_text_3 as got3
importlib.reload(got3)

<module 'getout_of_text_3' from '/Users/ejacquot/Documents/Github/getout_of_text_3/.venv_dev/lib/python3.11/site-packages/getout_of_text_3/__init__.py'>

In [5]:
got3.__version__

'0.2.29'

In [6]:
coca_corpus = got3.read_corpus('../coca-text/')

Genres:   0%|          | 0/8 [00:00<?, ?genre/s]

Processing genre: mag


Genres:  12%|█▎        | 1/8 [00:00<00:06,  1.09genre/s]

Finished genre: mag (total files: 30)
Processing genre: web


Genres:  25%|██▌       | 2/8 [00:01<00:05,  1.08genre/s]

Finished genre: web (total files: 34)
Processing genre: acad


Genres:  38%|███▊      | 3/8 [00:02<00:04,  1.08genre/s]

Finished genre: acad (total files: 30)
Processing genre: news


Genres:  50%|█████     | 4/8 [00:03<00:03,  1.11genre/s]

Finished genre: news (total files: 30)
Processing genre: spok


Genres:  62%|██████▎   | 5/8 [00:04<00:02,  1.11genre/s]

Finished genre: spok (total files: 30)
Processing genre: blog


Genres:  75%|███████▌  | 6/8 [00:05<00:01,  1.13genre/s]

Finished genre: blog (total files: 34)
Processing genre: fic


Genres:  88%|████████▊ | 7/8 [00:06<00:00,  1.18genre/s]

Finished genre: fic (total files: 30)
Processing genre: tvm


Genres: 100%|██████████| 8/8 [00:07<00:00,  1.14genre/s]

Finished genre: tvm (total files: 30)





In [7]:
coca_corpus.keys()

dict_keys(['mag', 'web', 'acad', 'news', 'spok', 'blog', 'fic', 'tvm'])

In [8]:
coca_corpus['mag'].keys()

dict_keys(['1993', '1992', '1990', '1991', '1995', '1994', '1996', '1997', '2008', '2009', '2019', '2018', '2002', '2016', '2017', '2003', '2015', '2001', '2000', '2014', '2010', '2004', '2005', '2011', '2007', '2013', '2012', '2006', '1999', '1998'])

In [9]:
coca_corpus['web']['13']

Unnamed: 0,text_id,text
0,5026636,<h> The Best ( and Worst ) Ways to Transition ...
1,5026637,<h> Payment Cards Center <p> The Payment Cards...
2,5026638,<h> Article Tools <h> Email <h> Share <p> You ...
3,5026736,<p> Your Javascript is Turned off . You need J...
4,5026737,<p> As October 31st draws nigh and we 're bomb...
...,...,...
2659,5259137,<h> What Shall We Do <p> Songfacts ? : You can...
2660,5259138,<p> This amped-up rocker is the first single f...
2661,5259236,"<p> Out on a dinner with some colleagues , the..."
2662,5259237,"<p> You are in charge of a massive , multinati..."


In [10]:
coca_corpus['fic']['2012']

Unnamed: 0,text_id,text
0,4120102,""" The only problem with leaving four car lengt..."
1,4120103,"Waiting for spring in Reno , Nevada , is like ..."
2,4120106,One hour down . Three hours to go . <p> The af...
3,4120107,"After dinner with the first couple , Ida , Mav..."
4,4120108,"The letter , contemplated and worried about fo..."
...,...,...
862,4162071,Day 1 Daria pays the taxi and walks into Berli...
863,4162072,A young lady who attains the grace of self-dis...
864,4162073,CHAPTER # 1 # Present Day # The sign on the of...
865,4162074,One # Regan Matthews was going to die . # She ...


In [11]:
got3.search_keyword_corpus(coca_corpus, 'freedom')

🔍 COCA Corpus Search: '{'mag': {'1993':       text_id                                               text
0     2000178  She kicked off her shoes to dance . Felicia He...
1     2000179  They did n't know him . That was the problem ....
2     2000181  Someday , someway , Joe Montana will be a San ...
3     2000182  Credit Rickey Henderson with a save . Henderso...
4     2000183  They do n't wear uniforms with their names on ...
...       ...                                                ...
1827  2108614  MORTY ROSENFELD WAS SO STONED on Euphoria , a ...
1828  2108615  ONE BIG OIL SPILL COULD BE A FLUKE . TWO SPILL...
1829  2108616  THE SPARK THIS TIME WAS A NEW BANK NOTE THAT t...
1830  2108618  There is a rising chorus for intervention in t...
1831  2108619  WHEN THE SONG COP KILLER , BY What do you watc...

[1832 rows x 2 columns], '1992':       text_id                                               text
0     2000164  The greatest conquest begins to wither the mom...
1     2000165  O

TypeError: decoding to str: need a bytes-like object, dict found

In [12]:
def flatten_corpus(nested_corpus):
    flat = {}
    for genre, years in nested_corpus.items():
        for year, df in years.items():
            flat_key = f"{genre}_{year}"
            flat[flat_key] = df
    return flat

flat_corpus = flatten_corpus(coca_corpus)

In [13]:
flat_corpus.keys()

dict_keys(['mag_1993', 'mag_1992', 'mag_1990', 'mag_1991', 'mag_1995', 'mag_1994', 'mag_1996', 'mag_1997', 'mag_2008', 'mag_2009', 'mag_2019', 'mag_2018', 'mag_2002', 'mag_2016', 'mag_2017', 'mag_2003', 'mag_2015', 'mag_2001', 'mag_2000', 'mag_2014', 'mag_2010', 'mag_2004', 'mag_2005', 'mag_2011', 'mag_2007', 'mag_2013', 'mag_2012', 'mag_2006', 'mag_1999', 'mag_1998', 'web_13', 'web_07', 'web_06', 'web_12', 'web_04', 'web_10', 'web_11', 'web_05', 'web_29', 'web_01', 'web_15', 'web_14', 'web_28', 'web_16', 'web_02', 'web_03', 'web_17', 'web_32', 'web_26', 'web_27', 'web_33', 'web_25', 'web_31', 'web_19', 'web_18', 'web_30', 'web_24', 'web_08', 'web_20', 'web_34', 'web_21', 'web_09', 'web_23', 'web_22', 'acad_2013', 'acad_2007', 'acad_2006', 'acad_2012', 'acad_2004', 'acad_2010', 'acad_2011', 'acad_2005', 'acad_2001', 'acad_2015', 'acad_2014', 'acad_2000', 'acad_2016', 'acad_2002', 'acad_2003', 'acad_2017', 'acad_1999', 'acad_1998', 'acad_1996', 'acad_1997', 'acad_1995', 'acad_1994', 'ac

In [14]:
got3.search_keyword_corpus(coca_corpus, 'hello', output='print')

🔍 COCA Corpus Search: '{'mag': {'1993':       text_id                                               text
0     2000178  She kicked off her shoes to dance . Felicia He...
1     2000179  They did n't know him . That was the problem ....
2     2000181  Someday , someway , Joe Montana will be a San ...
3     2000182  Credit Rickey Henderson with a save . Henderso...
4     2000183  They do n't wear uniforms with their names on ...
...       ...                                                ...
1827  2108614  MORTY ROSENFELD WAS SO STONED on Euphoria , a ...
1828  2108615  ONE BIG OIL SPILL COULD BE A FLUKE . TWO SPILL...
1829  2108616  THE SPARK THIS TIME WAS A NEW BANK NOTE THAT t...
1830  2108618  There is a rising chorus for intervention in t...
1831  2108619  WHEN THE SONG COP KILLER , BY What do you watc...

[1832 rows x 2 columns], '1992':       text_id                                               text
0     2000164  The greatest conquest begins to wither the mom...
1     2000165  O

TypeError: decoding to str: need a bytes-like object, dict found

In [15]:
import pandas as pd
for k, v in flat_corpus.items():
    if not isinstance(v, pd.DataFrame):
        print(f"Non-DataFrame found at {k}: {type(v)}")

In [16]:
def flatten_corpus(nested_corpus):
    import pandas as pd
    flat = {}
    for genre, years in nested_corpus.items():
        for year, df in years.items():
            if isinstance(df, pd.DataFrame):
                flat_key = f"{genre}_{year}"
                flat[flat_key] = df
            else:
                print(f"Skipping {genre}/{year}: not a DataFrame (type={type(df)})")
    return flat

flat_corpus = flatten_corpus(coca_corpus)

In [None]:
#legacy method
old_val = got3.read_corpora('../coca-samples-text/', corpora_name='test')

📚 Loading test corpus from ../coca-samples-text/
📂 Processing acad...
  ✅ text_acad.txt: (265, 1)
📂 Processing blog...
  ✅ text_blog.txt: (991, 1)
📂 Processing fic...
  ✅ text_fic.txt: (273, 1)
📂 Processing mag...
  ✅ text_mag.txt: (948, 1)
📂 Processing news...
  ✅ text_news.txt: (871, 1)
📂 Processing spok...
  ✅ text_spok.txt: (263, 1)
📂 Processing tvm...
  ✅ text_tvm.txt: (233, 1)
📂 Processing web...
  ✅ text_web.txt: (892, 1)

🎯 SUMMARY:
   - test: 8 genres loaded
   - Total corpora in collection: 1


In [59]:
# Test structure detection with a small subset
import importlib
import getout_of_text_3 as got3
importlib.reload(got3)

print(f"Version: {got3.__version__}")

# Create a test subset using all years for 'fic' and 'mag'
search_dict = {
    'fic': dict(coca_corpus['fic']),
    'mag': dict(coca_corpus['mag']),
    #'web': dict(coca_corpus['web']),
    #'blog': dict(coca_corpus['blog']),
    #'acad': dict(coca_corpus['acad']),
    #'news': dict(coca_corpus['news']),
    #'spok': dict(coca_corpus['spok']),
    #'tvm': dict(coca_corpus['tvm'])
}

print("Small nested test structure:")
for genre, years in search_dict.items():
    print(f"  {genre}: {list(years.keys())}")

results = got3.search_keyword_corpus('bovine', search_dict,  show_context=True,  output='print')

Version: 0.2.29
Small nested test structure:
  fic: ['1998', '1999', '2011', '2005', '2004', '2010', '2006', '2012', '2013', '2007', '2003', '2017', '2016', '2002', '2014', '2000', '2001', '2015', '2018', '2019', '2009', '2008', '1994', '1995', '1997', '1996', '1992', '1993', '1991', '1990']
  mag: ['1993', '1992', '1990', '1991', '1995', '1994', '1996', '1997', '2008', '2009', '2019', '2018', '2002', '2016', '2017', '2003', '2015', '2001', '2000', '2014', '2010', '2004', '2005', '2011', '2007', '2013', '2012', '2006', '1999', '1998']
🔍 COCA Corpus Search: 'bovine'

📚 FIC_1998 :
------------------------------
  📝 Text 729: dead . Valium or sheer **bovine** stupidity , thought Hamish .
  📝 Text 806: out sidways in that awkward **bovine** manner and kicking up clots
  ✅ Found 2 occurrence(s) in fic_1998

📚 FIC_1999 :
------------------------------
  📝 Text 8: of a cylindrical membrane containing **bovine** , porcine , and equine
  📝 Text 400: life outdoors . Her slightly **bovine** manne

In [60]:
results

{'fic_1998': [{'text_id': 729,
   'match': 'bovine',
   'context': 'dead . Valium or sheer **bovine** stupidity , thought Hamish .',
   'full_text': 'For there was never yet a philosopher , That could endure the toothache patiently . William Shakespe...'},
  {'text_id': 806,
   'match': 'bovine',
   'context': 'out sidways in that awkward **bovine** manner and kicking up clots',
   'full_text': 'p100 In the afternoon they sat on their bikes at the curb on Chicago Street directly across the way ...'}],
 'fic_1999': [{'text_id': 8,
   'match': 'bovine',
   'context': 'of a cylindrical membrane containing **bovine** , porcine , and equine',
   'full_text': 'Stuck in an airport ? Read this . It could be worse .... <p> " I \'m sorry , sir . Your flight \'s bee...'},
  {'text_id': 400,
   'match': 'bovine',
   'context': 'life outdoors . Her slightly **bovine** mannerisms identified her-the chewing and',
   'full_text': '( i ) <p> When she was young , mothers-or her mother , at least-would s