# COCA Full Text -- handling offline versus the web GUI

- goals of this notebook include 
  - understanding how to use the COCA full text data
  - staging a dictionary of dictionaries of dictionaries {'genre': {'year': {'id': text}}}

In [1]:
import os
import re
# Show the contents of every entry under ../coca-text/ that is not a .zip archive.
for item in os.listdir('../coca-text/'):
    if item.endswith('.zip'):
        continue
    print(os.listdir(f'../coca-text/{item}'))

['text_mag_1993.txt', 'text_mag_1992.txt', 'text_mag_1990.txt', 'text_mag_1991.txt', 'text_mag_1995.txt', 'text_mag_1994.txt', 'text_mag_1996.txt', 'text_mag_1997.txt', 'text_mag_2008.txt', 'text_mag_2009.txt', 'text_mag_2019.txt', 'text_mag_2018.txt', 'text_mag_2002.txt', 'text_mag_2016.txt', 'text_mag_2017.txt', 'text_mag_2003.txt', 'text_mag_2015.txt', 'text_mag_2001.txt', 'text_mag_2000.txt', 'text_mag_2014.txt', 'text_mag_2010.txt', 'text_mag_2004.txt', 'text_mag_2005.txt', 'text_mag_2011.txt', 'text_mag_2007.txt', 'text_mag_2013.txt', 'text_mag_2012.txt', 'text_mag_2006.txt', 'text_mag_1999.txt', 'text_mag_1998.txt']
['text_web_13.txt', 'text_web_07.txt', 'text_web_06.txt', 'text_web_12.txt', 'text_web_04.txt', 'text_web_10.txt', 'text_web_11.txt', 'text_web_05.txt', 'text_web_29.txt', 'text_web_01.txt', 'text_web_15.txt', 'text_web_14.txt', 'text_web_28.txt', 'text_web_16.txt', 'text_web_02.txt', 'text_web_03.txt', 'text_web_17.txt', 'text_web_32.txt', 'text_web_26.txt', 'text_w

In [2]:
def build_coca_dict(coca_dir='../coca-text/'):
    """Build a nested dict of COCA texts: genre -> year (or file number) -> id -> text.

    Every directory under ``coca_dir`` whose name starts with ``text_`` is
    treated as a genre folder; the genre is the second ``_``-separated token of
    the folder name (e.g. ``'text_acad_isi'`` -> ``'acad'``). Inside each genre
    folder, every ``text_*.txt`` file is read line by line. Lines that start
    with ``@@`` are split at the first space into ``@@<id>`` and the text;
    all other lines are ignored.

    Keying of the second level:
      * files named ``..._<4 digits>.txt`` are keyed by that year as an ``int``;
      * otherwise, for the ``'web'`` and ``'blog'`` genres only, files named
        ``..._<digits>.txt`` are keyed by that number as a ``str`` (e.g. ``'13'``);
      * files matching neither rule contribute nothing.

    Args:
        coca_dir: Directory that contains the genre folders.

    Returns:
        dict: ``{genre: {year_or_file_num: {text_id: text}}}`` where ``text_id``
        is the id string with the leading ``@@`` removed.
    """
    id_prefix = '@@'
    coca_dict = {}
    # Loop through genre folders
    for genre_folder in [f for f in os.listdir(coca_dir) if f.startswith('text_')]:
        genre_path = os.path.join(coca_dir, genre_folder)
        # Skip stray files such as 'text_*.zip' archives; only folders hold texts.
        if not os.path.isdir(genre_path):
            continue
        genre = genre_folder.split('_')[1]  # Extract genre from folder name like 'text_acad_isi'
        print(f'Processing genre: {genre}')
        # Loop through files in genre folder
        for filename in os.listdir(genre_path):
            if not (filename.startswith('text_') and filename.endswith('.txt')):
                continue

            # Work out the second-level key once per file instead of once per line.
            year_match = re.search(r'_(\d{4})\.txt$', filename)
            if year_match:
                file_key = int(year_match.group(1))
            elif genre in ['web', 'blog']:
                file_num_match = re.search(r'_(\d+)\.txt$', filename)
                if not file_num_match:
                    continue
                file_key = file_num_match.group(1)
            else:
                # No usable key for this file, so none of its lines would be stored.
                continue

            with open(os.path.join(genre_path, filename), 'r', encoding='utf-8', errors='ignore') as f:
                for line in f:
                    line = line.strip()
                    if not line.startswith(id_prefix):
                        continue
                    parts = line.split(' ', 1)
                    if len(parts) != 2:
                        continue
                    # Strip exactly the '@@' marker; slicing three characters
                    # would also drop the first digit of the id.
                    id_part = parts[0][len(id_prefix):]
                    text_part = parts[1]
                    # Build nested dict: genre -> year/file_num -> id -> text
                    coca_dict.setdefault(genre, {}).setdefault(file_key, {})[id_part] = text_part
    return coca_dict

In [3]:
#coca_dir='../coca-text/'
#coca_dict = build_coca_dict(coca_dir)

## Trying got3 to read the full corpus

In [4]:
import importlib
import getout_of_text_3 as got3
# Re-import the module in the running kernel (presumably to pick up local
# changes to got3 without restarting -- confirm this is still needed).
importlib.reload(got3)

<module 'getout_of_text_3' from '/Users/ejacquot/Documents/Github/getout_of_text_3/.venv_dev/lib/python3.11/site-packages/getout_of_text_3/__init__.py'>

In [5]:
got3.__version__

'0.2.34'

In [6]:
# Load the full COCA text corpus from disk. Per the recorded outputs in this
# notebook, the result is a dict keyed by genre, then by year (or file number
# for web/blog), whose values are pandas DataFrames.
coca_corpus = got3.read_corpus('../coca-text/')

Genres:   0%|          | 0/8 [00:00<?, ?genre/s]

Processing genre: mag


Genres:  12%|█▎        | 1/8 [00:00<00:06,  1.03genre/s]

Finished genre: mag (total files: 30)
Processing genre: web


Genres:  25%|██▌       | 2/8 [00:01<00:05,  1.03genre/s]

Finished genre: web (total files: 34)
Processing genre: acad


Genres:  38%|███▊      | 3/8 [00:02<00:04,  1.03genre/s]

Finished genre: acad (total files: 30)
Processing genre: news


Genres:  50%|█████     | 4/8 [00:03<00:03,  1.04genre/s]

Finished genre: news (total files: 30)
Processing genre: spok


Genres:  62%|██████▎   | 5/8 [00:04<00:02,  1.05genre/s]

Finished genre: spok (total files: 30)
Processing genre: blog


Genres:  75%|███████▌  | 6/8 [00:05<00:01,  1.06genre/s]

Finished genre: blog (total files: 34)
Processing genre: fic


Genres:  88%|████████▊ | 7/8 [00:06<00:00,  1.10genre/s]

Finished genre: fic (total files: 30)
Processing genre: tvm


Genres: 100%|██████████| 8/8 [00:07<00:00,  1.07genre/s]

Finished genre: tvm (total files: 30)





In [7]:
coca_corpus.keys()

dict_keys(['mag', 'web', 'acad', 'news', 'spok', 'blog', 'fic', 'tvm'])

In [8]:
coca_corpus['mag'].keys()

dict_keys(['1993', '1992', '1990', '1991', '1995', '1994', '1996', '1997', '2008', '2009', '2019', '2018', '2002', '2016', '2017', '2003', '2015', '2001', '2000', '2014', '2010', '2004', '2005', '2011', '2007', '2013', '2012', '2006', '1999', '1998'])

In [9]:
type(coca_corpus['web']['13'])

pandas.core.frame.DataFrame

In [10]:
coca_corpus['web']['13']

Unnamed: 0,text_id,text
0,5026636,<h> The Best ( and Worst ) Ways to Transition ...
1,5026637,<h> Payment Cards Center <p> The Payment Cards...
2,5026638,<h> Article Tools <h> Email <h> Share <p> You ...
3,5026736,<p> Your Javascript is Turned off . You need J...
4,5026737,<p> As October 31st draws nigh and we 're bomb...
...,...,...
2659,5259137,<h> What Shall We Do <p> Songfacts ? : You can...
2660,5259138,<p> This amped-up rocker is the first single f...
2661,5259236,"<p> Out on a dinner with some colleagues , the..."
2662,5259237,"<p> You are in charge of a massive , multinati..."


In [11]:
coca_corpus['fic']['2012']

Unnamed: 0,text_id,text
0,4120102,""" The only problem with leaving four car lengt..."
1,4120103,"Waiting for spring in Reno , Nevada , is like ..."
2,4120106,One hour down . Three hours to go . <p> The af...
3,4120107,"After dinner with the first couple , Ida , Mav..."
4,4120108,"The letter , contemplated and worried about fo..."
...,...,...
862,4162071,Day 1 Daria pays the taxi and walks into Berli...
863,4162072,A young lady who attains the grace of self-dis...
864,4162073,CHAPTER # 1 # Present Day # The sign on the of...
865,4162074,One # Regan Matthews was going to die . # She ...


In [33]:
# Legacy method: load the sample corpus through read_corpora instead of read_corpus.
old_val = got3.read_corpora('../coca-samples-text/', corpora_name='test')

📚 Loading test corpus from ../coca-samples-text/
📂 Processing acad...
  ✅ text_acad.txt: (265, 1)
📂 Processing blog...
  ✅ text_blog.txt: (991, 1)
📂 Processing fic...
  ✅ text_fic.txt: (273, 1)
📂 Processing mag...
  ✅ text_blog.txt: (991, 1)
📂 Processing fic...
  ✅ text_fic.txt: (273, 1)
📂 Processing mag...
  ✅ text_mag.txt: (948, 1)
📂 Processing news...
  ✅ text_news.txt: (871, 1)
📂 Processing spok...
  ✅ text_mag.txt: (948, 1)
📂 Processing news...
  ✅ text_news.txt: (871, 1)
📂 Processing spok...
  ✅ text_spok.txt: (263, 1)
📂 Processing tvm...
  ✅ text_tvm.txt: (233, 1)
📂 Processing web...
  ✅ text_web.txt: (892, 1)

🎯 SUMMARY:
   - test: 8 genres loaded
   - Total corpora in collection: 1
  ✅ text_spok.txt: (263, 1)
📂 Processing tvm...
  ✅ text_tvm.txt: (233, 1)
📂 Processing web...
  ✅ text_web.txt: (892, 1)

🎯 SUMMARY:
   - test: 8 genres loaded
   - Total corpora in collection: 1


____________________________
## Search Keyword 

- using `bovine` as a test keyword across the full COCA corpus
- COMPARE YOUR RESULTS TO THE OUTPUT HERE, IF POSSIBLE: https://www.english-corpora.org/coca/
  - I sometimes get fewer and sometimes more hits! TBD and needs review...


### Comparing parallel vs non-parallel kwic search

- the `n_jobs` parameter automatically uses n-1 processes, i.e. all but one of your CPU cores. This leads to much better performance on large corpora.
- e.g. for `bovine` on the full COCA text corpus, I get (10-1=9 CPU cores):
  - non-parallel: time elapsed: 0 days 00:01:01.157718
  - parallel: time elapsed: 0 days 00:00:22.578978
  - almost 3x faster!
  - ![https://upload.wikimedia.org/wikipedia/commons/8/8c/Cow_%28Fleckvieh_breed%29_Oeschinensee_Slaunger_2009-07-07.jpg](https://upload.wikimedia.org/wikipedia/commons/8/8c/Cow_%28Fleckvieh_breed%29_Oeschinensee_Slaunger_2009-07-07.jpg)

In [12]:
# Time a non-parallel keyword search for 'bovine' over the whole corpus.
# (Other keywords to try: 'etienne' or 'amethyst' -- the result variable still
# carries the name from that earlier experiment.)
# Take timestamps before and after the search to report the elapsed time.
import pandas as pd
before = pd.Timestamp.now()
etienne_results = got3.search_keyword_corpus('bovine', coca_corpus, 
                                            case_sensitive=False,
                                            show_context=True, 
                                            context_words=10,
                                            output='print',
                                            parallel=False)
after = pd.Timestamp.now()
print('time elapsed:', after - before)
etienne_results

🔍 COCA Corpus Search: 'bovine'

� MAG_1993 :
------------------------------
  📝 Text 92: N.J. , company called Enzon has a different technique for **bovine** hemoglobin : It tags the molecule with a polyethylene glycol
  📝 Text 278: them the finest period in Saharan art , the so-called **Bovine** : they drew realistic cows and graceful human beings .
  📝 Text 381: employs a mere 2,639 people . The genetic-engineering industry -- **bovine** growth hormones that produce supercows and in vitro laboratory production
  📝 Text 1047: number of ailments : pasteurella , moraxella , bluetongue , **bovine** sinusoidal respiratory virus , and epizootic hemorrhagic disease . Some
  📝 Text 1269: around four years from now , you 'll notice the **bovine** expressions , @ @ @ @ @ @ @ @
  📝 Text 1270: Obligado ( Simon &; Schuster , $14 ) One balky **bovine** gives only the meagerest supply of milk , so she
  📝 Text 1358: In November , the federal government approved the use of **bovine** somatotropin , 

KeyboardInterrupt: 

In [13]:
# Time the same 'bovine' keyword search with parallel processing enabled
# (note the wider context window: 15 words instead of 10).
# (Other keywords to try: 'etienne' or 'amethyst' -- the result variable still
# carries the name from that earlier experiment.)
# Take timestamps before and after the search to report the elapsed time.
import pandas as pd
before = pd.Timestamp.now()
etienne_results = got3.search_keyword_corpus('bovine', coca_corpus, 
                                            case_sensitive=False,
                                            show_context=True, 
                                            context_words=15,
                                            output='print',
                                            parallel=True)
after = pd.Timestamp.now()
print('time elapsed:', after - before)
etienne_results

🔍 COCA Corpus Search: 'bovine'
🚀 Using parallel processing with 9 processes...

📚 MAG_1993 :
------------------------------
  📝 Text 92: . A South Plainfield , N.J. , company called Enzon has a different technique for **bovine** hemoglobin : It tags the molecule with a polyethylene glycol ( PEG ) chemical group
  📝 Text 278: domesticated cattle , bringing with them the finest period in Saharan art , the so-called **Bovine** : they drew realistic cows and graceful human beings . Because of similarities of hairstyle
  📝 Text 381: the largest biotechnology company , employs a mere 2,639 people . The genetic-engineering industry -- **bovine** growth hormones that produce supercows and in vitro laboratory production of basic fruits and vegetables
  📝 Text 1047: the antibodies , for a number of ailments : pasteurella , moraxella , bluetongue , **bovine** sinusoidal respiratory virus , and epizootic hemorrhagic disease . Some of these can be transmitted
  📝 Text 1269: of the Notre Dame team a

{'mag_1993': [{'text_id': 92,
   'match': 'bovine',
   'context': '. A South Plainfield , N.J. , company called Enzon has a different technique for **bovine** hemoglobin : It tags the molecule with a polyethylene glycol ( PEG ) chemical group',
   'full_text': 'At 10:30 on Monday morning in Operating Room 10 on the third floor of Boston University Medical Cent...'},
  {'text_id': 278,
   'match': 'Bovine',
   'context': 'domesticated cattle , bringing with them the finest period in Saharan art , the so-called **Bovine** : they drew realistic cows and graceful human beings . Because of similarities of hairstyle',
   'full_text': 'Days from Djanet , in the wild and mountainous desert of southeastern Algeria , the pickup blew a ti...'},
  {'text_id': 381,
   'match': 'bovine',
   'context': 'the largest biotechnology company , employs a mere 2,639 people . The genetic-engineering industry -- **bovine** growth hormones that produce supercows and in vitro laboratory production of basic frui

In [14]:
coca_corpus.keys()

dict_keys(['mag', 'web', 'acad', 'news', 'spok', 'blog', 'fic', 'tvm'])

In [15]:
coca_corpus.keys()  # not displayed: only the last expression of a cell is rendered
# probably need to flatten the coca_corpus dict for this to work
# NOTE(review): the recorded output reports per-genre and per-year counts, so
# the nested dict appears to be accepted as-is -- confirm and drop the note above.

# Count case-insensitive occurrences of 'bovine' across the loaded corpus.
got3.keyword_frequency_analysis('bovine', 
                                coca_corpus, 
                                case_sensitive=False)

📊 Frequency Analysis for 'bovine' (case_sensitive=False, loose substring match)
  acad    :    501 hits | 140449282 tokens | 0.04 /10k
  web     :    208 hits | 149036464 tokens | 0.01 /10k
  mag     :    162 hits | 146417442 tokens | 0.01 /10k
  fic     :    109 hits | 142585624 tokens | 0.01 /10k
  blog    :     92 hits | 143156927 tokens | 0.01 /10k
  tvm     :     71 hits | 162287598 tokens | 0.00 /10k
  news    :     68 hits | 143377305 tokens | 0.00 /10k
  spok    :     41 hits | 151501397 tokens | 0.00 /10k
------------------------------------------------------------
TOTAL: 1252 hits across 8 genres (~1178812039 tokens)


{'keyword': 'bovine',
 'total_count': 1252,
 'by_genre': [{'genre': 'acad',
   'count': 501,
   'tokens': 140449282,
   'years': {'2013': {'count': 56, 'tokens': 4216442},
    '2007': {'count': 1, 'tokens': 4951942},
    '2006': {'count': 14, 'tokens': 4613959},
    '2012': {'count': 62, 'tokens': 5221661},
    '2004': {'count': 6, 'tokens': 4602437},
    '2010': {'count': 4, 'tokens': 4491873},
    '2011': {'count': 7, 'tokens': 5273139},
    '2005': {'count': 8, 'tokens': 4500905},
    '2001': {'count': 3, 'tokens': 4525130},
    '2015': {'count': 59, 'tokens': 4306625},
    '2014': {'count': 43, 'tokens': 4109008},
    '2000': {'count': 8, 'tokens': 4671365},
    '2016': {'count': 13, 'tokens': 4736574},
    '2002': {'count': 5, 'tokens': 4652147},
    '2003': {'count': 3, 'tokens': 4644495},
    '2017': {'count': 64, 'tokens': 4836532},
    '1999': {'count': 14, 'tokens': 4586711},
    '1998': {'count': 5, 'tokens': 4686205},
    '1996': {'count': 3, 'tokens': 4702132},
    '1997':

In [16]:
# Demo: sanity-checking keyword search against the raw COCA texts.
# Start by eyeballing a handful of MAG 2018 texts to see how they are formatted.
print('Sample texts from MAG 2018:')
print(coca_corpus['mag']['2018']['text'].head(10).to_list())

# Plain lowercase substring test for 'bovine' over the first 100 texts.
sample_texts = coca_corpus['mag']['2018']['text'].head(100)
hits = []
for t in sample_texts:
    if 'bovine' in str(t).lower():
        hits.append(t)
print(f"Found {len(hits)} sample hits for 'bovine' in first 100 MAG 2018 texts.")

# Case-insensitive regex without word boundaries, counted over every MAG 2018 text.
import re
pattern = re.compile('bovine', re.IGNORECASE)
count = sum(len(pattern.findall(str(t))) for t in coca_corpus['mag']['2018']['text'])
print(f"Total 'bovine' matches in MAG 2018 (no word boundaries): {count}")

# Finally, print each text in which the pattern occurs at least once.
for t in coca_corpus['mag']['2018']['text']:
    if pattern.search(str(t)):
        print(t)
        

Sample texts from MAG 2018:
['<h> Amazon \'s Echo Device Chief on the Risk of Alexa \'s Many Rewards <p> Imagine an always-on digital-assistant who is friendly , will turn off your lights , order from your shopping list , can chat about anything from the World Cup to Japanese anime , and who will also know to cheer you up when you \'re feeling blue . This is part of the array of delights that Toni Reid , Amazon \'s vice president of Alexa experience and Echo devices , thinks about when she considers the future of the smart speaker she \'s overseen from its earliest inception . <p> It starts with the human element . " What surprised us was how many times people were asking questions that did n\'t need an answer , " Reid told the crowd at Fortune \'s Fortune \'s Brainstorm Tech conference in Aspen , Colo on Monday . People so often share things like expressions of loneliness and frustration , or a romantic longing for " Alexa " herself , that their editorial team spends a lot of time thi

In [17]:
coca_corpus['mag']['2018']['text'].head()

0    <h> Amazon 's Echo Device Chief on the Risk of...
1    <h> Walmart Bets On Ever Faster Delivery to Co...
2    <p> Jul 17 , 2018 ( marketresearch.biz via COM...
3    <p> President Trump bungled the Helsinki press...
4    <h> A Sense of Discovery : How the Immune Syst...
Name: text, dtype: object