In [1]:
pip install getout_of_text_3


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### Demonstration notebook for functionality to include in the GOT3 tool

- Working through some steps with `pandas` and `nltk`, to be rolled into the `getout_of_text_3` toolset later in order to streamline searching the COCA corpora

In [12]:
import getout_of_text_3 as got3
got3.__version__

'0.1.3'

In [13]:
import pandas as pd
# Import necessary libraries for search functionality
import nltk
import re
from collections import defaultdict, Counter

In [18]:
# Download NLTK data.
# nltk.download() reports a failed download through its boolean return value
# rather than by raising, so the return values are collected and checked; the
# try/except only covers unexpected errors raised by the call itself.
try:
    failed_downloads = [
        resource
        for resource in ('punkt_tab', 'stopwords')
        if not nltk.download(resource, quiet=True)
    ]
except Exception as e:  # not a bare `except:`, so Ctrl-C / kernel interrupt still propagate
    print(f"⚠️ NLTK download may have failed, but will continue ({e})")
else:
    if failed_downloads:
        print(f"⚠️ NLTK download may have failed, but will continue (not downloaded: {failed_downloads})")
    else:
        print("✅ NLTK libraries ready!")

# Test tokenization
try:
    test_words = nltk.word_tokenize("This is a test.")
    print(f"✅ Tokenization working: {test_words}")
except Exception as e:
    print(f"⚠️ Tokenization issue: {e}")
    # NOTE(review): no split()-based fallback is implemented in the cells visible
    # here; the collocate code calls nltk.word_tokenize unconditionally.
    print("Will use simple split() method as fallback")

✅ NLTK libraries ready!
✅ Tokenization working: ['This', 'is', 'a', 'test', '.']


## Reading the COCA db and text files
- Each set of files is stored in a dictionary with the genre as the key and a DataFrame as the value

In [14]:
# Genre codes of the COCA sample files (a plain list, despite the `_dict` suffix).
genre_dict = ['acad', 'blog', 'fic', 'mag', 'news', 'spok', 'tvm', 'web']

# Per-genre containers, keyed by genre code and filled by the loading cell.
db_df, db_text = {}, {}

In [None]:
# Load the two COCA sample files of every genre into db_df / db_text.
# Both file kinds are read with identical parser settings, so a single inner
# loop over (prefix, directory, destination) replaces two copy-pasted
# try/except blocks. Printed messages and resulting dict contents are unchanged.
coca_sources = (
    # (file prefix, directory, destination dict)
    ("db", "../coca-samples-db", db_df),
    ("text", "../coca-samples-text", db_text),
)

for genre in genre_dict:
    print(f"📂 Processing {genre}...")

    for prefix, directory, target in coca_sources:
        try:
            target[genre] = pd.read_csv(
                f"{directory}/{prefix}_{genre}.txt",
                sep="\t",
                header=None,
                names=["text"],
                on_bad_lines='skip',  # skip malformed rows instead of aborting the read
                quoting=3,            # 3 == csv.QUOTE_NONE: quote characters are ordinary text
            )
            print(f"  ✅ {prefix}_{genre}.txt: {target[genre].shape}")
        except Exception as e:
            # Best effort: report the failure and keep going so one unreadable
            # file does not prevent the remaining files from loading.
            print(f"  ❌ Error reading {prefix}_{genre}: {e}")

print("\n🎯 SUMMARY:")
print(f"   - db_df: {len(db_df)} genres loaded")
print(f"   - db_text: {len(db_text)} genres loaded")
print("   - Processed each genre exactly once ✅")

Reading files for... acad
  db_acad.txt: (1419500, 1)
  text_acad.txt: (265, 1)
Reading files for... blog
  db_acad.txt: (1419500, 1)
  text_acad.txt: (265, 1)
Reading files for... blog
  db_blog.txt: (1586094, 1)
  text_blog.txt: (991, 1)
Reading files for... fic
  db_blog.txt: (1586094, 1)
  text_blog.txt: (991, 1)
Reading files for... fic
  db_fic.txt: (1405902, 1)
  text_fic.txt: (273, 1)
Reading files for... mag
  db_fic.txt: (1405902, 1)
  text_fic.txt: (273, 1)
Reading files for... mag
  db_mag.txt: (1567102, 1)
  text_mag.txt: (948, 1)
Reading files for... news
  db_mag.txt: (1567102, 1)
  text_mag.txt: (948, 1)
Reading files for... news
  db_news.txt: (1389753, 1)
  text_news.txt: (871, 1)
Reading files for... spok
  db_news.txt: (1389753, 1)
  text_news.txt: (871, 1)
Reading files for... spok
  db_spok.txt: (1160506, 1)
  text_spok.txt: (263, 1)
Reading files for... tvm
  db_spok.txt: (1160506, 1)
  text_spok.txt: (263, 1)
Reading files for... tvm
  db_tvm.txt: (1567561, 1)
 

## For collocate or keyword searches, we can use the following approach:
1. Loop through each genre.
2. For each genre, scan the `text` column of that genre's DataFrame and record every match of the search string.
3. Print the results in a readable format.

NLTK can be used for this wherever it makes things easier (e.g. for tokenization).

In [None]:
def search_keyword_corpus(keyword, db_dict, case_sensitive=False, show_context=True, context_words=5):
    """
    Search for a keyword across all COCA genres and display results elegantly.

    The keyword is regex-escaped and matched between word boundaries, so it is
    treated as a literal string and is not found inside longer words.

    Parameters:
    - keyword: The word/phrase to search for
    - db_dict: Dictionary mapping genre -> DataFrame with a 'text' column
      (either db_df or db_text)
    - case_sensitive: Whether to perform case-sensitive search
    - show_context: Whether to print and record the surrounding context of
      every match
    - context_words: Number of whitespace-separated words to show on each side
      of a match; 0 or less shows the match alone

    Returns:
    - Dictionary with search results by genre. Only genres with at least one
      hit appear as keys.
      * show_context=True: one record per match with keys
        'text_id', 'match', 'context', 'full_text'.
      * show_context=False: one record per matching row with keys
        'text_id', 'matches' (match count in that row), 'full_text'.
      'full_text' is the row text, cut to 100 characters plus "..." when longer.
    """

    print(f"🔍 COCA Corpus Search: '{keyword}'")
    print("=" * 60)

    results = defaultdict(list)
    total_hits = 0

    # Prepare search pattern
    flags = 0 if case_sensitive else re.IGNORECASE
    pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', flags)

    # Search through each genre
    for genre, df in db_dict.items():
        genre_hits = 0
        print(f"\n📚 {genre.upper()} Genre:")
        print("-" * 30)

        for idx, text in df['text'].items():
            text_str = str(text)

            # Scan the row once; the match objects provide both the count and
            # the spans needed for context extraction.
            matches = list(pattern.finditer(text_str))
            if not matches:
                continue

            genre_hits += len(matches)
            preview = text_str[:100] + "..." if len(text_str) > 100 else text_str

            if not show_context:
                results[genre].append({
                    'text_id': idx,
                    'matches': len(matches),
                    'full_text': preview
                })
                continue

            for match in matches:
                start, end = match.span()
                matched_word = text_str[start:end]

                if context_words > 0:
                    words_before = text_str[:start].split()[-context_words:]
                    words_after = text_str[end:].split()[:context_words]
                else:
                    # A [-0:] slice would return the whole left context, so a
                    # zero-width window is handled explicitly.
                    words_before, words_after = [], []

                # Assemble the display directly. Ellipses that are part of the
                # corpus text are kept; strip() only removes the padding left
                # by an empty side.
                context_display = (
                    f"{' '.join(words_before)} **{matched_word}** {' '.join(words_after)}"
                ).strip()

                results[genre].append({
                    'text_id': idx,
                    'match': matched_word,
                    'context': context_display,
                    'full_text': preview
                })

                print(f"  📝 Text {idx}: {context_display}")

        if genre_hits > 0:
            print(f"  ✅ Found {genre_hits} occurrence(s) in {genre}")
        else:
            print(f"  ❌ No matches found in {genre}")

        total_hits += genre_hits

    print("\n🎯 SUMMARY:")
    print(f"Total hits across all genres: {total_hits}")
    print(f"Genres with matches: {len([g for g in results if results[g]])}")

    return dict(results)

# Frequency helper: per-genre counts and relative frequency of a keyword
def keyword_frequency_analysis(keyword, db_dict, case_sensitive=False):
    """
    Count a keyword in every genre and report its rate per 1000 words.

    The keyword is regex-escaped and matched between word boundaries; the word
    total of a genre is the number of whitespace-separated tokens in its
    'text' column. One summary line is printed per genre.

    Returns a dict mapping genre -> {'count', 'total_words', 'freq_per_1000'},
    with 'freq_per_1000' rounded to three decimals (0 when the genre has no words).
    """
    print(f"📊 Frequency Analysis for '{keyword}'")
    print("=" * 50)

    regex_flags = 0 if case_sensitive else re.IGNORECASE
    matcher = re.compile(r'\b' + re.escape(keyword) + r'\b', regex_flags)

    stats_by_genre = {}

    for genre, frame in db_dict.items():
        documents = [str(value) for value in frame['text']]
        n_words = sum(len(doc.split()) for doc in documents)
        n_hits = sum(len(matcher.findall(doc)) for doc in documents)

        # Rate per 1000 words; guard against genres without any words
        if n_words > 0:
            rate = n_hits / n_words * 1000
        else:
            rate = 0

        stats_by_genre[genre] = {
            'count': n_hits,
            'total_words': n_words,
            'freq_per_1000': round(rate, 3),
        }

        print(f"{genre:8s}: {n_hits:4d} occurrences | {rate:6.3f} per 1000 words")

    return stats_by_genre

print("✅ Search functions created successfully!")

✅ Search functions created successfully!


In [None]:
# Example 1: keyword-in-context search for a legal term over every genre in db_text
keyword = "textual"
search_results = search_keyword_corpus(
    keyword,
    db_text,
    case_sensitive=False,
    show_context=True,
    context_words=5,
)

🔍 COCA Corpus Search: 'textual'

📚 ACAD Genre:
------------------------------
  📝 Text 4: New Testaments , of multiple **textual** layers . In fact ,
  📝 Text 4: little evolving world of complex **textual** strata . As in The
  📝 Text 4: its own complicated pastiche of **textual** fragments recounted by at least
  📝 Text 31: basis for multiple languages -- **textual** , graphic , photographic ,
  📝 Text 73: describes television as " the **textual** technology of information theory ,
  📝 Text 78: a chorus-commentary underline the irritating **textual** bombardment , while a stereophonic
  📝 Text 87: distinction between an intrinsic , **textual** " you " -- a
  📝 Text 87: this page " is both **textual** and extratextual : it refers
  📝 Text 87: play with the location ( **textual** and/or extratextual ) of the
  📝 Text 87: there is some evidence ( **textual** or historical ) to the
  📝 Text 87: there is some evidence ( **textual** or historical ) to the
  📝 Text 87: that involves discussi

In [24]:
# Example 2: per-genre frequency of the keyword set in the search example
print(f"\n{'=' * 60}")
freq_results = keyword_frequency_analysis(
    keyword,
    db_text,
    case_sensitive=False,
)


📊 Frequency Analysis for 'textual'
acad    :   21 occurrences |  0.015 per 1000 words
blog    :    2 occurrences |  0.001 per 1000 words
fic     :    1 occurrences |  0.001 per 1000 words
mag     :    0 occurrences |  0.000 per 1000 words
fic     :    1 occurrences |  0.001 per 1000 words
mag     :    0 occurrences |  0.000 per 1000 words
news    :    0 occurrences |  0.000 per 1000 words
spok    :    0 occurrences |  0.000 per 1000 words
tvm     :    0 occurrences |  0.000 per 1000 words
news    :    0 occurrences |  0.000 per 1000 words
spok    :    0 occurrences |  0.000 per 1000 words
tvm     :    0 occurrences |  0.000 per 1000 words
web     :    2 occurrences |  0.001 per 1000 words
web     :    2 occurrences |  0.001 per 1000 words


In [61]:
def find_collocates(keyword, db_dict, window_size=5, min_freq=2, case_sensitive=False):
    """
    Find words that frequently appear near the keyword (collocates)

    Every text is tokenized with nltk.word_tokenize and the keyword is compared
    against single tokens, so it has to be exactly one token; a keyword that
    the tokenizer would split (for example a multi-word phrase) is not expected
    to match anything.

    Parameters:
    - keyword: Target word to find collocates for
    - db_dict: Dictionary mapping genre -> DataFrame with a 'text' column
    - window_size: Number of tokens to look at on each side
    - min_freq: Minimum frequency for a word to be listed among the overall
      collocates; in the per-genre listing, words below it are still shown
      but prefixed with "* "
    - case_sensitive: Whether to perform case-sensitive search; when False
      the texts are lower-cased, so the collected collocates are lower-case too

    Returns:
    - Dictionary with two keys:
      'all_collocates': dict of collocate -> count summed over all genres
      'by_genre': dict of genre -> Counter of collocates for that genre
    """
    print(f"🔗 Collocate Analysis for '{keyword}' (window: ±{window_size} words)")
    print("=" * 60)

    # Normalise the keyword once; in the case-insensitive path the texts are
    # lower-cased before tokenizing, so tokens can be compared to it directly.
    target = keyword if case_sensitive else keyword.lower()

    all_collocates = Counter()
    genre_collocates = {}

    for genre, df in db_dict.items():
        print(f"\n📚 {genre.upper()} Genre Collocates:")

        # Create a fresh counter for each genre
        genre_counter = Counter()
        keyword_instances = 0

        for text in df['text']:
            text_str = str(text) if case_sensitive else str(text).lower()
            words = nltk.word_tokenize(text_str)

            # Find all positions of the keyword
            keyword_positions = [i for i, word in enumerate(words) if word == target]
            keyword_instances += len(keyword_positions)

            # Extract collocates around each keyword occurrence
            for pos in keyword_positions:
                start = max(0, pos - window_size)
                end = min(len(words), pos + window_size + 1)

                # Surrounding tokens (the keyword itself excluded), keeping only
                # purely alphabetic tokens longer than two characters
                context_words = [
                    w for w in words[start:pos] + words[pos + 1:end]
                    if w.isalpha() and len(w) > 2
                ]

                genre_counter.update(context_words)
                all_collocates.update(context_words)

        # Store the results for this genre
        genre_collocates[genre] = genre_counter

        # Display top collocates for this genre
        top_collocates = genre_counter.most_common(10)
        if top_collocates:
            print(f"  Found {keyword_instances} instances of '{keyword}' in {genre}")
            # Show all results, but mark those below min_freq
            for word, freq in top_collocates:
                marker = "  " if freq >= min_freq else "* "
                print(f"{marker}{word:15s}: {freq:3d} times")
        else:
            print(f"  Found {keyword_instances} instances, but no significant collocates")

    print(f"\n🎯 TOP OVERALL COLLOCATES (min frequency: {min_freq}):")
    print("-" * 40)
    # Only the 20 most common words are considered; min_freq filters within them
    for word, freq in all_collocates.most_common(20):
        if freq >= min_freq:
            print(f"{word:15s}: {freq:3d} occurrences")

    return {
        'all_collocates': dict(all_collocates),
        'by_genre': dict(genre_collocates),
    }

# Example 3: collocates of 'help' within a ±3-token window (CLEANED VERSION)
print(f"\n{'=' * 60}")
print("🔧 CLEANED COLLOCATE ANALYSIS:")
collocate_results = find_collocates(
    'help',
    db_text,
    window_size=3,
    min_freq=1,
)


🔧 CLEANED COLLOCATE ANALYSIS:
🔗 Collocate Analysis for 'help' (window: ±3 words)

📚 ACAD Genre Collocates:
  Found 434 instances of 'help' in acad
  the            : 126 times
  and            :  67 times
  can            :  56 times
  that           :  42 times
  with           :  36 times
  students       :  36 times
  will           :  28 times
  may            :  27 times
  for            :  27 times
  them           :  20 times

📚 BLOG Genre Collocates:
  Found 701 instances of 'help' in blog
  the            : 189 times
  you            :  98 times
  and            :  97 times
  that           :  69 times
  can            :  55 times
  with           :  51 times
  will           :  50 times
  for            :  47 times
  get            :  33 times
  your           :  33 times

📚 FIC Genre Collocates:
  Found 412 instances of 'help' in fic
  the            :  80 times
  you            :  74 times
  could          :  44 times
  and            :  41 times
  with           :  32 tim

### Think about how these results are best displayed to, and consumed by, the user

The `got3` tool should provide the following steps to make working with the COCA corpus from BYU easier:

1. read the database files `got3.read_corpora(dir_of_text_files,corpora_name)`
2. perform collocate analysis using `got3.find_collocates(keyword, db_dict, window_size, min_freq, case_sensitive)`
3. perform keyword search using `got3.search_keyword_corpus(keyword, db_dict, case_sensitive, show_context, context_words)`
4. perform keyword frequency analysis using `got3.keyword_frequency_analysis(keyword, db_dict, case_sensitive)`