## Work with our Tri-Gram Files

In [1]:
# import what you need
import pandas as pd
from pathlib import Path
import os
import dask.dataframe as dd

# Verbosity flag. Didn't end up using this much; I just made the notebook talk a lot
verbose=False

# Dataset IDs to combine.
# In the same directory as this notebook you should have a tri-gram CSV
# (<id>-trigrams.csv) and a metadata CSV (<id>-metadata.csv) for each dataset listed
datasets = [
        "14334ee2-a661-5ca1-7186-abd90a3b4208",
        "86467f7b-7428-f968-7307-4ca3512c6116",
        "cbd0318c-73b5-141a-0b64-65a0a4662189"
    ]


## Function to combine multiple CSV files into one

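Combines the tri-gram CSV files of several datasets (all with identical structure) into a single CSV file, adding a `publicationYear` column looked up from each dataset's metadata CSV.
    
    Args:
        input_files (list): List of dataset path prefixes; `-trigrams.csv` and `-metadata.csv` are appended to each prefix to locate its files
        output_file (str): Path where the combined CSV will be saved
        chunk_size (int): Number of rows to process at a time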

In [2]:
def combine_csv_files(input_files, output_file, chunk_size=100000):
    """Combine per-dataset trigram CSVs into one CSV with a publicationYear column.

    For every entry in ``input_files`` this reads ``<entry>-trigrams.csv`` in
    chunks, maps each row's ``id`` to the ``publicationYear`` found in
    ``<entry>-metadata.csv``, and appends the rows to ``output_file``.

    Args:
        input_files (list): Dataset path prefixes as strings (without the
            ``-trigrams.csv`` / ``-metadata.csv`` suffix).
        output_file (str): Path where the combined CSV will be saved. An
            existing file is overwritten.
        chunk_size (int): Number of trigram rows to process at a time.

    Raises:
        ValueError: If ``input_files`` is empty.
        KeyError: If a trigram file lacks a column present in the first file.
    """
    if not input_files:
        raise ValueError("input_files must contain at least one dataset")

    print("\nStarting CSV combination process...")

    # Read only the header row. nrows=0 works even when the first file has no
    # data rows and does not leave a half-consumed chunk reader open.
    header = pd.read_csv(input_files[0] + "-trigrams.csv", nrows=0)

    # Add publicationYear column to header
    columns = list(header.columns) + ['publicationYear']

    # Initialize the output file with headers. Writing through pandas gives the
    # header the same quoting, encoding and line endings as the data rows.
    pd.DataFrame(columns=columns).to_csv(output_file, index=False)

    total_rows = 0

    # Process each input file
    for file_num, file in enumerate(input_files, 1):
        print(f"\nProcessing file #{file_num}: {file}")

        # Read metadata once for this file
        metadata_df = pd.read_csv(file + "-metadata.csv")
        year_mapping = metadata_df.set_index('id')['publicationYear'].to_dict()

        # Process the trigrams file in chunks
        chunks = pd.read_csv(file + "-trigrams.csv", chunksize=chunk_size)

        for i, chunk in enumerate(chunks):
            # Add publication year to the chunk (ids missing from the
            # metadata end up with an empty publicationYear)
            chunk['publicationYear'] = chunk['id'].map(year_mapping)

            # Append to output file. Rows are written without a header, so the
            # column order is pinned to the header written above; otherwise a
            # file with reordered columns would be silently misaligned.
            chunk.to_csv(output_file, mode='a', header=False, index=False,
                         columns=columns)

            # Update progress
            total_rows += len(chunk)
            if (i + 1) % 10 == 0:
                print(f"Processed {total_rows:,} rows...", end='\r')

    print(f"\nCompleted! Total rows processed: {total_rows:,}")

## Function to search the data for phrases
Search through the trigrams in chunks to minimize memory usage.

In [3]:
def search_trigrams(filename, chunk_size=100000):
    """Interactively search the combined trigram CSV for phrases.

    Counts the unique document ids in ``filename`` once, then repeatedly
    prompts for a search term until the user enters ``quit``. For each term it
    scans the ``ngram`` column chunk by chunk (case-insensitive, literal
    substring match), prints summary statistics and a small sample of matching
    rows, and writes a per-document summary (summed ``count`` per ``id`` and
    ``publicationYear``, highest count first) to
    ``<term>_document_summary.csv`` in the current directory. Rows without a
    ``publicationYear`` are left out of that summary because pandas' groupby
    drops missing keys by default.

    Args:
        filename (str): Path to the combined CSV; must contain the columns
            ``id``, ``publicationYear``, ``ngram`` and ``count``.
        chunk_size (int): Number of rows to read at a time.
    """
    print("\nLet's do some searching of your dataset.")

    # The only columns this function ever reads; skipping the rest saves
    # parsing time and memory on wide files.
    needed = ['id', 'publicationYear', 'ngram', 'count']
    key_cols = ['id', 'publicationYear']
    sample_rows = 5  # same number of rows DataFrame.head() shows by default
    # Characters that cannot (or should not) appear in a file name; the search
    # term comes straight from the user and is used to build the output path.
    unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    # Count total unique documents first
    unique_docs = set()
    print("\nCounting unique documents...")
    for chunk in pd.read_csv(filename, usecols=['id'], chunksize=chunk_size):
        unique_docs.update(chunk['id'].unique())

    total_unique_docs = len(unique_docs)
    print(f"Total unique documents: {total_unique_docs:,}")

    while True:
        phrase = input("\nENTER A SEARCH TERM (OR 'quit' to exit)? -->")
        if phrase.strip().lower() == "quit":
            break
        if not phrase.strip():
            # An empty term matches every row and would pull the whole
            # dataset into memory.
            print("\nPlease enter a non-empty search term.")
            continue

        print(f"\nSearching for '{phrase}' in your dataset...")

        # Initialize counters
        total_matches = 0
        matching_docs = set()
        partial_sums = []   # per-chunk (id, publicationYear) -> count totals
        sample_parts = []   # first few matching rows, for display only
        sample_len = 0

        # Process in chunks. dtype=str keeps the ngram column string-typed
        # even for a chunk whose ngrams are all empty or all numeric.
        reader = pd.read_csv(filename, usecols=needed, dtype={'ngram': str},
                             chunksize=chunk_size)
        for chunk in reader:
            # regex=False: the term is a literal phrase, so input such as
            # "c++" or "(" neither crashes nor acts as a pattern.
            is_match = chunk['ngram'].str.contains(
                phrase, case=False, na=False, regex=False)
            hits = chunk[is_match]
            if hits.empty:
                continue

            # Update counters
            total_matches += len(hits)
            matching_docs.update(hits['id'].unique())

            # Keep only per-document totals rather than every matching row,
            # so memory stays bounded by documents, not trigrams.
            partial_sums.append(
                hits.groupby(key_cols)['count'].sum().reset_index())

            if sample_len < sample_rows:
                part = hits[needed].head(sample_rows - sample_len)
                sample_parts.append(part)
                sample_len += len(part)

        # Print results
        print(f"\nSummary Statistics for '{phrase}':")
        print(f"\tTotal trigram matches: {total_matches:,}")
        print(f"\tNumber of unique documents with matches: {len(matching_docs):,}")
        print(f"\tTotal number of unique documents in dataset: {total_unique_docs:,}")

        # Create and save document summary if we found matches
        if partial_sums:
            # A document can span several chunks, so sum the partial totals.
            doc_summary = (
                pd.concat(partial_sums, ignore_index=True)
                .groupby(key_cols)['count'].sum()
                .reset_index()
                .sort_values(['count', 'publicationYear'], ascending=[False, True])
            )

            output_filename = f"{phrase.translate(unsafe_chars)}_document_summary.csv"
            doc_summary.to_csv(output_filename, index=False)
            print(f"\nDocument-level summary saved to: {output_filename}")

            # Display sample of matches
            print("\nSample of matching trigrams:")
            print(pd.concat(sample_parts, ignore_index=True))
        else:
            print("\nNo matches found.")


## Let's build the combined CSV and get some stats

In [None]:
# Notebook driver (what would be "main" in Perl): build the combined CSV if
# needed, then start the interactive phrase search.
print("\nRUNNING ...")

print("\nYou have told me to combine these files: " + str(datasets))

# Specify output file
output = "combined_output.csv"

print("And to output the result to this file: " + output + "\n")

# Check if file exists
if os.path.exists(output):
    # Keep asking until the answer is one of the two offered choices, so a
    # typo or stray whitespace cannot silently reuse a stale file.
    response = ""
    while response not in ("rebuild", "use"):
        response = input(f"FILE {output} ALREADY EXISTS. DO YOU WANT TO REBUILD AND OVERWRITE IT (could take awhile) OR USE IT? (rebuild/use) --> ").strip().lower()
    if response == 'rebuild':
        print("\nGreat, let's recreate the combined file.")
        combine_csv_files(datasets, output)
else:
    print("\nGreat, let's create the combined file.")
    combine_csv_files(datasets, output)

# OK, now we've combined the tri-gram CSV file
# and brought in the publication year from the metadata CSV files
# do you want to search for phrases?
search_trigrams(output)



RUNNING ...

You have told me to combine these files: ['14334ee2-a661-5ca1-7186-abd90a3b4208', '86467f7b-7428-f968-7307-4ca3512c6116', 'cbd0318c-73b5-141a-0b64-65a0a4662189']
And to output the result to this file: combined_output.csv



FILE combined_output.csv ALREADY EXISTS. DO YOU WANT TO REBUILD AND OVERWRITE IT (could take awhile) OR USE IT? (rebuild/use) -->  rebuild



Great, let's recreate the combined file.

Starting CSV combination process...

Processing file #1: 14334ee2-a661-5ca1-7186-abd90a3b4208
Processed 173,000,000 rows...
Processing file #2: 86467f7b-7428-f968-7307-4ca3512c6116
Processed 297,055,732 rows...
Processing file #3: cbd0318c-73b5-141a-0b64-65a0a4662189
Processed 364,777,132 rows...
Completed! Total rows processed: 365,655,558

Let's do some searching of your dataset.

Counting unique documents...
Total unique documents: 104,944



ENTER A SEARCH TERM (OR 'quit' to exit)? --> dewey



Searching for 'dewey' in your dataset...

Summary Statistics for 'dewey':
	Total trigram matches: 105,600
	Number of unique documents with matches: 9,722
	Total number of unique documents in dataset: 104,944

Document-level summary saved to: dewey_document_summary.csv

Sample of matching trigrams:
                                    id  publicationYear  \
0   http://www.jstor.org/stable/431489             1991   
1   http://www.jstor.org/stable/431489             1991   
2   http://www.jstor.org/stable/431489             1991   
3  http://www.jstor.org/stable/3840566             2006   
4  http://www.jstor.org/stable/3840566             2006   

                     ngram  count  
0             as Dewey and      1  
1         Dewey and Cizek.      1  
2            such as Dewey      1  
3  from Dewey's conviction      1  
4    Dewey, and especially      1  



ENTER A SEARCH TERM (OR 'quit' to exit)? --> wittgenstein



Searching for 'wittgenstein' in your dataset...

Summary Statistics for 'wittgenstein':
	Total trigram matches: 159,018
	Number of unique documents with matches: 9,990
	Total number of unique documents in dataset: 104,944

Document-level summary saved to: wittgenstein_document_summary.csv

Sample of matching trigrams:
                                    id  publicationYear  \
0  http://www.jstor.org/stable/2953772             1997   
1  http://www.jstor.org/stable/2953772             1997   
2  http://www.jstor.org/stable/2953772             1997   
3  http://www.jstor.org/stable/2953772             1997   
4  http://www.jstor.org/stable/2953772             1997   

                          ngram  count  
0      Wittgenstein. London and      1  
1       S. Wittgenstein's Place      1  
2  Practices: A Wittgensteinian      1  
3       to Wittgenstein. London      1  
4   Wittgensteinian Approach to      1  



ENTER A SEARCH TERM (OR 'quit' to exit)? --> pragmatism



Searching for 'pragmatism' in your dataset...
