# Chunking Beige Books

In [9]:
import os
import nltk

# Download NLTK tokenizer data if not already done
#nltk.download('punkt')

# Folder that holds the source Beige Book .txt files
directory_path = "C:/Users/MCOB PHD 14/Dropbox/Charlie's Dissertation/Beige Books"

# Function to split text into 256-word chunks
def split_text_into_chunks(text, chunk_size=256):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` tokens.

    The text is tokenized with ``nltk.word_tokenize`` and each chunk is the
    tokens re-joined with single spaces, so the original whitespace is not
    preserved and every token the tokenizer emits counts toward the size.

    Args:
        text: The string to split.
        chunk_size: Maximum number of tokens per chunk; must be at least 1.

    Returns:
        A list of chunk strings in document order. The final chunk may be
        shorter than ``chunk_size``; text with no tokens yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # A negative step would make range() empty and silently drop the whole
    # document, and a zero step fails with an unrelated-looking range()
    # error, so reject bad sizes up front.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Tokenize text into words
    words = nltk.word_tokenize(text)

    # Walk the token list in strides of chunk_size and re-join each slice
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# Function to read all text files from a directory, process them, and save the chunks
def process_beige_books(directory_path):
    """Chunk every ``.txt`` file in ``directory_path`` and write the chunks to disk.

    Each source file is read as UTF-8, split with ``split_text_into_chunks``
    using its default chunk size, and every chunk is written to
    ``<directory_path>/chunks/<source stem>_chunk_<k>.txt`` with ``k``
    starting at 1. A chunk file that already exists under the same name is
    overwritten. Progress is printed for every file.

    Args:
        directory_path: Directory containing the source ``.txt`` files. Only
            its top level is listed, so files inside the ``chunks``
            subdirectory are not picked up as input.

    Returns:
        None. The ``chunks`` subdirectory is created even when no source
        files are found.
    """
    # Sort so files are processed in the same order on every run and
    # platform; os.listdir() makes no ordering guarantee.
    text_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.txt'))
    total_files = len(text_files)

    print(f"Found {total_files} text files in the directory.")

    # Create the output directory once, rather than once per chunk written.
    output_directory = os.path.join(directory_path, 'chunks')
    os.makedirs(output_directory, exist_ok=True)

    for file_count, filename in enumerate(text_files, 1):
        print(f"Processing file {file_count}/{total_files}: {filename}")

        # Read the file content
        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Split the text into chunks (default size)
        chunks = split_text_into_chunks(text)

        # Chunk files are named after the source file's stem and numbered from 1
        stem = os.path.splitext(filename)[0]
        for chunk_number, chunk in enumerate(chunks, 1):
            chunk_file_path = os.path.join(output_directory, f"{stem}_chunk_{chunk_number}.txt")
            with open(chunk_file_path, 'w', encoding='utf-8') as chunk_file:
                chunk_file.write(chunk)

        print(f"Completed processing of file {file_count}/{total_files}: {filename}")

    print("Processing complete.")


In [10]:
# Run the process: chunk every .txt file found directly under directory_path.
# Chunk files are written to its 'chunks' subfolder and progress is printed
# for each source file.
process_beige_books(directory_path)

Found 6116 text files in the directory.
Processing file 1/6116: 1970_at (1).txt
Completed processing of file 1/6116: 1970_at (1).txt
Processing file 2/6116: 1970_at (2).txt
Completed processing of file 2/6116: 1970_at (2).txt
Processing file 3/6116: 1970_at (3).txt
Completed processing of file 3/6116: 1970_at (3).txt
Processing file 4/6116: 1970_at (4).txt
Completed processing of file 4/6116: 1970_at (4).txt
Processing file 5/6116: 1970_at (5).txt
Completed processing of file 5/6116: 1970_at (5).txt
Processing file 6/6116: 1970_at (6).txt
Completed processing of file 6/6116: 1970_at (6).txt
Processing file 7/6116: 1970_at (7).txt
Completed processing of file 7/6116: 1970_at (7).txt
Processing file 8/6116: 1970_at (8).txt
Completed processing of file 8/6116: 1970_at (8).txt
Processing file 9/6116: 1970_bo (1).txt
Completed processing of file 9/6116: 1970_bo (1).txt
Processing file 10/6116: 1970_bo (2).txt
Completed processing of file 10/6116: 1970_bo (2).txt
Processing file 11/6116: 197

In [1]:
# Randomly select 1,000 chunks

import os
import random
import shutil

# Source folder with all chunk files, and destination folder for the random sample
chunks_directory = "C:/Users/MCOB PHD 14/Dropbox/Charlie's Dissertation/Beige Books/chunks"
selected_chunks_directory = "C:/Users/MCOB PHD 14/Dropbox/Charlie's Dissertation/Beige Books/selected_chunks"

# Function to randomly select and copy chunks
def select_and_copy_chunks(chunks_directory, selected_chunks_directory, num_chunks=1000, seed=None):
    """Copy a random sample of ``.txt`` chunk files into a separate directory.

    Args:
        chunks_directory: Directory containing the ``.txt`` chunk files.
        selected_chunks_directory: Destination directory; created if missing.
            Files already present there are left in place, so point this at
            an empty directory to end up with exactly ``num_chunks`` files.
        num_chunks: Number of distinct files to sample (without replacement).
        seed: Optional seed for a reproducible sample. With the same seed and
            the same set of chunk files, the same files are selected. When
            ``None`` (the default) the module-level ``random`` generator is
            used, exactly as before.

    Raises:
        ValueError: If ``num_chunks`` is negative or exceeds the number of
            ``.txt`` files available in ``chunks_directory``.
    """
    # Sorted so that a given seed maps to the same files regardless of the
    # order os.listdir() happens to return.
    chunk_files = sorted(f for f in os.listdir(chunks_directory) if f.endswith('.txt'))

    # Fail with a clear message before touching the destination directory.
    if not 0 <= num_chunks <= len(chunk_files):
        raise ValueError(
            f"Cannot select {num_chunks} chunks: {len(chunk_files)} .txt files "
            f"available in {chunks_directory}."
        )

    # Ensure the selected chunks directory exists
    os.makedirs(selected_chunks_directory, exist_ok=True)

    # A private Random instance keeps an explicit seed from altering the
    # global random state; without a seed, fall back to the global generator.
    sampler = random if seed is None else random.Random(seed)
    selected_files = sampler.sample(chunk_files, num_chunks)

    # Copy the selected chunks to the new directory
    for file in selected_files:
        src_path = os.path.join(chunks_directory, file)
        dst_path = os.path.join(selected_chunks_directory, file)
        shutil.copy(src_path, dst_path)

    print(f"Selected {num_chunks} chunks and copied them to {selected_chunks_directory}.")

# Run the selection and copying process with the default sample size
# (num_chunks=1000); the destination folder is created if it does not exist.
select_and_copy_chunks(chunks_directory, selected_chunks_directory)


Selected 1000 chunks and copied them to C:/Users/MCOB PHD 14/Dropbox/Charlie's Dissertation/Beige Books/selected_chunks.


In [2]:
# Import the randomly selected chunks into a DataFrame
import pandas as pd

# Folder containing the randomly selected chunk files
selected_chunks_directory = "C:/Users/MCOB PHD 14/Dropbox/Charlie's Dissertation/Beige Books/selected_chunks"

# Function to read all text files from a directory and return as a DataFrame
def read_chunks_into_dataframe(directory_path):
    """Load every ``.txt`` file in ``directory_path`` into a DataFrame.

    Args:
        directory_path: Directory whose top-level ``.txt`` files are read
            as UTF-8 text.

    Returns:
        A ``pandas.DataFrame`` with one row per file, ordered by filename,
        and two columns: ``filename`` (the file's base name) and ``text``
        (the file's full contents). A directory without ``.txt`` files
        yields an empty frame that still has both columns.
    """
    # Sort so row order does not depend on the order os.listdir() returns.
    text_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.txt'))

    # Read each file and store its content along with the filename
    records = []
    for filename in text_files:
        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            records.append({'filename': filename, 'text': file.read()})

    # Name the columns explicitly: built from an empty list, the frame would
    # otherwise have no columns and df['filename'] would raise KeyError.
    return pd.DataFrame(records, columns=['filename', 'text'])

# Read the selected chunks into a DataFrame: one row per .txt file in
# selected_chunks_directory, with 'filename' and 'text' columns
selected_chunks_df = read_chunks_into_dataframe(selected_chunks_directory)

# Display the first rows of the DataFrame as a quick sanity check
selected_chunks_df.head()

Unnamed: 0,filename,text
0,1970_at (7)_chunk_1.txt,"June 17 , 1970 Summary of Findings Sixth Distr..."
1,1970_bo (4)_chunk_2.txt,of jewelry manufacturers in southern Massachus...
2,1970_ch (1)_chunk_4.txt,rates and terms have not eased much . There ar...
3,1970_ch (5)_chunk_2.txt,over the slowness of payments on receivables b...
4,1970_ch (7)_chunk_2.txt,"outside Chicago , department stores report int..."


In [6]:
# Year column: the first 4 characters of the filename, parsed straight into a
# datetime (a bare year becomes January 1 of that year)
selected_chunks_df['year'] = pd.to_datetime(
    selected_chunks_df['filename'].str.slice(0, 4),
    format='%Y',
)

# District column: the filename's 6th and 7th characters
selected_chunks_df['district'] = selected_chunks_df['filename'].str.slice(5, 7)

# Display the updated DataFrame
selected_chunks_df.head()

Unnamed: 0,filename,text,year,district
0,1970_at (7)_chunk_1.txt,"June 17 , 1970 Summary of Findings Sixth Distr...",1970-01-01,at
1,1970_bo (4)_chunk_2.txt,of jewelry manufacturers in southern Massachus...,1970-01-01,bo
2,1970_ch (1)_chunk_4.txt,rates and terms have not eased much . There ar...,1970-01-01,ch
3,1970_ch (5)_chunk_2.txt,over the slowness of payments on receivables b...,1970-01-01,ch
4,1970_ch (7)_chunk_2.txt,"outside Chicago , department stores report int...",1970-01-01,ch


In [17]:
# Count how many chunks fall in each year and keep the result in a new DataFrame
year_chunk_counts = (
    selected_chunks_df['year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='chunk_count')
)

year_chunk_counts.describe()

# We would expect about 95% of years to be within 2 standard deviations of the mean
    # That means between 8 and 28 chunks per year
    # Only two years fall outside that range, which is expected with 55 years of data

Unnamed: 0,year,chunk_count
count,55,55.0
mean,1996-12-31 14:50:10.909090944,18.181818
min,1970-01-01 00:00:00,6.0
25%,1983-07-02 12:00:00,14.5
50%,1997-01-01 00:00:00,18.0
75%,2010-07-02 12:00:00,22.0
max,2024-01-01 00:00:00,29.0
std,,5.117475


In [18]:
# Count how many chunks fall in each district, one row per district code
district_chunk_counts = (
    selected_chunks_df['district']
    .value_counts()
    .rename_axis('district')
    .reset_index(name='chunk_count')
)

district_chunk_counts.describe()

Unnamed: 0,chunk_count
count,14.0
mean,71.428571
std,24.196994
min,1.0
25%,71.0
50%,77.5
75%,83.0
max,99.0


In [19]:
# Show the full per-district table (one row per distinct district code)
district_chunk_counts

Unnamed: 0,district,chunk_count
0,ns,99
1,bo,92
2,ph,88
3,cl,84
4,kc,80
5,at,79
6,mn,79
7,da,76
8,ny,73
9,ch,71
