Calculate the number of words in the texts of each year (total and standard deviation across files)

In [6]:
import os
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize

# Fetch the 'punkt' tokenizer package that word_tokenize is used with below
nltk.download("punkt")

def calculate_folder_word_count_and_std(base_path):
    """Summarise per-file token counts for every subfolder of ``base_path``.

    Each immediate subfolder is one group (reported under the ``'Year'``
    key). Every regular file directly inside it is read as UTF-8 and
    tokenized with ``word_tokenize``; the length of the token list is that
    file's count.

    Parameters
    ----------
    base_path : str
        Directory whose subfolders are summarised. Plain files directly in
        ``base_path`` are ignored.

    Returns
    -------
    list of dict
        One dict per subfolder that contains at least one file, ordered by
        folder name, with keys ``'Year'`` (folder name), ``'Total Word
        Count'`` (sum of the per-file counts) and ``'Standard Deviation'``
        (``pandas.Series.std`` of the per-file counts; with pandas' default
        ``ddof=1`` this is NaN when the folder holds a single file).

    Raises
    ------
    UnicodeDecodeError
        If a file is not valid UTF-8.

    Notes
    -----
    NOTE(review): ``word_tokenize`` usually returns punctuation marks as
    separate tokens, so these totals are presumably higher than a
    whitespace-based word count -- confirm before comparing against counts
    produced with ``str.split``.
    """
    stats = []

    # Sorted so the rows come out in a stable, name-ordered sequence rather
    # than in whatever order the filesystem returns entries.
    for folder in sorted(os.listdir(base_path)):
        folder_path = os.path.join(base_path, folder)
        if not os.path.isdir(folder_path):
            continue

        word_counts = []
        for filename in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, filename)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
            # Tokenize after the handle is closed; only the text is needed.
            word_counts.append(len(word_tokenize(text)))

        # Folders without any files contribute no row at all.
        if not word_counts:
            continue

        stats.append({
            'Year': folder,
            'Total Word Count': sum(word_counts),
            'Standard Deviation': pd.Series(word_counts).std(),
        })

    return stats

def main(base_path=r"D:\Judicial_corpus"):
    """Compute per-folder word statistics and save them as a CSV file.

    Parameters
    ----------
    base_path : str, optional
        Corpus root passed to ``calculate_folder_word_count_and_std``.
        Defaults to the location that was previously hard-coded, so a bare
        ``main()`` call behaves as before.

    Side effects
    ------------
    When statistics are returned, writes
    ``yearly_word_counts_and_std.csv`` inside ``base_path`` (without the
    DataFrame index) and prints the path. Otherwise prints a notice and
    writes nothing.
    """
    results = calculate_folder_word_count_and_std(base_path)

    if not results:
        print("No subfolders found or no word counts to process.")
        return

    # One row per subfolder, columns taken from the result dict keys.
    df = pd.DataFrame(results)

    csv_path = os.path.join(base_path, 'yearly_word_counts_and_std.csv')
    df.to_csv(csv_path, index=False)
    print(f"Data saved to {csv_path}")

# Run the word-count summary when this code is executed as the main module
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yiwen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Data saved to D:\Judicial_corpus\yearly_word_counts_and_std.csv


Search for attitudinal adverbs and calculate their raw and normalized (per million words) frequencies

In [9]:

import csv
from collections import defaultdict, Counter

# Attitudinal adverbs to look for (lower-case, matched against lower-cased words)
adverbs = [
    "appropriately",
    "correctly",
    "fortunately",
    "importantly",
    "improperly",
    "properly",
    "unfortunately",
]

def count_words_and_adverbs(text, adverb_list):
    """Count the words in ``text`` and the occurrences of each target adverb.

    Parameters
    ----------
    text : str
        Text to analyse. Words are the whitespace-separated chunks returned
        by ``text.split()``.
    adverb_list : iterable of str
        Adverbs to look for. Words are lower-cased before comparison, so the
        entries need to be lower-case to match.

    Returns
    -------
    tuple of (int, collections.Counter)
        The number of whitespace-separated words, and a Counter mapping each
        adverb that was found to its number of occurrences.
    """
    def _trim_edges(token):
        # Drop leading/trailing characters that are not letters or digits, so
        # that "Clearly," / "properly." / "(clearly)" match their adverb.
        start, end = 0, len(token)
        while start < end and not token[start].isalnum():
            start += 1
        while end > start and not token[end - 1].isalnum():
            end -= 1
        return token[start:end]

    # Set membership keeps the per-word lookup constant-time.
    targets = set(adverb_list)
    words = text.split()

    adverb_count = Counter()
    for word in words:
        candidate = _trim_edges(word).lower()
        if candidate in targets:
            adverb_count[candidate] += 1

    # The word total is still the plain whitespace count; only the matching
    # ignores punctuation.
    return len(words), adverb_count

def process_directory(directory, adverb_list):
    """Aggregate word and adverb counts over the ``.txt`` files of a folder.

    Every entry of ``directory`` whose name ends with ``.txt`` is read as
    UTF-8 and passed to ``count_words_and_adverbs``; the per-file results
    are summed.

    Returns
    -------
    tuple of (int, collections.Counter)
        Total word count and combined adverb counts for the folder.
    """
    words_in_dir = 0
    hits_in_dir = Counter()

    txt_names = (name for name in os.listdir(directory) if name.endswith(".txt"))
    for name in txt_names:
        txt_path = os.path.join(directory, name)
        with open(txt_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
        n_words, hits = count_words_and_adverbs(contents, adverb_list)
        words_in_dir += n_words
        hits_in_dir.update(hits)

    return words_in_dir, hits_in_dir

def main(base_directory, adverb_list, output_path="adverbs_over_years.csv"):
    """Compute yearly adverb frequencies and write them to one CSV file.

    Each subdirectory of ``base_directory`` is treated as one year and
    processed with ``process_directory``. For every adverb the raw count and
    the count per million words are recorded.

    Parameters
    ----------
    base_directory : str
        Corpus root containing one subdirectory per year.
    adverb_list : list of str
        Adverbs to report; also determines the CSV column order.
    output_path : str, optional
        Destination CSV file. Defaults to the previously hard-coded
        ``"adverbs_over_years.csv"`` in the current working directory.

    Side effects
    ------------
    Prints a progress line per year and (over)writes ``output_path`` with a
    ``Year`` column followed by ``"<adverb> Raw Frequency"`` and
    ``"<adverb> Normalized Frequency"`` columns.
    """
    results = defaultdict(dict)

    # Collect data for each year
    for year in sorted(os.listdir(base_directory)):
        year_dir = os.path.join(base_directory, year)
        if not os.path.isdir(year_dir):
            continue
        print(f"Processing {year}...")
        word_count, adverb_counts = process_directory(year_dir, adverb_list)
        for adverb in adverb_list:
            raw_freq = adverb_counts[adverb]
            # A year folder without any words would otherwise raise
            # ZeroDivisionError; report a frequency of 0.0 for it instead.
            if word_count:
                norm_freq = (raw_freq / word_count) * 1_000_000
            else:
                norm_freq = 0.0
            results[year][f"{adverb} Raw Frequency"] = raw_freq
            results[year][f"{adverb} Normalized Frequency"] = norm_freq

    fieldnames = ['Year'] + [
        f"{adverb} {metric}"
        for adverb in adverb_list
        for metric in ["Raw Frequency", "Normalized Frequency"]
    ]

    # Write to a single CSV file
    with open(output_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for year in sorted(results):
            row = {'Year': year}
            row.update(results[year])
            writer.writerow(row)




# Run the analysis for the attitudinal adverbs over the whole corpus
main(base_directory=r"D:\Judicial_corpus", adverb_list=adverbs)


Processing 2006...
Processing 2007...
Processing 2008...
Processing 2009...
Processing 2010...
Processing 2011...
Processing 2012...
Processing 2013...
Processing 2014...
Processing 2015...
Processing 2016...
Processing 2017...
Processing 2018...
Processing 2019...
Processing 2020...
Processing 2021...


Search for stance adverbs of emphasis and calculate their raw and normalized (per million words) frequencies

In [11]:
# Stance adverbs of emphasis to look for (lower-case, matched against lower-cased words)
adverbs = [
    "actually",
    "certainly",
    "clearly",
    "especially",
    "fully",
    "highly",
    "indeed",
    "merely",
    "particularly",
    "plainly",
    "precisely",
    "readily",
    "simply",
    "surely",
]

def count_words_and_adverbs(text, adverb_list):
    """Return the word total of ``text`` and a tally of the target adverbs.

    NOTE: this cell redefines (and therefore shadows) the function of the
    same name defined in the earlier attitudinal-adverb cell.

    Parameters
    ----------
    text : str
        Text to analyse; it is split on whitespace with ``text.split()``.
    adverb_list : iterable of str
        Adverbs to look for. Words are lower-cased before comparison, so the
        entries need to be lower-case to match.

    Returns
    -------
    tuple of (int, collections.Counter)
        Number of whitespace-separated words, and a Counter keyed by the
        adverbs that were found.
    """
    def _trim_edges(token):
        # Remove non-alphanumeric characters from both ends so attached
        # punctuation ("Indeed," / "simply." / "(clearly)") does not prevent
        # a match.
        start, end = 0, len(token)
        while start < end and not token[start].isalnum():
            start += 1
        while end > start and not token[end - 1].isalnum():
            end -= 1
        return token[start:end]

    # Set membership keeps the per-word lookup constant-time.
    targets = set(adverb_list)
    words = text.split()

    adverb_count = Counter()
    for word in words:
        candidate = _trim_edges(word).lower()
        if candidate in targets:
            adverb_count[candidate] += 1

    # The word total remains the plain whitespace count; only the matching
    # ignores punctuation.
    return len(words), adverb_count

def process_directory(directory, adverb_list):
    """Sum word and adverb counts across all ``.txt`` files in ``directory``.

    Entries whose names do not end with ``.txt`` are skipped. Each remaining
    file is read as UTF-8 and handed to ``count_words_and_adverbs``.

    Returns
    -------
    tuple of (int, collections.Counter)
        Folder-wide word total and combined adverb tally.
    """
    running_words, running_hits = 0, Counter()

    for entry in os.listdir(directory):
        if not entry.endswith(".txt"):
            continue
        with open(os.path.join(directory, entry), 'r', encoding='utf-8') as fh:
            file_words, file_hits = count_words_and_adverbs(fh.read(), adverb_list)
        running_words += file_words
        running_hits.update(file_hits)

    return running_words, running_hits

def main(base_directory, adverb_list, output_path="emphasis_adverbs_over_years.csv"):
    """Compute yearly adverb frequencies and write them to one CSV file.

    NOTE: this cell redefines (and therefore shadows) the ``main`` defined
    in the earlier cells.

    Each subdirectory of ``base_directory`` is treated as one year and
    processed with ``process_directory``. For every adverb the raw count and
    the count per million words are recorded.

    Parameters
    ----------
    base_directory : str
        Corpus root containing one subdirectory per year.
    adverb_list : list of str
        Adverbs to report; also determines the CSV column order.
    output_path : str, optional
        Destination CSV file. Defaults to the previously hard-coded
        ``"emphasis_adverbs_over_years.csv"`` in the current working
        directory.

    Side effects
    ------------
    Prints a progress line per year and (over)writes ``output_path`` with a
    ``Year`` column followed by ``"<adverb> Raw Frequency"`` and
    ``"<adverb> Normalized Frequency"`` columns.
    """
    results = defaultdict(dict)

    # Collect data for each year
    for year in sorted(os.listdir(base_directory)):
        year_dir = os.path.join(base_directory, year)
        if not os.path.isdir(year_dir):
            continue
        print(f"Processing {year}...")
        word_count, adverb_counts = process_directory(year_dir, adverb_list)
        for adverb in adverb_list:
            raw_freq = adverb_counts[adverb]
            # A year folder without any words would otherwise raise
            # ZeroDivisionError; report a frequency of 0.0 for it instead.
            if word_count:
                norm_freq = (raw_freq / word_count) * 1_000_000
            else:
                norm_freq = 0.0
            results[year][f"{adverb} Raw Frequency"] = raw_freq
            results[year][f"{adverb} Normalized Frequency"] = norm_freq

    fieldnames = ['Year'] + [
        f"{adverb} {metric}"
        for adverb in adverb_list
        for metric in ["Raw Frequency", "Normalized Frequency"]
    ]

    # Write to a single CSV file
    with open(output_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for year in sorted(results):
            row = {'Year': year}
            row.update(results[year])
            writer.writerow(row)




# Run the analysis for the emphasis adverbs over the whole corpus
main(
    base_directory=r"D:\Judicial_corpus",
    adverb_list=adverbs,
)


Processing 2006...
Processing 2007...
Processing 2008...
Processing 2009...
Processing 2010...
Processing 2011...
Processing 2012...
Processing 2013...
Processing 2014...
Processing 2015...
Processing 2016...
Processing 2017...
Processing 2018...
Processing 2019...
Processing 2020...
Processing 2021...
