In [1]:
# Words the spell checker must leave untouched (names, legal/finance jargon).
# The correction loop tests membership with the lower-cased token, so every
# entry is normalised to lower case here; a mixed-case entry such as
# "Facteau" would otherwise never match and the word would still be "fixed".
omit_words = {
    word.lower()
    for word in (
        "passu", "annum", "exploramed", "issuances", "Facteau",
        "recapitalizations", "TECHNOLOGEES", "Laramore", "unvested", "pharma",
    )
}
# Frequency dictionary that is loaded into SymSpell further down.
dictionary_path = r"D:\vc-research\vc-research\frequency_dictionary_en_82_765.txt"
import os
import re
from symspellpy.symspellpy import SymSpell, Verbosity

# Initialize SymSpell
max_edit_distance = 2  # also passed to lookup() as the per-word search limit
prefix_length = 7
sym_spell = SymSpell(max_edit_distance, prefix_length)

# Load dictionary: column 0 of each line is the term, column 1 its count.
term_index = 0
count_index = 1
# load_dictionary() signals a missing file by returning False instead of
# raising. Without this check the run would continue with an empty
# dictionary and silently copy every file with zero corrections.
if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
    raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")

# Paths
input_folder = r"D:\vc-research\vc-research\Batch56_text_readable"
output_folder = r"D:\vc-research\vc-research\batch56_spellcheck"
os.makedirs(output_folder, exist_ok=True)

# Logs
correction_counts = []  # one "<filename>: <count>" entry per processed file
corrections_list = []   # one "<filename> | <old> → <new>" entry per replacement

# Words to skip: see `omit_words`, defined at the top of this cell. It is
# matched against the lower-cased token in the processing loop.

# Token pattern: words, punctuation, spaces, tabs, line breaks.
# Every character of the input lands in exactly one token, so joining the
# tokens back together reproduces the original layout.
token_pattern = re.compile(r"(\w+|\s+|[^\w\s])", re.UNICODE)

# Case-insensitive view of the skip list. Tokens are compared in lower case,
# so an entry written with capitals (e.g. "Facteau") has to be lowered as
# well, otherwise it can never match.
skip_words = {word.lower() for word in omit_words}

# Process each file (sorted so both log files come out in a stable order)
for filename in sorted(os.listdir(input_folder)):
    if not filename.lower().endswith(".txt"):
        continue

    input_path = os.path.join(input_folder, filename)
    output_path = os.path.join(output_folder, f"checked_{filename}")

    with open(input_path, "r", encoding="utf-8") as infile:
        text = infile.read()

    tokens = token_pattern.findall(text)
    corrected_tokens = []
    correction_count = 0

    for token in tokens:
        # Whitespace, punctuation, digits and alphanumeric mixes are copied
        # through untouched (isalpha() is False for all of them).
        if not token.isalpha():
            corrected_tokens.append(token)
            continue

        token_lower = token.lower()

        # Skip correction for short words, omit list, or ALL CAPS
        if len(token) <= 4 or token_lower in skip_words or token.isupper():
            corrected_tokens.append(token)
            continue

        # Verbosity.CLOSEST yields the suggestions with the smallest edit
        # distance; the first entry is taken as the replacement.
        suggestions = sym_spell.lookup(token_lower, Verbosity.CLOSEST, max_edit_distance)
        if not suggestions or suggestions[0].term == token_lower:
            # Unknown word, or already spelled correctly.
            corrected_tokens.append(token)
            continue

        corrected = suggestions[0].term
        correction_count += 1

        # Restore case. ALL-CAPS tokens never reach this point, so only
        # Title Case needs handling; anything else keeps the lower-case
        # suggestion. capitalize() upper-cases just the first letter, whereas
        # title() would also capitalise after an apostrophe or hyphen.
        if token.istitle():
            corrected = corrected.capitalize()

        corrections_list.append(f"{filename} | {token} → {corrected}")
        corrected_tokens.append(corrected)

    # Save corrected file
    with open(output_path, "w", encoding="utf-8") as outfile:
        outfile.write("".join(corrected_tokens))

    correction_counts.append(f"{filename}: {correction_count}")

# Write the two log files into the output folder, one entry per line:
# first the per-file correction totals, then the individual replacements.
for log_name, log_lines in (
    ("correctioncount.txt", correction_counts),
    ("correctionslist.txt", corrections_list),
):
    log_path = os.path.join(output_folder, log_name)
    with open(log_path, "w", encoding="utf-8") as log_file:
        log_file.write("\n".join(log_lines))

print("Spellcheck complete. Outputs saved.")


Spellcheck complete. Outputs saved.


In [2]:
import os
import tiktoken
from tqdm import tqdm
import pandas as pd

# Folder containing your .txt files
input_dir = r"D:\vc-research\vc-research\Batch56_text_readable"

# Model whose tokenizer is used for counting.
# NOTE(review): the cost estimate later in this cell is labelled with GPT-4o
# rates, and tiktoken may resolve "gpt-4" and "gpt-4o" to different
# encodings. Confirm the model here matches the one that will be billed.
MODEL_NAME = "gpt-4"  # or "gpt-4o" or "gpt-3.5-turbo"

# Load tokenizer for selected model
encoding = tiktoken.encoding_for_model(MODEL_NAME)

# Results container: one {"filename", "tokens"} record per file
token_data = []

# Loop over .txt files and count tokens
for fname in tqdm(os.listdir(input_dir)):
    # Case-insensitive extension match, the same rule the spellcheck cell
    # uses, so files named "*.TXT" are not silently left out.
    if not fname.lower().endswith(".txt"):
        continue

    path = os.path.join(input_dir, fname)
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    token_count = len(encoding.encode(text))
    token_data.append({"filename": fname, "tokens": token_count})

# Convert to DataFrame. The columns are named explicitly so that an empty
# folder still produces a frame with "filename" and "tokens" columns instead
# of a column-less one that breaks the describe()/cost steps below.
df_tokens = pd.DataFrame(token_data, columns=["filename", "tokens"])

# Summary stats
print(df_tokens.describe())

# Pricing in USD per 1,000 tokens (labelled by the author as GPT-4o,
# July 2025). NOTE(review): confirm against the current pricing page.
input_price_per_1k = 0.005   # $5 per million input tokens
output_price_per_1k = 0.015  # $15 per million output tokens

# Assume every request produces a 500-token response.
df_tokens["estimated_output_tokens"] = 500

# Bill prompt and completion tokens at their own rates. The previous formula
# multiplied the *sum* of both token counts by the *sum* of both prices,
# which charged every token at the input and the output rate at once.
df_tokens["cost_usd"] = (
    df_tokens["tokens"] / 1000 * input_price_per_1k
    + df_tokens["estimated_output_tokens"] / 1000 * output_price_per_1k
)

# Save the per-file token counts together with the cost estimate
df_tokens.to_csv("token_estimates_with_cost.csv", index=False)



100%|██████████| 15485/15485 [01:35<00:00, 162.25it/s]


              tokens
count   15485.000000
mean     9555.540652
std      7626.149323
min        22.000000
25%       864.000000
50%     11280.000000
75%     15536.000000
max    110403.000000
