In [5]:
import os
import pandas as pd
import re
from datetime import datetime
from symspellpy import SymSpell, Verbosity
import requests

# Initialize SymSpell: suggestions are limited to an edit distance of 2 and
# the index is built on 7-character prefixes.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Local path of the English frequency dictionary used by SymSpell
dictionary_path = "frequency_dictionary_en_82_765.txt"

# Download the dictionary if not already present
if not os.path.exists(dictionary_path):
    url = "https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_dictionary_en_82_765.txt"
    # Bound the wait so a stalled connection cannot hang the run, and check
    # the HTTP status *before* creating the file: otherwise an error page
    # would be saved as the dictionary and, because the file then exists,
    # never be re-downloaded.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    with open(dictionary_path, "wb") as f:
        f.write(response.content)

# Load the dictionary (column 0 holds the term, column 1 its frequency).
# load_dictionary reports a missing file by returning False rather than
# raising, so surface that instead of continuing with an empty dictionary.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(
        f"Could not load SymSpell dictionary from {dictionary_path!r}"
    )

# Folder containing TXT files (machine-specific absolute path)
folder_path = "/Users/alexchen/Downloads/Projects/vc-research/Batch1_text_readable"

# List of known certificate types; extract_info_from_text uses these labels
# to classify each document.
certificate_types = [
    "Certificate of Incorporation",
    "Restated Certificate of Incorporation",
    "Certificate of Amendment",
    "Certificate of Merger",
    "Certificate of Conversion",
    "Certificate of Cancellation",
    "Amended and Restated Certificate of Incorporation",
    "Articles of Incorporation",
    "Amended and Restated Articles of Incorporation",
    "Certificate of Correction",
]

def correct_spelling(text):
    """Return the top SymSpell compound suggestion for *text*.

    The module-level ``sym_spell`` instance is queried with a maximum edit
    distance of 2. When it yields no suggestion, *text* is returned unchanged.
    """
    candidates = sym_spell.lookup_compound(text, max_edit_distance=2)
    if not candidates:
        return text
    best = candidates[0]
    return best.term

def preprocess_text(text):
    """Normalize *text* for matching.

    Lowercases the input, collapses every run of whitespace into a single
    space, and passes the result through ``correct_spelling``.
    """
    lowered = text.lower()
    single_spaced = re.sub(r"\s+", " ", lowered)
    return correct_spelling(single_spaced)

# Candidate dates: "<word> <day>, <year>" or "<m>/<d>/<yyyy>".
_DATE_PATTERN = re.compile(r"(\w+\s\d{1,2},\s\d{4}|\d{1,2}/\d{1,2}/\d{4})")
# Layouts tried, in order, against every candidate found in the text.
_DATE_FORMATS = ("%B %d, %Y", "%b %d, %Y", "%m/%d/%Y")
# Returned when no candidate parses (same value the "01/01/1900" default gave).
_PLACEHOLDER_DATE = datetime(1900, 1, 1)
# "company" label followed by the name, captured up to the end of the line.
_COMPANY_PATTERN = re.compile(r"company[:\s]+(.+)", re.IGNORECASE)


def _parse_first_date(text):
    """Return the first candidate in *text* that parses as a date, else the placeholder."""
    for candidate in _DATE_PATTERN.finditer(text):
        for date_format in _DATE_FORMATS:
            try:
                return datetime.strptime(candidate.group(0), date_format)
            except ValueError:
                # e.g. "dated 5, 2020" or "13/45/2020": try the next layout,
                # then the next candidate, instead of crashing the batch.
                continue
    return _PLACEHOLDER_DATE


def _detect_certificate(*texts):
    """Return the known certificate type whose full title appears in one of *texts*.

    Texts are tried in order. Within a text the title that starts earliest
    wins, and the longest title breaks ties, so "amended and restated
    certificate of incorporation" is not reported as the shorter
    "certificate of incorporation" it contains.
    """
    for haystack in texts:
        best_key = None
        best_type = None
        for cert_type in certificate_types:
            position = haystack.find(cert_type.lower())
            if position == -1:
                continue
            key = (position, -len(cert_type))
            if best_key is None or key < best_key:
                best_key, best_type = key, cert_type
        if best_type is not None:
            return best_type
    return "Unknown Certificate"


def extract_info_from_text(file_path):
    """Extract company name, date, and certificate type from a text file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(company_name, date_obj, detected_certificate)`` tuple:

        * ``company_name`` - lowercased text following a "company" label on
          the same line, or the file name without its extension when no such
          label is found.
        * ``date_obj`` - ``datetime`` of the first parseable date in the
          text, or ``datetime(1900, 1, 1)`` as a placeholder.
        * ``detected_certificate`` - an entry of ``certificate_types``, or
          ``"Unknown Certificate"``.
    """
    # Undecodable bytes are replaced rather than aborting the whole run.
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        text = f.read()

    # Lowercased, single-spaced copy that has NOT been spell-corrected.
    normalized_text = re.sub(r"\s+", " ", text.lower())
    # Spell-corrected copy, used only for certificate-title matching.
    cleaned_text = preprocess_text(text)

    # Extract company name from the raw text so the capture stops at the end
    # of the line; on whitespace-collapsed text ".+" would swallow the rest
    # of the document.
    company_name = ""
    company_match = _COMPANY_PATTERN.search(text)
    if company_match:
        company_name = re.sub(r"\s+", " ", company_match.group(1)).strip().lower()
    if not company_name:
        # splitext keeps dotted names intact ("acme.inc.txt" -> "acme.inc").
        company_name = os.path.splitext(os.path.basename(file_path))[0]

    # Extract date from the uncorrected text: SymSpell's compound lookup
    # re-tokenizes its input and drops punctuation, which would remove the
    # comma and slashes the date pattern relies on.
    date_obj = _parse_first_date(normalized_text)

    # Identify certificate type by full-title match, preferring the
    # spell-corrected text and falling back to the uncorrected one.
    detected_certificate = _detect_certificate(cleaned_text, normalized_text)

    return company_name, date_obj, detected_certificate

# Walk the input folder and build one record per .txt file
data = []
for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    if not file_name.endswith(".txt"):
        continue  # skip anything that is not a text file
    company, date, cert_type = extract_info_from_text(file_path)
    data.append(
        {
            "Company": company,
            "Date": date,
            "Certificate Type": cert_type,
        }
    )

# Tabulate the records; "Value" marks that a certificate is present
df = pd.DataFrame(data)
df["Value"] = 1

# One row per (Company, Date), one column per certificate type, 0 where a
# type is absent; rows are then ordered by company and date, both ascending
df_pivot = df.pivot_table(
    values="Value",
    index=["Company", "Date"],
    columns="Certificate Type",
    fill_value=0,
).sort_values(by=["Company", "Date"], ascending=[True, True])

# Display the structured DataFrame


ModuleNotFoundError: No module named 'symspellpy'