In [9]:
import os
import re
import pandas as pd
from datetime import datetime

# Path to the folder containing text files
txt_folder_path = "/Users/alexchen/Downloads/Projects/vc-research/Batch1_text_readable"

# List of known certificate types
# Order matters: extract_certificate_type walks this list in order when
# matching against a document's text.
# NOTE: some entries are substrings of others (e.g. "Certificate of
# Incorporation" is contained in "Restated Certificate of Incorporation" and
# "Amended and Restated Certificate of Incorporation"; "Articles of
# Incorporation" is contained in "Amended and Restated Articles of
# Incorporation"), so a plain substring test can hit several entries at once.
certificate_types = [
    "Certificate of Incorporation",
    "Restated Certificate of Incorporation",
    "Certificate of Amendment",
    "Certificate of Merger",
    "Certificate of Conversion",
    "Certificate of Cancellation",
    "Amended and Restated Certificate of Incorporation",
    "Articles of Incorporation",
    "Amended and Restated Articles of Incorporation",
    "Certificate of Correction",
]

# Enhanced company name extraction function

# Corporate-form endings that terminate a company name.
_COMPANY_SUFFIX = r"(?:INC\.|CORP\.|LLC|LTD\.|CO\.|GMBH|S\.A\.)"

# Name body followed by its suffix:
#   * the body is non-greedy, so the capture stops at the FIRST suffix instead
#     of running on to the last suffix-like token on the same line;
#   * \b makes the suffix start on a word boundary, so "co." at the end of
#     "Francisco." is not mistaken for "CO.";
#   * directly chained suffixes ("Smith & Co., Inc.") are kept together;
#   * the trailing lookahead rejects a suffix glued to more letters/digits.
_COMPANY_NAME = (
    r"([A-Z0-9\- &,.]+?\b" + _COMPANY_SUFFIX
    + r"(?:,? +" + _COMPANY_SUFFIX + r")*)(?![A-Z0-9])"
)

# Phrases that introduce the name, in priority order.
# "the name of the corporation is" needs no entry of its own: it always
# contains "the corporation is", which is tried first.
_COMPANY_LEAD_INS = (
    r"the corporation is",
    r"the company is",
    r"this company is",
    r"this corporation is",
    r"CERTIFICATE OF [A-Z ]+ OF",
)

_COMPANY_PATTERNS = tuple(
    re.compile(lead_in + r"\s+" + _COMPANY_NAME, re.IGNORECASE)
    for lead_in in _COMPANY_LEAD_INS
)


def extract_company_name(text):
    """
    Extract the company name that follows a known lead-in phrase.

    The lead-in phrases are tried in priority order; for each one the first
    occurrence in ``text`` is used. Matching is case-insensitive. A name is a
    run of letters, digits, spaces and ``- & , .`` that ends in one of the
    recognised corporate suffixes (INC., CORP., LLC, LTD., CO., GMBH, S.A.).

    Args:
        text: Full text of one document.

    Returns:
        The matched name with surrounding whitespace stripped, or the string
        "Unknown" when no lead-in phrase is followed by a recognisable name.
    """
    for pattern in _COMPANY_PATTERNS:
        match = pattern.search(text)
        if match:
            return match.group(1).strip()

    return "Unknown"

# Function to extract date
def extract_date(text):
    """
    Return the first parseable "Month D, YYYY" date found in ``text``.

    Every "<word> <1-2 digits>, <4 digits>" occurrence is a candidate, taken
    in document order. Candidates that are not dates (e.g. "Suite 12, 1000")
    are skipped rather than ending the search. Both full ("January") and
    abbreviated ("Jan") month names are accepted; month-name parsing is done
    by ``datetime.strptime`` and therefore follows the current locale.

    Args:
        text: Full text of one document.

    Returns:
        A ``datetime.date`` for the first candidate that parses, or ``None``
        when no candidate is a valid date.
    """
    # Full month name is tried first so results that already parsed before
    # are unchanged; the abbreviated form is only a fallback.
    date_formats = ("%B %d, %Y", "%b %d, %Y")

    for candidate in re.finditer(r"(\w+ \d{1,2}, \d{4})", text):  # Example: "January 1, 2020"
        for date_format in date_formats:
            try:
                return datetime.strptime(candidate.group(1), date_format).date()
            except ValueError:
                # Not a date in this format; try the next format/candidate.
                continue
    return None

# Function to extract certificate type as a single string
def extract_certificate_type(text, known_types=None):
    """
    Return the certificate type named in ``text``.

    The known types are checked in list order with a case-sensitive substring
    test, and the first one found sets the result. If a longer known type that
    contains that first hit also occurs in the text (e.g. "Amended and
    Restated Certificate of Incorporation" vs. "Certificate of
    Incorporation"), the longest such type is returned instead, so a specific
    type is not shadowed by the shorter name embedded in it.

    Args:
        text: Full text of one document.
        known_types: Optional sequence of type names to look for, in priority
            order. Defaults to the module-level ``certificate_types`` list.

    Returns:
        The matching type name, or the string "Unknown" if none occurs.
    """
    candidates = certificate_types if known_types is None else known_types

    found = [cert for cert in candidates if cert in text]
    if not found:
        return "Unknown"

    # List order still decides between unrelated types; specificity only
    # upgrades the first hit to a longer name that contains it.
    first_hit = found[0]
    return max((cert for cert in found if first_hit in cert), key=len)

# Accumulates one (company name, date, certificate type) tuple per text file
data = []

# Walk the folder, skipping anything that is not a .txt file
for entry in os.listdir(txt_folder_path):
    if not entry.endswith(".txt"):
        continue

    with open(os.path.join(txt_folder_path, entry), "r", encoding="utf-8") as handle:
        contents = handle.read()

        # Run the three extractors in the same order as the output columns
        data.append(
            (
                extract_company_name(contents),
                extract_date(contents),
                extract_certificate_type(contents),
            )
        )

# Build the table, then order rows by company and, within a company, by date
column_names = ["Company Name", "Date", "Certificate Type"]
df = pd.DataFrame(data, columns=column_names).sort_values(by=["Company Name", "Date"])

# Last expression of the cell: shows the final DataFrame
df


Unnamed: 0,Company Name,Date,Certificate Type
82,"3-D Marketing Technologies, Inc.",2003-07-03,Certificate of Incorporation
41,"3POINTS, Inc.",,Certificate of Incorporation
23,"3point5, Inc.",2004-12-01,Certificate of Incorporation
53,"3point5, Inc.",2004-12-01,Certificate of Incorporation
76,"3point5, Inc.",2004-12-01,Certificate of Incorporation
...,...,...,...
66,Unknown,,Certificate of Incorporation
71,Unknown,,Articles of Incorporation
75,Unknown,,Articles of Incorporation
79,Unknown,,Articles of Incorporation
