# Time Series Pipeline

## Basic Information Extraction:
- Goal: Automate excel file creation

In [13]:
# import packages
import os
import re
import spacy
import pandas as pd
from datetime import datetime

In [14]:
# Load SpaCy's English model
nlp = spacy.load("en_core_web_sm")

# Define directory containing text files
txt_folder_path = "/Users/alexchen/Downloads/Projects/vc-research/Batch1_text_readable"

# Regex pattern for extracting company names ending with "Inc."
company_name_regex = r"\b[A-Z][A-Za-z0-9&\-,\s]+Inc\.\b"

# Regex patterns for extracting dates
date_patterns = [
    r"\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b",  # August 10, 2020
    r"\b\d{1,2}/\d{1,2}/\d{4}\b",  # 08/10/2020 or 8/10/2020
    r"\b\d{1,2}-\d{1,2}-\d{4}\b",  # 08-10-2020
    r"\b\d{4}-\d{2}-\d{2}\b"  # 2020-08-10
]

# Context phrases that usually introduce dates
date_context_phrases = ["Filed on", "Dated", "Effective as of", "Executed on", "Signed this"]

In [15]:
# Extraction functions
def extract_company_name(text):
    """Extracts the company name with the suffix 'Inc.' only."""
    # First, use SpaCy's NER to identify organization entities
    doc = nlp(text)
    company_name = None
    for ent in doc.ents:
        if ent.label_ == "ORG":
            # Check if the entity ends with "Inc."
            name = ent.text.strip()
            if name.endswith("Inc."):
                company_name = name
                break

    # If SpaCy doesn't find a valid match, fallback to regex extraction
    if not company_name:
        matches = re.findall(company_name_regex, text)
        if matches:
            # Ensure we capture only "Inc." ending names
            company_name = matches[0]

    # If still no valid match, return "N/A"
    if not company_name:
        return "N/A"

    # Ensure company name is cleaned up and not mistakenly capturing non-company text
    company_name = company_name.strip()
    if "the corporation" in company_name.lower() or "this corporation" in company_name.lower():
        return "N/A"  # Filter out irrelevant entries
    
    return company_name

def extract_date(text):
    """Extracts the document date and ensures proper datetime formatting."""
    for phrase in date_context_phrases:
        match = re.search(rf"{phrase} (.*?)(?=[\n,])", text, re.IGNORECASE)
        if match:
            extracted_date = match.group(1).strip()
            for pattern in date_patterns:
                date_match = re.search(pattern, extracted_date)
                if date_match:
                    return format_date(date_match.group())

    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return format_date(match.group())

    return pd.NaT  # Handle missing values properly

def format_date(date_str):
    """Converts date string into standardized datetime format."""
    date_formats = [
        "%B %d, %Y",  # August 10, 2020
        "%m/%d/%Y",   # 08/10/2020
        "%d-%m-%Y",   # 10-08-2020
        "%Y-%m-%d"    # 2020-08-10
    ]
    
    for fmt in date_formats:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue

    return pd.NaT  # If no format matches, return NaT

def extract_certificate_type(text):
    """Identifies the certificate type from the full list of known types, prioritizing longer types."""
    # Expanded list of certificate types
    certificate_types = [
        "Certificate of Incorporation",
        "Restated Certificate of Incorporation",
        "Certificate of Amendment",
        "Certificate of Merger",
        "Certificate of Conversion",
        "Certificate of Cancellation",
        "Amended and Restated Certificate of Incorporation",
        "Articles of Incorporation",
        "Amended and Restated Articles of Incorporation",
        "Certificate of Correction"
    ]
    
    # Sort the certificate types in descending order of length (longer types first)
    certificate_types_sorted = sorted(certificate_types, key=len, reverse=True)
    
    # Loop through all certificate types (longest first) and check if they appear in the text
    for cert in certificate_types_sorted:
        if re.search(rf"\b{re.escape(cert)}\b", text, re.IGNORECASE):
            return cert

    return "N/A"

In [16]:
# Processing functions
def process_text_file(file_path):
    """Reads text file and extracts required details."""
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        company_name = extract_company_name(text)
        doc_date = extract_date(text)
        cert_type = extract_certificate_type(text)
        file_name = os.path.basename(file_path)

        # Debugging logs
        print(f"✅ Processed: {file_name}")
        print(f"   ➤ Company Name: {repr(company_name)}")
        print(f"   ➤ Date: {doc_date.strftime('%Y-%m-%d') if pd.notna(doc_date) else 'N/A'}")
        print(f"   ➤ Document Type: {repr(cert_type)}\n")

        return {
            "Company Name": company_name,
            "Date": doc_date,
            "File Name": file_name,
            "Document Type": cert_type
        }

    except Exception as e:
        print(f"⚠️ Error processing {file_path}: {e}")
        return None

def process_text_files_in_directory(directory_path):
    """Processes all text files sequentially for reliability."""
    file_paths = [os.path.join(directory_path, f) for f in os.listdir(directory_path) if f.endswith(".txt")]

    all_results = []
    for file_path in file_paths:
        result = process_text_file(file_path)
        if result:
            all_results.append(result)

    # Convert to DataFrame
    extracted_df = pd.DataFrame(all_results)

    # Ensure datetime format for the Date column and replace NaT with "N/A"
    extracted_df["Date"] = pd.to_datetime(extracted_df["Date"], errors="coerce")
    extracted_df["Date"] = extracted_df["Date"].apply(lambda x: x.strftime('%Y-%m-%d') if pd.notna(x) else "N/A")

    return extracted_df

In [17]:
# Process all text files and get final DataFrame
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

final_dataframe = process_text_files_in_directory(txt_folder_path)
final_dataframe

✅ Processed: 45_2008-01-17_Certificates of Incorporation.txt
   ➤ Company Name: 'A la Mobile, Inc.'
   ➤ Date: 2005-05-17
   ➤ Document Type: 'Amended and Restated Certificate of Incorporation'

✅ Processed: 16_2015-04-22_Certificates of Incorporation.txt
   ➤ Company Name: 'N/A'
   ➤ Date: 2012-12-17
   ➤ Document Type: 'Certificate of Cancellation'

✅ Processed: 28_2009-12-17_Certificates of Incorporation.txt
   ➤ Company Name: 'Parameter, Inc.'
   ➤ Date: 2004-04-22
   ➤ Document Type: 'Amended and Restated Certificate of Incorporation'

✅ Processed: 34_2010-01-28_Certificates of Incorporation.txt
   ➤ Company Name: '4Home, Inc.'
   ➤ Date: 2006-02-08
   ➤ Document Type: 'Certificate of Incorporation'

✅ Processed: 27_2006-08-23_Certificates of Incorporation.txt
   ➤ Company Name: '3VR Security, Inc.'
   ➤ Date: 2006-08-23
   ➤ Document Type: 'Amended and Restated Articles of Incorporation'

✅ Processed: 81_2006-07-28_Certificates of Incorporation.txt
   ➤ Company Name: 'Acceleron P

Unnamed: 0,Company Name,Date,File Name,Document Type
0,"A la Mobile, Inc.",2005-05-17,45_2008-01-17_Certificates of Incorporation.txt,Amended and Restated Certificate of Incorporation
1,,2012-12-17,16_2015-04-22_Certificates of Incorporation.txt,Certificate of Cancellation
2,"Parameter, Inc.",2004-04-22,28_2009-12-17_Certificates of Incorporation.txt,Amended and Restated Certificate of Incorporation
3,"4Home, Inc.",2006-02-08,34_2010-01-28_Certificates of Incorporation.txt,Certificate of Incorporation
4,"3VR Security, Inc.",2006-08-23,27_2006-08-23_Certificates of Incorporation.txt,Amended and Restated Articles of Incorporation
5,Acceleron Pharma Inc.,2003-06-13,81_2006-07-28_Certificates of Incorporation.txt,Amended and Restated Certificate of Incorporation
6,"Networks, Inc.",2014-06-27,48_2013-12-06_Certificates of Incorporation.txt,Amended and Restated Articles of Incorporation
7,"3VR Security, Inc.",,27_2005-12-22_Certificates of Incorporation.txt,Amended and Restated Certificate of Incorporation
8,"ALO Networks, Inc.",2013-09-30,48_2013-06-27_Certificates of Incorporation.txt,Articles of Incorporation
9,,,48_2005-06-30_Certificates of Incorporation.txt,Articles of Incorporation


In [18]:
# create multi-index dataframe ordering timesreies data
df = final_dataframe.set_index(["Company Name", "Date"]).sort_index()

# sort by date ascending for each company
df = df.groupby(level=0, sort=False).apply(
    lambda x: x.sort_index(level=1)
)
df.index = df.index.droplevel(0)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,File Name,Document Type
Company Name,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
"3POINTS, Inc.",,24_2004-12-01_Certificates of Incorporation.txt,Certificate of Incorporation
"3VR Security, Inc.",2006-08-23,27_2006-08-23_Certificates of Incorporation.txt,Amended and Restated Articles of Incorporation
"3VR Security, Inc.",,27_2005-12-22_Certificates of Incorporation.txt,Amended and Restated Certificate of Incorporation
"3VR Security, Inc.",,27_2006-08-30_Certificates of Incorporation.txt,Amended and Restated Certificate of Incorporation
"3VR Security, Inc.",,27_2009-05-15_Certificates of Incorporation.txt,Amended and Restated Articles of Incorporation
"3VR Security, Inc.",,27_2013-09-26_Certificates of Incorporation.txt,Amended and Restated Articles of Incorporation
"3VR Security, Inc.",,27_2010-09-16_Certificates of Incorporation.txt,Articles of Incorporation
"3VR Security, Inc.",,27_2010-10-10_Certificates of Incorporation.txt,Amended and Restated Articles of Incorporation
"3VR Security, Inc.",,27_2008-07-31_Certificates of Incorporation.txt,Amended and Restated Articles of Incorporation
"3jam, Inc.",2006-04-03,21_2006-04-21_Certificates of Incorporation.txt,Amended and Restated Certificate of Incorporation
