In [42]:
# CSV passed as `file1` to compare_fields below; its columns are reported as the
# "model-returned" values (presumably the AI-coded extraction output — confirm).
createdfile = r"D:\vc-research\vc-research\Reese's contributions\reese data\downloaded_batches\combined_extracted_dataAICoded.csv"
# Inactive alternative inputs, kept for reference:
#createdfile = r"D:\vc-research\reese data\batches\batch2\downloads\combined_extracted_datafuck.csv"
#priorfile = r"D:\vc-research\VC Research (Batch 1) - Batch 1 Main.csv"

# CSV passed as `file2` to compare_fields below; its columns are reported as the
# "human coded" values.
priorfile = r"D:\vc-research\VC Research (Batch 2) - Batch 2 Main.csv"


In [43]:
import pandas as pd
import re
import numpy as np

def clean_custom_id(value):
    """
    Normalise an ID string so it can serve as a merge key.

    Removes a leading 'checked_' prefix and a trailing underscore followed by
    exactly eight lowercase hexadecimal characters. Anything that is not a
    string is handed back unchanged.
    """
    if not isinstance(value, str):
        return value

    without_prefix = re.sub(r'^checked_', '', value)
    return re.sub(r'_[a-f0-9]{8}$', '', without_prefix)

def strip_commas(val, na_dict=None):
    """
    Normalise a raw cell value for comparison.

    Missing values (per `pd.isna`) are returned unchanged. Otherwise the value
    is stringified, commas are removed and surrounding whitespace is trimmed.
    If the result is a key of `na_dict`, the mapped replacement is returned
    (e.g., 0.0, np.nan, etc.). Failing that, the result is converted to float
    when possible; if not, the cleaned string itself is returned.
    """
    if pd.isna(val):
        return val

    cleaned = str(val).replace(",", "").strip()

    # An explicit sentinel mapping takes priority over numeric parsing
    if na_dict and cleaned in na_dict:
        return na_dict[cleaned]

    try:
        number = float(cleaned)
    except ValueError:
        return cleaned
    else:
        return number


def compare_fields(file1, file2, field1, field2,
                   id_field1, id_field2,
                   file1_is_df=False, file2_is_df=False,
                   clean_key_field_in=None,
                   na_dict=None):
    """
    Compare one field from each of two datasets, matching rows on an ID column.

    Both comparison fields are normalised with `strip_commas`, the datasets are
    inner-joined on their ID columns, and a boolean `match` column records
    whether the cleaned values agree (two missing values count as a match).
    Prints the merged column names and the True/False counts of `match`.

    DataFrames passed in are copied before cleaning, so the caller's objects
    are never modified and repeated calls on the same frame give the same result.

    Parameters:
    - file1, file2: CSV paths, or DataFrames when the matching `*_is_df` flag is True
    - field1, field2: column to compare in file1 / file2
    - id_field1, id_field2: ID column in file1 / file2 used as the merge key
    - file1_is_df, file2_is_df: treat file1 / file2 as an already-loaded DataFrame
    - clean_key_field_in: 'file1' or 'file2' to run `clean_custom_id` on that
      dataset's ID column before merging; any other value cleans neither
    - na_dict: optional mapping forwarded to `strip_commas`

    Returns:
    - DataFrame with columns '__merge_id__', the two compared columns (carrying
      a '_file1' / '_file2' suffix when the merge had to disambiguate them)
      and 'match'

    Raises:
    - KeyError: if a compared column cannot be located in the merged frame
    """

    # Load files; copy caller-supplied frames so the cleaning below cannot
    # leak back into them
    df1 = file1.copy() if file1_is_df else pd.read_csv(file1)
    df2 = file2.copy() if file2_is_df else pd.read_csv(file2)

    # Optional ID cleaning
    if clean_key_field_in == 'file1':
        df1[id_field1] = df1[id_field1].apply(clean_custom_id)
    elif clean_key_field_in == 'file2':
        df2[id_field2] = df2[id_field2].apply(clean_custom_id)

    # Apply cleaning to comparison fields BEFORE merge
    df1[field1] = df1[field1].apply(lambda x: strip_commas(x, na_dict=na_dict))
    df2[field2] = df2[field2].apply(lambda x: strip_commas(x, na_dict=na_dict))

    # Rename ID columns to a shared name for merging
    df1 = df1.rename(columns={id_field1: '__merge_id__'})
    df2 = df2.rename(columns={id_field2: '__merge_id__'})

    # Merge
    merged = pd.merge(df1, df2, on='__merge_id__', suffixes=('_file1', '_file2'), how='inner')
    print("Merged columns:", merged.columns.tolist())

    # A compared column only gains a suffix when the same name exists in both inputs
    f1_col = f"{field1}_file1" if f"{field1}_file1" in merged.columns else field1
    f2_col = f"{field2}_file2" if f"{field2}_file2" in merged.columns else field2

    if f1_col not in merged.columns:
        raise KeyError(f"'{f1_col}' not found in merged dataframe.")
    if f2_col not in merged.columns:
        raise KeyError(f"'{f2_col}' not found in merged dataframe.")

    # Compare cleaned values directly; NaN != NaN, so treat "both missing" as a match
    merged['match'] = (
        (merged[f1_col] == merged[f2_col]) |
        (pd.isna(merged[f1_col]) & pd.isna(merged[f2_col]))
    )

    # Count number of True/False in match column
    print(merged['match'].value_counts())

    return merged[['__merge_id__', f1_col, f2_col, 'match']]




import pandas as pd

def report_mismatches(df,  id_field, model_field, human_field, match_field='match',text=""):
    """
    Prints a statement for each row where match_field == False.

    Parameters:
    - df: pandas DataFrame
    - id_field: column name containing the unique ID
    - model_field: column name for model-returned value
    - human_field: column name for human-coded value
    - match_field: column name for match indicator (default = 'match')
    - text: label printed after the row index in every message (default = "")

    Returns:
    - None; the report is written to stdout only.
    """
    mismatches = df[df[match_field] == False]  # Filter mismatches (elementwise comparison)

    for idx, row in mismatches.iterrows():
        # IDs are not guaranteed to be strings (numeric keys, NaN), so stringify
        # before truncating to the first 14 characters
        short_id = str(row[id_field])[:14]
        print(f'[{idx}] {text}: "{short_id}" returned "{row[model_field]}", '
              f'but a human coded it as "{row[human_field]}"')





In [44]:

# Load both inputs once, only to list the columns available for comparison
df1 = pd.read_csv(createdfile)
df2 = pd.read_csv(priorfile)

print("Created file columns:", df1.columns.tolist())
print("Prior file columns:", df2.columns.tolist())

# Common stock: value from createdfile vs. value from priorfile
df_comparison = compare_fields(
    file1=createdfile,
    file2=priorfile,
    field1="authorized_common_stock",            # column in createdfile
    field2="Number of Common Stocks Issued",     # column in priorfile
    id_field1="custom_id",                       # ID column in createdfile
    id_field2="File Name",                       # ID column in priorfile
    clean_key_field_in="file1",                  # clean `custom_id` before merging
    na_dict={
        "Uncertain": np.nan,
        "N/A": np.nan,
        "": np.nan,
        "MISSING": np.nan,
    },
)

print(df_comparison.head(10).to_string())
report_mismatches(
    df_comparison,
    id_field='__merge_id__',
    model_field='authorized_common_stock',
    human_field="Number of Common Stocks Issued",
    text="Common Stock Issued",
)



Created file columns: ['custom_id', 'company_name', 'document_title', 'inc_document', 'filing_date', 'state_of_incorporation', 'authorized_common_stock', 'authorized_common_stock_d', 'authorized_preferred_stock', 'authorized_preferred_stock_d', 'preferred_stock_series', 'preferred_stock_series_d', 'total_shares_authorized', 'total_shares_authorized_d', 'multiple_common_stock_classes', 'common_voting_rights', 'common_issued_shares', 'preferred_stock_terms_Series A', 'liq_pref_flag', 'liquidation_priority_order', 'liquidation_priority_order_d', 'conversion_rights', 'conversion_rights_d', 'automatic_conversion_triggers', 'automatic_conversion_triggers_d', 'merger_indicator', 'merger_indicator_d', 'Name_change_flag', 'notable_features_summary', 'preferred_stock_terms', 'preferred_stock_terms_Series B', 'preferred_stock_terms_Series A1', 'preferred_stock_terms_Series B2', 'preferred_stock_terms_Series C', 'preferred_stock_terms_Series A-1', 'preferred_stock_terms_Series A-2', 'preferred_sto

In [45]:

# Preferred stock: value from createdfile vs. value from priorfile
df_comparison = compare_fields(
    file1=createdfile,
    file2=priorfile,
    field1="authorized_preferred_stock",         # column in createdfile
    field2="Number of Preferred Stock Issued",   # column in priorfile
    id_field1="custom_id",                       # ID column in createdfile
    id_field2="File Name",                       # ID column in priorfile
    clean_key_field_in="file1",                  # clean `custom_id` before merging
    na_dict={
        "Uncertain": np.nan,
        "N/A": np.nan,
        "": np.nan,
        "MISSING": np.nan,
    },
)

report_mismatches(
    df_comparison,
    id_field='__merge_id__',
    model_field='authorized_preferred_stock',
    human_field="Number of Preferred Stock Issued",
)



Merged columns: ['__merge_id__', 'company_name', 'document_title', 'inc_document', 'filing_date', 'state_of_incorporation', 'authorized_common_stock', 'authorized_common_stock_d', 'authorized_preferred_stock', 'authorized_preferred_stock_d', 'preferred_stock_series', 'preferred_stock_series_d', 'total_shares_authorized', 'total_shares_authorized_d', 'multiple_common_stock_classes', 'common_voting_rights', 'common_issued_shares', 'preferred_stock_terms_Series A', 'liq_pref_flag', 'liquidation_priority_order', 'liquidation_priority_order_d', 'conversion_rights', 'conversion_rights_d', 'automatic_conversion_triggers', 'automatic_conversion_triggers_d', 'merger_indicator', 'merger_indicator_d', 'Name_change_flag', 'notable_features_summary', 'preferred_stock_terms', 'preferred_stock_terms_Series B', 'preferred_stock_terms_Series A1', 'preferred_stock_terms_Series B2', 'preferred_stock_terms_Series C', 'preferred_stock_terms_Series A-1', 'preferred_stock_terms_Series A-2', 'preferred_stock_

In [46]:

# Total shares: value from createdfile vs. value from priorfile
df_comparison = compare_fields(
    file1=createdfile,
    file2=priorfile,
    field1="total_shares_authorized",            # column in createdfile
    field2="Total Number of Stocks Issued",      # column in priorfile
    id_field1="custom_id",                       # ID column in createdfile
    id_field2="File Name",                       # ID column in priorfile
    clean_key_field_in="file1",                  # clean `custom_id` before merging
    # NOTE(review): "Uncertain" maps to 0.0 here but to NaN in the common and
    # preferred stock comparisons — confirm this difference is intentional.
    na_dict={
        "Uncertain": 0.0,
        "N/A": np.nan,
        "": np.nan,
        "MISSING": np.nan,
    },
)

report_mismatches(
    df_comparison,
    id_field='__merge_id__',
    model_field='total_shares_authorized',
    human_field="Total Number of Stocks Issued",
)

Merged columns: ['__merge_id__', 'company_name', 'document_title', 'inc_document', 'filing_date', 'state_of_incorporation', 'authorized_common_stock', 'authorized_common_stock_d', 'authorized_preferred_stock', 'authorized_preferred_stock_d', 'preferred_stock_series', 'preferred_stock_series_d', 'total_shares_authorized', 'total_shares_authorized_d', 'multiple_common_stock_classes', 'common_voting_rights', 'common_issued_shares', 'preferred_stock_terms_Series A', 'liq_pref_flag', 'liquidation_priority_order', 'liquidation_priority_order_d', 'conversion_rights', 'conversion_rights_d', 'automatic_conversion_triggers', 'automatic_conversion_triggers_d', 'merger_indicator', 'merger_indicator_d', 'Name_change_flag', 'notable_features_summary', 'preferred_stock_terms', 'preferred_stock_terms_Series B', 'preferred_stock_terms_Series A1', 'preferred_stock_terms_Series B2', 'preferred_stock_terms_Series C', 'preferred_stock_terms_Series A-1', 'preferred_stock_terms_Series A-2', 'preferred_stock_