# In [14]:
# exploratory_cash_flow_1sa.ipynb

import os
import pandas as pd

# Root directory that is scanned for CSV files.
# The path is relative, so it resolves against the current working directory.
data_dir = '../data/entities'

# Create the directory (and any missing parents) when it is absent;
# exist_ok=True makes the call a no-op if the directory is already there.
os.makedirs(data_dir, exist_ok=True)

# Module-level accumulators filled in by add_unique_columns():
#   unique_cash_flow_columns: statement type -> set of distinct first-column values
#   entity_usage:             first-column value -> set of entity names that used it
unique_cash_flow_columns = {}
entity_usage = {}


def add_unique_columns(entity_name, statement_type, unique_values):
    """Record the first-column values found in one entity's statement.

    Args:
        entity_name: Name of the entity the values came from.
        statement_type: Key under which the values are grouped in
            ``unique_cash_flow_columns``.
        unique_values: Iterable of hashable values to record.

    Returns:
        None. The two module-level dictionaries are updated in place.
    """
    # setdefault registers the statement type even when unique_values is empty.
    seen_for_statement = unique_cash_flow_columns.setdefault(statement_type, set())
    seen_for_statement.update(unique_values)

    for line_item in unique_values:
        entity_usage.setdefault(line_item, set()).add(entity_name)

# Scan every CSV below data_dir and collect the first-column values of the
# 1-SA cash flow statements.
for root, dirs, files in os.walk(data_dir):
    for file in files:
        if not file.endswith('.csv'):
            continue

        # The form type is the name of the directory one level above the
        # file's own directory; the entity name is the one two levels above.
        form_dir = os.path.dirname(root)
        entity_dir, form_type = os.path.split(form_dir)
        entity_name = os.path.basename(entity_dir)

        # Turn the file name into a title-cased label, e.g. "cash_flow.csv" -> "Cash Flow".
        stem = file.replace('.csv', '')
        statement_type = stem.replace('_', ' ').title()

        # Skip everything that is not a Cash Flow Statement of a 1-SA form.
        if form_type != "1-SA" or "Cash Flow" not in statement_type:
            continue

        file_path = os.path.join(root, file)
        df = pd.read_csv(file_path)

        # Distinct non-null values of the first column.
        first_column = df.iloc[:, 0]
        unique_values = first_column.dropna().unique()
        add_unique_columns(entity_name, statement_type, unique_values)

# Build one single-column DataFrame per statement type.
# The values are held in sets, whose iteration order is arbitrary and (for
# strings) can differ between interpreter runs, so they are sorted to make the
# saved CSV reproducible. key=str keeps the sort from raising TypeError if a
# column mixes text and numeric values.
unique_cash_flow_df_list = [
    pd.DataFrame(sorted(values, key=str), columns=[statement_type])
    for statement_type, values in unique_cash_flow_columns.items()
]

# Place the per-statement columns side by side. pd.concat raises ValueError
# when given an empty list, so fall back to an empty DataFrame when no
# matching statements were collected.
if unique_cash_flow_df_list:
    unique_cash_flow_df = pd.concat(unique_cash_flow_df_list, axis=1)
else:
    print("No 1-SA Cash Flow Statement data was found; the output file will be empty.")
    unique_cash_flow_df = pd.DataFrame()

# Save the unique columns data to CSV
output_file_path = '../data/unique_cash_flow_columns_1sa.csv'
unique_cash_flow_df.to_csv(output_file_path, index=False)

print(f"Unique first columns for Cash Flow Statements in 1-SA have been saved to '{output_file_path}'")

# One row per cash flow line, listing (alphabetically, comma-separated) the
# entities whose 1-SA statements contained that line.
entity_usage_df_list = [
    {"Cash Flow Line": line_item, "Entities": ', '.join(sorted(users))}
    for line_item, users in entity_usage.items()
]
entity_usage_df = pd.DataFrame(entity_usage_df_list)

# Write the table to CSV without the index column.
output_usage_file_path = '../data/cash_flow_line_entity_usage_1sa.csv'
entity_usage_df.to_csv(output_usage_file_path, index=False)

print(f"Entity usage of unique cash flow statement lines in 1-SA have been saved to '{output_usage_file_path}'")

# One row per cash flow line with the number of distinct entities that used it.
line_usage_count_df_list = [
    {"Cash Flow Line": line_item, "Entity Count": len(users)}
    for line_item, users in entity_usage.items()
]
line_usage_count_df = pd.DataFrame(line_usage_count_df_list)

# Write the table to CSV without the index column.
output_count_file_path = '../data/cash_flow_line_usage_count_1sa.csv'
line_usage_count_df.to_csv(output_count_file_path, index=False)

print(f"Count of entities using each cash flow statement line in 1-SA have been saved to '{output_count_file_path}'")


# Output:
# Unique first columns for Cash Flow Statements in 1-SA have been saved to '../data/unique_cash_flow_columns_1sa.csv'
# Entity usage of unique cash flow statement lines in 1-SA have been saved to '../data/cash_flow_line_entity_usage_1sa.csv'
# Count of entities using each cash flow statement line in 1-SA have been saved to '../data/cash_flow_line_usage_count_1sa.csv'
