In [3]:
# Notebook 2: Data Cleaning

import pandas as pd
import os

# Company identifiers and statement names; together they form the
# "<company>_<statement>" file-name stems used below.
companies = ["AAPL", "JPM", "JNJ"]
statements = ["income_statement", "balance_sheet", "cash_flow"]

# Input and output locations (relative paths)
RAW_DATA_PATH = "data/raw"
CLEANED_DATA_PATH = "data/cleaned"

# Make sure the output directory is present before anything is written to it
os.makedirs(CLEANED_DATA_PATH, exist_ok=True)

def clean_and_save_csv(company, statement, *, raw_dir=None, cleaned_dir=None):
    """Clean one raw statement CSV and write the cleaned copy.

    Reads ``<raw_dir>/<company>_<statement>.csv`` with its first column as
    the index, converts the index to datetimes, sorts the rows by that index,
    drops columns that hold no data at all, replaces the remaining missing
    values with 0 and writes the result to
    ``<cleaned_dir>/<company>_<statement>_clean.csv``.

    Args:
        company: Identifier used as the file-name prefix.
        statement: Statement name used as the second part of the file name.
        raw_dir: Directory holding the raw CSV. Defaults to the module-level
            ``RAW_DATA_PATH``.
        cleaned_dir: Directory that receives the cleaned CSV; created when
            missing. Defaults to the module-level ``CLEANED_DATA_PATH``.

    Returns:
        None. The outcome is reported on stdout. Errors are caught and
        printed instead of raised, so a caller looping over many files keeps
        going after a bad one.
    """
    # Resolve defaults at call time so existing two-argument calls keep using
    # the module constants, while callers (and tests) may point elsewhere.
    if raw_dir is None:
        raw_dir = RAW_DATA_PATH
    if cleaned_dir is None:
        cleaned_dir = CLEANED_DATA_PATH

    raw_file = os.path.join(raw_dir, f"{company}_{statement}.csv")
    cleaned_file = os.path.join(cleaned_dir, f"{company}_{statement}_clean.csv")

    try:
        df = pd.read_csv(raw_file, index_col=0)
        df.index = pd.to_datetime(df.index, errors='coerce')

        # errors='coerce' replaces every label that is not a date with NaT,
        # discarding the original label. Surface that instead of hiding it;
        # the rows themselves are still kept.
        unparsed = int(df.index.isna().sum())
        if unparsed:
            print(f"Warning: {unparsed} index label(s) in {raw_file} are not valid dates")

        df = df.sort_index()
        df = df.dropna(axis=1, how='all')  # Remove columns that are entirely NaN
        df = df.fillna(0)  # Remaining gaps become 0

        # Do not depend on the directory having been created elsewhere.
        if cleaned_dir:
            os.makedirs(cleaned_dir, exist_ok=True)
        df.to_csv(cleaned_file)
        print(f"Cleaned and saved: {cleaned_file}")
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and return.
        print(f"Failed to clean {raw_file}: {e}")

# Run the cleaning step for every company/statement combination,
# finishing all statements of one company before moving to the next.
jobs = [(company, statement) for company in companies for statement in statements]
for company, statement in jobs:
    clean_and_save_csv(company, statement)



Cleaned and saved: data/cleaned\AAPL_income_statement_clean.csv
Cleaned and saved: data/cleaned\AAPL_balance_sheet_clean.csv
Cleaned and saved: data/cleaned\AAPL_cash_flow_clean.csv
Cleaned and saved: data/cleaned\JPM_income_statement_clean.csv
Cleaned and saved: data/cleaned\JPM_balance_sheet_clean.csv
Cleaned and saved: data/cleaned\JPM_cash_flow_clean.csv
Cleaned and saved: data/cleaned\JNJ_income_statement_clean.csv
Cleaned and saved: data/cleaned\JNJ_balance_sheet_clean.csv
Cleaned and saved: data/cleaned\JNJ_cash_flow_clean.csv
