# Import Libraries

In [324]:
import pandas as pd
import numpy as np

# Data Treatment

In [325]:
# We create the financials for the raw data obtained in notebook data_collection.ipynb

financials = pd.read_pickle('../data/raw_financials.pkl')
financials.sort_values(['Ticker', 'Date'], inplace=True)

In [326]:
cols_to_drop = [

    # =====================================================
    # Highly granular physical assets (low predictive power,
    # highly correlated with aggregate asset metrics)
    # =====================================================
    "Land And Improvements",
    "Buildings And Improvements",
    "Machinery Furniture Equipment",
    "Construction In Progress",
    "Work In Process",
    "Properties",
    "Other Properties",

    # =====================================================
    # Highly granular inventory details
    # =====================================================
    "Finished Goods",
    "Raw Materials",
    "Other Inventories",
    "Inventories Adjustments Allowances",

    # =====================================================
    # Highly granular receivables / payables
    # =====================================================
    "Gross Accounts Receivable",
    "Allowance For Doubtful Accounts Receivable",
    "Receivables Adjustments Allowances",
    "Provision For Doubtful Accounts",
    "Notes Receivable",
    "Non Current Note Receivables",

    # =====================================================
    # Duplicated / redundant Net Income definitions
    # Keep only: "Net Income From Continuing Operations"
    # =====================================================
    "Net Income",
    "Net Income Common Stockholders",
    "Net Income Continuous Operations",
    "Net Income From Continuing Operation Net Minority Interest",
    "Net Income From Continuing And Discontinued Operation",
    "Net Income Including Noncontrolling Interests",
    "Diluted NI Availto Com Stockholders",

    # =====================================================
    # Earnings Per Share (EPS)
    # Keep only: "Diluted EPS"
    # =====================================================
    "Basic EPS",

    # =====================================================
    # Redundant cash flow aggregates
    # Keep only: "Operating Cash Flow" and "Free Cash Flow"
    # =====================================================
    "Cash Flow From Continuing Operating Activities",
    "Financing Cash Flow",
    "Cash Flow From Continuing Financing Activities",
    "Investing Cash Flow",
    "Cash Flow From Continuing Investing Activities",
    "Changes In Cash",
    "Beginning Cash Position",
    "End Cash Position",
    "Cash Cash Equivalents And Short Term Investments",
    "Cash And Cash Equivalents",
    "Repurchase Of Capital Stock",
    "Repayment Of Debt",
    "Issuance Of Debt",
    "Interest Paid Supplemental Data",
    "Income Tax Paid Supplemental Data",
    "Effect Of Exchange Rate Changes",
    "Net Other Financing Charges",
    "Proceeds From Stock Option Exercised",
    "Cash Dividends Paid",
    "Common Stock Dividend Paid",
    "Net Common Stock Issuance",
    "Common Stock Payments",
    "Net Issuance Payments Of Debt",
    "Net Long Term Debt Issuance",
    "Long Term Debt Payments",
    "Long Term Debt Issuance",
    "Net Other Investing Changes",
    "Net Investment Purchase And Sale",
    "Sale Of Investment",
    "Purchase Of Investment",
    "Net Business Purchase And Sale",
    "Purchase Of Business",
    "Net PPE Purchase And Sale",
    "Purchase Of PPE",
    "Other Non Cash Items",

    # =====================================================
    # Depreciation / amortization highly redundant metrics
    # Keep only: "Depreciation And Amortization"
    # =====================================================
    "Normalized EBITDA",
    "Depreciation",
    "Reconciled Depreciation",
    "Depreciation Income Statement",
    "Depreciation Amortization Depletion",
    "Depreciation Amortization Depletion Income Statement",
    "Depreciation And Amortization In Income Statement",
    "Amortization",
    "Amortization Cash Flow",
    "Amortization Of Intangibles",
    "Amortization Of Intangibles Income Statement",
    "Depletion",
    "Depletion Income Statement",

    # =====================================================
    # Overly detailed equity structure components
    # Keep only aggregate equity metrics
    # =====================================================
    "Capital Stock",
    "Common Stock",
    "Preferred Stock",
    "Additional Paid In Capital",
    "Other Equity Adjustments",
    "Gains Losses Not Affecting Retained Earnings",
    "Treasury Stock",
    "Treasury Shares Number",
    "Minority Interest",
    "Minority Interests",
    "Common Stock Equity",
    "Other Non Current Liabilities",
    "Other Current Borrowings",
    "Employee Benefits",
    "Non Current Pension And Other Postretirement Benefit Plans",
    "Long Term Debt And Capital Lease Obligation",
    "Current Deferred Liabilities",
    "Current Deferred Revenue",
    "Pensionand Other Post Retirement Benefit Plans Current",
    "Payables And Accrued Expenses",
    "Current Accrued Expenses",
    "Payables",
    "Accounts Payable",
    "Other Non Current Assets",
    "Other Intangible Assets",
    "Other Current Assets",
    "Other Investments",
    "Other Payable",
    "Preferred Stock Dividends",
    "Preferred Shares Number",
    "Preferred Stock Equity",
    "Long Term Equity Investment",
    "Investments In Other Ventures Under Equity Method",
    "Prepaid Assets",
    "Issuance Of Capital Stock",
    "Preferred Stock Dividend Paid",
    "Common Stock Issuance",
    "Earnings Losses From Equity Investments",
    "Average Dilution Earnings",
    "Other Gand A",
    "Other Equity Interest",
    "Non Current Deferred Revenue",
    "Current Capital Lease Obligation",
    "Non Current Accounts Receivable",

    # =====================================================
    # Working capital micro-changes (very noisy, highly correlated)
    # Keep only: "Working Capital" and "Change In Working Capital"
    # =====================================================
    "Change In Payables And Accrued Expense",
    "Change In Payable",
    "Change In Account Payable",
    "Change In Inventory",
    "Change In Receivables",
    "Changes In Account Receivables",
    "Change In Other Working Capital",
    "Change In Other Current Assets",
    "Change In Other Current Liabilities",

    # =====================================================
    # One-off / non-recurring items (introduce noise)
    # =====================================================
    "Total Unusual Items",
    "Total Unusual Items Excluding Goodwill",
    "Other Income Expense",
    "Other Non Operating Income Expenses",
    "Operating Gains Losses",
    "Gain Loss On Investment Securities",
    "Unrealized Gain Loss On Investment Securities",
    "Gain On Sale Of Business",
    "Gain On Sale Of Ppe",
    "Sale Of Business",
    "Net Income Discontinuous Operations",
    "Net Income From Tax Loss Carryforward",
    "Net Income Extraordinary",
    "Provisionand Write Offof Assets",

    # =====================================================
    # Sector-specific insurance / financial instruments
    # (harm generalization across sectors)
    # =====================================================
    "Net Policyholder Benefits And Claims",
    "Policyholder Benefits Gross",
    "Policyholder Benefits Ceded",
    "Insurance And Claims",
    "Trading Securities",
    "Held To Maturity Securities",
    "Derivative Product Liabilities",

    # =====================================================
    # Overly detailed financing components
    # Keep only aggregate debt metrics
    # =====================================================
    "Commercial Paper",
    "Line Of Credit",
    "Current Notes Payable",
    "Short Term Debt Issuance",
    "Short Term Debt Payments",
    "Net Short Term Debt Issuance",

    # =====================================================
    # Micro / noisy financial metrics
    # =====================================================
    "Net Non Operating Interest Income Expense",
    "Interest Expense Non Operating",
    "Interest Income Non Operating",
    "Reconciled Cost Of Revenue",
    "Operating Revenue",
    "Ordinary Shares Number",
    "Share Issued",
    "Interest Payable",
    "Sale Of PPE",
    "Earnings From Equity Interest",
    "Hedging Assets Current",
    "Restricted Cash",
    "Taxes Receivable",
    "Capital Expenditure Reported",
    "Salaries And Wages",
    "Net Intangibles Purchase And Sale",
    "Purchase Of Intangibles",
    "Amortization Of Securities",
    "Pension And Employee Benefit Expense",
    "Total Other Finance Cost",
    "Other Taxes",
    "Long Term Provisions",
    "Defined Pension Benefit",
    "Preferred Securities Outside Stock Equity",
    "Financial Assets",
    "Assets Held For Sale Current",
    "Liabilities Heldfor Sale Non Current",
    "Investmentsin Associatesat Cost",
    "Other Cash Adjustment Inside ChangeinCash",
    "Net Preferred Stock Issuance",
    "Preferred Stock Issuance",
    "Net Foreign Currency Exchange Gain Loss",
    "Minimum Pension Liabilities",
    "Loans Receivable",
    "Other Cash Adjustment Outside Changein Cash",
    "Cash From Discontinued Financing Activities",
    "Preferred Stock Payments",
    "Cash From Discontinued Investing Activities",
    "Cash From Discontinued Operating Activities",
    "Current Deferred Assets",
    "Dueto Related Parties Current",
    "Investmentsin Joint Venturesat Cost",
    "Dividend Received Cfo",
    "Duefrom Related Parties Current",
    "Rent Expense Supplemental",
    "Rent And Landing Fees",
    "Non Current Prepaid Assets",
    "Non Current Accrued Expenses",
    "Investment Properties",
    "Net Investment Properties Purchase And Sale",
    "Sale Of Investment Properties",
    "Purchase Of Investment Properties",
    "Dividends Received Cfi",
    "Occupancy And Equipment",
    "Professional Expense And Contract Services Expense",
    "Other Non Interest Expense",
    "Cash Cash Equivalents And Federal Funds Sold",
    "Sale Of Intangibles",
    "Financial Assets Designatedas Fair Value Through Profitor Loss Total",
    "Change In Interest Payable",
    "Investmentsin Subsidiariesat Cost",
    "Excess Tax Benefit From Stock Based Compensation",
    "Current Deferred Taxes Assets",
    "Interest Paid Cff",
    "Excise Taxes",
    "Dividend Paid Cfo",
    "Cash Flowsfromusedin Operating Activities Direct",
    "Classesof Cash Payments",
    "Other Cash Paymentsfrom Operating Activities",
    "Paymentson Behalfof Employees",
    "Classesof Cash Receiptsfrom Operating Activities",
    "Other Cash Receiptsfrom Operating Activities",
    "Taxes Refund Paid",
    "Securities Amortization",
    "Duefrom Related Parties Non Current",
    "Dueto Related Parties Non Current",
    "Interest Paid Cfo"
]


In [327]:
# We drop the columns selected previously

financials = financials.drop(columns=cols_to_drop, errors='ignore')

In [328]:
#Lots of columns with too many nulls

prctg_of_nulls = (financials.isna().mean()).sort_values(ascending=False)
prctg_of_nulls[prctg_of_nulls>0.4]

Cash Flow From Discontinued Operation         0.994795
Other Cash Adjustment Inside Changein Cash    0.994489
Accrued Interest Receivable                   0.986222
Unrealized Gain Loss                          0.985609
Foreign Currency Translation Adjustments      0.977648
Loss Adjustment Expense                       0.966626
Cash Equivalents                              0.936926
Gain Loss On Sale Of PPE                      0.922229
Earnings From Equity Interest Net Of Tax      0.918249
Current Provisions                            0.915493
Leases                                        0.913043
Write Off                                     0.906307
Cash Financial                                0.906307
Impairment Of Capital Assets                  0.902021
Otherunder Preferred Stock Dividend           0.885181
Gain Loss On Sale Of Business                 0.884568
Investmentin Financial Assets                 0.875077
Dividends Payable                             0.870790
Available 

In [329]:
# list of critical columns that we want to keep
critical_cols = [
    'Ticker', 'Date', 'EBITDA', 'EBIT', 'Net Interest Income', 
    'Interest Expense', 'Interest Income', 'Normalized Income', 
    'Total Expenses', 'Total Operating Income As Reported',
    'Diluted Average Shares', 'Basic Average Shares', 'Diluted EPS',
    'Tax Provision', 'Pretax Income', 'Operating Income', 'Operating Expense',
    'Research And Development', 'Selling General And Administration',
    'Gross Profit', 'Cost Of Revenue', 'Total Revenue', 'Operating Revenue',
    'Ordinary Shares Number', 'Share Issued', 'Net Debt', 'Total Debt',
    'Tangible Book Value', 'Invested Capital', 'Working Capital', 
    'Net Tangible Assets', 'Common Stock Equity', 'Total Capitalization', 
    'Total Equity Gross Minority Interest', 'Stockholders Equity',
    'Retained Earnings', 'Total Liabilities Net Minority Interest', 
    'Total Non Current Liabilities Net Minority Interest',
    'Other Non Current Liabilities', 'Employee Benefits',
    'Non Current Pension And Other Postretirement Benefit Plans', 
    'Long Term Debt', 'Current Liabilities', 'Current Debt', 
    'Inventory', 'Accounts Receivable', 'Cash And Cash Equivalents',
    'Free Cash Flow', 'Operating Cash Flow', 'Depreciation And Amortization',
    'Net Income From Continuing Operations'
]

# We select columns with >60% nulls
null_percent = financials.isna().mean()
high_null_cols = null_percent[null_percent > 0.6].index.tolist()

# We drop those columns if not critical
cols_to_drop = [col for col in high_null_cols if col not in critical_cols]
financials = financials.drop(columns=cols_to_drop, errors='ignore')

print(f"Dropped {len(cols_to_drop)} columns with >60% nulls & not critical")
print("Dropped columns:", cols_to_drop)
print("Kept columns:", financials.columns.tolist())


Dropped 38 columns with >60% nulls & not critical
Dropped columns: ['Other Receivables', 'Asset Impairment Charge', 'Tradeand Other Payables Non Current', 'Other Current Liabilities', 'Total Tax Payable', 'Income Tax Payable', 'Non Current Deferred Assets', 'Non Current Deferred Taxes Assets', 'Investmentin Financial Assets', 'Available For Sale Securities', 'Leases', 'Other Short Term Investments', 'Cash Equivalents', 'Cash Financial', 'Otherunder Preferred Stock Dividend', 'Other Special Charges', 'Restructuring And Mergern Acquisition', 'Gain On Sale Of Security', 'Other Operating Expenses', 'Dividends Payable', 'Change In Prepaid Assets', 'Earnings From Equity Interest Net Of Tax', 'Loss Adjustment Expense', 'Selling And Marketing Expense', 'General And Administrative Expense', 'Change In Accrued Expense', 'Change In Tax Payable', 'Change In Income Tax Payable', 'Current Provisions', 'Gain Loss On Sale Of PPE', 'Write Off', 'Impairment Of Capital Assets', 'Other Cash Adjustment Ins

In [330]:
# We impute nulls in a gross manner just to have a clean dataset and create a benchmark prediction

fill_zero = [
    'EBITDA','EBIT','Net Interest Income','Interest Expense','Interest Income',
    'Normalized Income','Total Expenses','Operating Income','Operating Expense',
    'Gross Profit','Cost Of Revenue','Total Revenue',
    'Research And Development','Selling General And Administration',
    'Tax Provision','Pretax Income','Net Income From Continuing Operations',
    'Depreciation And Amortization','Stock Based Compensation',
    'Tax Effect Of Unusual Items','Special Income Charges',
    'Deferred Tax','Deferred Income Tax','Change In Working Capital',
    'Operating Cash Flow','Free Cash Flow','Capital Expenditure',  'Tax Rate For Calcs',
    'Total Operating Income As Reported'
]

ffill_cols = [
    'Net Debt','Total Debt','Long Term Debt','Current Debt',
    'Current Debt And Capital Lease Obligation',
    'Total Assets','Total Non Current Assets','Current Assets',
    'Inventory','Receivables','Accounts Receivable',
    'Net PPE','Gross PPE','Accumulated Depreciation',
    'Goodwill','Goodwill And Other Intangible Assets',
    'Investments And Advances','Working Capital',
    'Tangible Book Value','Net Tangible Assets','Invested Capital',
    'Total Capitalization','Stockholders Equity','Retained Earnings',
    'Total Liabilities Net Minority Interest',
    'Total Non Current Liabilities Net Minority Interest',
    'Non Current Deferred Liabilities',
    'Non Current Deferred Taxes Liabilities',
    'Capital Lease Obligations','Long Term Capital Lease Obligation',
    'Diluted Average Shares','Basic Average Shares','Diluted EPS', 
    'Current Liabilities', 'Total Equity Gross Minority Interest'
]

financials = financials.sort_values(['Ticker', 'Date'])

financials[fill_zero] = financials[fill_zero].fillna(0)
financials[ffill_cols] = financials.groupby('Ticker')[ffill_cols].ffill().bfill().fillna(0)

print(f'nulls in the dataset financials: {financials.isna().sum().sum()}')

nulls in the dataset financials: 0


# We save the data as a pickle in the folder 'data'

In [331]:
#We save the 'financials' DataFrame as a pickle with the name financials.pkl in the 'data' folder
#We save the data folder path or create it if it doesn't exist

project_path = Path('..')          
data_folder = project_path / 'data' 
data_folder.mkdir(exist_ok=True) 

#We save the DataFrame in pikle format
file_path = data_folder / 'financials.pkl'
financials.to_pickle(file_path)

print(f"DataFrame saved successfully in {file_path}")

DataFrame saved successfully in ../data/financials.pkl
