In [18]:
"""
This module imputes data using KNN imputation.
Author: Anuvrat Chaturvedi
Date: 2024-06-09
Inspired by: https://machinelearningmastery.com/knn-imputation-for-missing-values-in-machine-learning/
"""

'\nThis module imputes data using KNN imputation.\nAuthor: Anuvrat Chaturvedi\nDate: 2024-06-09\nInspired by: https://machinelearningmastery.com/knn-imputation-for-missing-values-in-machine-learning/\n'

### Import libraries and define standard variables

In [63]:
# Importing the required libraries
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
# IterativeImputer is experimental and explicitly requires this enabling import first
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import KNNImputer, IterativeImputer, SimpleImputer
from sklearn.linear_model import BayesianRidge

In [51]:
# Define common variables
# Folder prefix used for both the input pickle and every imputed pickle written below
data_folder = '../data/'
# Seed passed as random_state to the IterativeImputer
seed=42

### Load the datasets

In [8]:
# Load the pickled set of combined EDGAR data (per the original note: latest year and quarter)
# and preview its first rows.
# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted source.
df_edgar_combined = pd.read_pickle(data_folder + 'df_edgar_combined.pkl')
df_edgar_combined.head()

Unnamed: 0,sym,IncomeTaxExpenseBenefit_CY2023,NetCashProvidedByUsedInFinancingActivities_CY2023,WeightedAverageNumberOfSharesOutstandingBasic_CY2023,WeightedAverageNumberOfDilutedSharesOutstanding_CY2023,NetCashProvidedByUsedInInvestingActivities_CY2023,NetCashProvidedByUsedInOperatingActivities_CY2023,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect,ComprehensiveIncomeNetOfTax_CY2023,NetIncomeLoss_CY2023,...,Assets,LiabilitiesAndStockholdersEquity,AccumulatedOtherComprehensiveIncomeLossNetOfTax,RetainedEarningsAccumulatedDeficit,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents,StockholdersEquity,Goodwill,AssetsCurrent,LiabilitiesCurrent,CashAndCashEquivalentsAtCarryingValue
0,A,99000000.0,-930000000.0,294000000.0,296000000.0,-310000000.0,1772000000.0,537000000.0,1260000000.0,1240000000.0,...,10856000000.0,10856000000.0,-337000000.0,1090000000.0,1674000000.0,6214000000.0,3963000000.0,4203000000.0,1958000000.0,1671000000.0
1,AAL,299000000.0,-3206000000.0,653612000.0,719669000.0,-502000000.0,3803000000.0,95000000.0,513000000.0,822000000.0,...,64384000000.0,64384000000.0,-4877000000.0,-8001000000.0,703000000.0,-5500000000.0,4091000000.0,14542000000.0,24993000000.0,
2,AAPL,16741000000.0,-108488000000.0,15744230000.0,15812550000.0,3705000000.0,110543000000.0,5760000000.0,96652000000.0,96995000000.0,...,337411000000.0,337411000000.0,-8960000000.0,4339000000.0,33921000000.0,74194000000.0,,128416000000.0,123822000000.0,32695000000.0
3,ABT,941000000.0,-7091000000.0,1740000000.0,1749000000.0,-3133000000.0,7261000000.0,-2986000000.0,5935000000.0,5723000000.0,...,72467000000.0,72467000000.0,-8166000000.0,38011000000.0,6284000000.0,38810000000.0,23383000000.0,22376000000.0,14021000000.0,6284000000.0
4,ACGL,-873000000.0,-69000000.0,368700000.0,378800000.0,-5468000000.0,5749000000.0,225000000.0,5413000000.0,4443000000.0,...,62768000000.0,62768000000.0,-821000000.0,21405000000.0,1600000000.0,19355000000.0,,,,993000000.0


In [33]:
# Keeping only numeric columns for imputation.
# The non-numeric `sym` column is re-attached to each imputed result in the cells below.
df_edgar_combined_numeric = df_edgar_combined.select_dtypes(include='number')

### Simple Imputer

Three simple imputation strategies are used: constant (0), mean, and median.

In [67]:
# Define the imputer to be used: replace every missing value with the constant 0
imputer = SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan, add_indicator=False)

# Fit the imputer on the numeric data and impute it in a single step
df_edgar_combined_imputed = imputer.fit_transform(df_edgar_combined_numeric)

# Fail early with a clear message if the imputer returned a different shape
# (presumably possible when a feature is entirely missing - verify for the installed
# scikit-learn version); the column labels re-attached below would no longer line up.
assert df_edgar_combined_imputed.shape == df_edgar_combined_numeric.shape, \
    "Imputer changed the shape of the data; column labels would be misaligned"

# Fill rate of the imputed data and number of cells that are still missing
nan_mask = np.isnan(df_edgar_combined_imputed)
fill_rate_imputed = np.mean(~nan_mask)
print(f"Missing: {nan_mask.sum()}")

# Convert the imputed data to a dataframe.
# The original index is reused so that the axis=1 concat (which aligns rows by index label)
# pairs every imputed row with its own `sym`, even if the loaded frame's index is not 0..n-1.
df_edgar_combined_imputed_simple_zero = pd.concat(
    [
        df_edgar_combined.sym,
        pd.DataFrame(
            df_edgar_combined_imputed,
            columns=df_edgar_combined_numeric.columns,
            index=df_edgar_combined_numeric.index,
        ),
    ],
    axis=1,
)

# Display the imputed data
display(df_edgar_combined_imputed_simple_zero.head())

# Save the imputed data
df_edgar_combined_imputed_simple_zero.to_pickle(data_folder + 'df_edgar_combined_imputed_simple_zero.pkl')

Missing: 0


Unnamed: 0,sym,IncomeTaxExpenseBenefit_CY2023,NetCashProvidedByUsedInFinancingActivities_CY2023,WeightedAverageNumberOfSharesOutstandingBasic_CY2023,WeightedAverageNumberOfDilutedSharesOutstanding_CY2023,NetCashProvidedByUsedInInvestingActivities_CY2023,NetCashProvidedByUsedInOperatingActivities_CY2023,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect,ComprehensiveIncomeNetOfTax_CY2023,NetIncomeLoss_CY2023,...,Assets,LiabilitiesAndStockholdersEquity,AccumulatedOtherComprehensiveIncomeLossNetOfTax,RetainedEarningsAccumulatedDeficit,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents,StockholdersEquity,Goodwill,AssetsCurrent,LiabilitiesCurrent,CashAndCashEquivalentsAtCarryingValue
0,A,99000000.0,-930000000.0,294000000.0,296000000.0,-310000000.0,1772000000.0,537000000.0,1260000000.0,1240000000.0,...,10856000000.0,10856000000.0,-337000000.0,1090000000.0,1674000000.0,6214000000.0,3963000000.0,4203000000.0,1958000000.0,1671000000.0
1,AAL,299000000.0,-3206000000.0,653612000.0,719669000.0,-502000000.0,3803000000.0,95000000.0,513000000.0,822000000.0,...,64384000000.0,64384000000.0,-4877000000.0,-8001000000.0,703000000.0,-5500000000.0,4091000000.0,14542000000.0,24993000000.0,0.0
2,AAPL,16741000000.0,-108488000000.0,15744230000.0,15812550000.0,3705000000.0,110543000000.0,5760000000.0,96652000000.0,96995000000.0,...,337411000000.0,337411000000.0,-8960000000.0,4339000000.0,33921000000.0,74194000000.0,0.0,128416000000.0,123822000000.0,32695000000.0
3,ABT,941000000.0,-7091000000.0,1740000000.0,1749000000.0,-3133000000.0,7261000000.0,-2986000000.0,5935000000.0,5723000000.0,...,72467000000.0,72467000000.0,-8166000000.0,38011000000.0,6284000000.0,38810000000.0,23383000000.0,22376000000.0,14021000000.0,6284000000.0
4,ACGL,-873000000.0,-69000000.0,368700000.0,378800000.0,-5468000000.0,5749000000.0,225000000.0,5413000000.0,4443000000.0,...,62768000000.0,62768000000.0,-821000000.0,21405000000.0,1600000000.0,19355000000.0,0.0,0.0,0.0,993000000.0


In [64]:
# Define the imputer to be used: replace missing values with the mean of each column
imputer = SimpleImputer(strategy='mean', missing_values=np.nan, add_indicator=False)

# Fit the imputer on the numeric data and impute it in a single step
df_edgar_combined_imputed = imputer.fit_transform(df_edgar_combined_numeric)

# Fail early with a clear message if the imputer returned a different shape
# (presumably possible when a feature is entirely missing - verify for the installed
# scikit-learn version); the column labels re-attached below would no longer line up.
assert df_edgar_combined_imputed.shape == df_edgar_combined_numeric.shape, \
    "Imputer changed the shape of the data; column labels would be misaligned"

# Fill rate of the imputed data and number of cells that are still missing
nan_mask = np.isnan(df_edgar_combined_imputed)
fill_rate_imputed = np.mean(~nan_mask)
print(f"Missing: {nan_mask.sum()}")

# Convert the imputed data to a dataframe.
# The original index is reused so that the axis=1 concat (which aligns rows by index label)
# pairs every imputed row with its own `sym`, even if the loaded frame's index is not 0..n-1.
df_edgar_combined_imputed_simple_mean = pd.concat(
    [
        df_edgar_combined.sym,
        pd.DataFrame(
            df_edgar_combined_imputed,
            columns=df_edgar_combined_numeric.columns,
            index=df_edgar_combined_numeric.index,
        ),
    ],
    axis=1,
)

# Display the imputed data
display(df_edgar_combined_imputed_simple_mean.head())

# Save the imputed data
df_edgar_combined_imputed_simple_mean.to_pickle(data_folder + 'df_edgar_combined_imputed_simple_mean.pkl')


Missing: 0


Unnamed: 0,sym,IncomeTaxExpenseBenefit_CY2023,NetCashProvidedByUsedInFinancingActivities_CY2023,WeightedAverageNumberOfSharesOutstandingBasic_CY2023,WeightedAverageNumberOfDilutedSharesOutstanding_CY2023,NetCashProvidedByUsedInInvestingActivities_CY2023,NetCashProvidedByUsedInOperatingActivities_CY2023,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect,ComprehensiveIncomeNetOfTax_CY2023,NetIncomeLoss_CY2023,...,Assets,LiabilitiesAndStockholdersEquity,AccumulatedOtherComprehensiveIncomeLossNetOfTax,RetainedEarningsAccumulatedDeficit,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents,StockholdersEquity,Goodwill,AssetsCurrent,LiabilitiesCurrent,CashAndCashEquivalentsAtCarryingValue
0,A,99000000.0,-930000000.0,294000000.0,296000000.0,-310000000.0,1772000000.0,537000000.0,1260000000.0,1240000000.0,...,10856000000.0,10856000000.0,-337000000.0,1090000000.0,1674000000.0,6214000000.0,3963000000.0,4203000000.0,1958000000.0,1671000000.0
1,AAL,299000000.0,-3206000000.0,653612000.0,719669000.0,-502000000.0,3803000000.0,95000000.0,513000000.0,822000000.0,...,64384000000.0,64384000000.0,-4877000000.0,-8001000000.0,703000000.0,-5500000000.0,4091000000.0,14542000000.0,24993000000.0,4128997000.0
2,AAPL,16741000000.0,-108488000000.0,15744230000.0,15812550000.0,3705000000.0,110543000000.0,5760000000.0,96652000000.0,96995000000.0,...,337411000000.0,337411000000.0,-8960000000.0,4339000000.0,33921000000.0,74194000000.0,9693617000.0,128416000000.0,123822000000.0,32695000000.0
3,ABT,941000000.0,-7091000000.0,1740000000.0,1749000000.0,-3133000000.0,7261000000.0,-2986000000.0,5935000000.0,5723000000.0,...,72467000000.0,72467000000.0,-8166000000.0,38011000000.0,6284000000.0,38810000000.0,23383000000.0,22376000000.0,14021000000.0,6284000000.0
4,ACGL,-873000000.0,-69000000.0,368700000.0,378800000.0,-5468000000.0,5749000000.0,225000000.0,5413000000.0,4443000000.0,...,62768000000.0,62768000000.0,-821000000.0,21405000000.0,1600000000.0,19355000000.0,9693617000.0,14213190000.0,12168560000.0,993000000.0


In [65]:
# Define the imputer to be used: replace missing values with the median of each column
imputer = SimpleImputer(strategy='median', missing_values=np.nan, add_indicator=False)

# Fit the imputer on the numeric data and impute it in a single step
df_edgar_combined_imputed = imputer.fit_transform(df_edgar_combined_numeric)

# Fail early with a clear message if the imputer returned a different shape
# (presumably possible when a feature is entirely missing - verify for the installed
# scikit-learn version); the column labels re-attached below would no longer line up.
assert df_edgar_combined_imputed.shape == df_edgar_combined_numeric.shape, \
    "Imputer changed the shape of the data; column labels would be misaligned"

# Fill rate of the imputed data and number of cells that are still missing
nan_mask = np.isnan(df_edgar_combined_imputed)
fill_rate_imputed = np.mean(~nan_mask)
print(f"Missing: {nan_mask.sum()}")

# Convert the imputed data to a dataframe.
# The original index is reused so that the axis=1 concat (which aligns rows by index label)
# pairs every imputed row with its own `sym`, even if the loaded frame's index is not 0..n-1.
df_edgar_combined_imputed_simple_median = pd.concat(
    [
        df_edgar_combined.sym,
        pd.DataFrame(
            df_edgar_combined_imputed,
            columns=df_edgar_combined_numeric.columns,
            index=df_edgar_combined_numeric.index,
        ),
    ],
    axis=1,
)

# Display the imputed data
display(df_edgar_combined_imputed_simple_median.head())

# Save the imputed data
df_edgar_combined_imputed_simple_median.to_pickle(data_folder + 'df_edgar_combined_imputed_simple_median.pkl')

Missing: 0


Unnamed: 0,sym,IncomeTaxExpenseBenefit_CY2023,NetCashProvidedByUsedInFinancingActivities_CY2023,WeightedAverageNumberOfSharesOutstandingBasic_CY2023,WeightedAverageNumberOfDilutedSharesOutstanding_CY2023,NetCashProvidedByUsedInInvestingActivities_CY2023,NetCashProvidedByUsedInOperatingActivities_CY2023,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect,ComprehensiveIncomeNetOfTax_CY2023,NetIncomeLoss_CY2023,...,Assets,LiabilitiesAndStockholdersEquity,AccumulatedOtherComprehensiveIncomeLossNetOfTax,RetainedEarningsAccumulatedDeficit,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents,StockholdersEquity,Goodwill,AssetsCurrent,LiabilitiesCurrent,CashAndCashEquivalentsAtCarryingValue
0,A,99000000.0,-930000000.0,294000000.0,296000000.0,-310000000.0,1772000000.0,537000000.0,1260000000.0,1240000000.0,...,10856000000.0,10856000000.0,-337000000.0,1090000000.0,1674000000.0,6214000000.0,3963000000.0,4203000000.0,1958000000.0,1671000000.0
1,AAL,299000000.0,-3206000000.0,653612000.0,719669000.0,-502000000.0,3803000000.0,95000000.0,513000000.0,822000000.0,...,64384000000.0,64384000000.0,-4877000000.0,-8001000000.0,703000000.0,-5500000000.0,4091000000.0,14542000000.0,24993000000.0,1100000000.0
2,AAPL,16741000000.0,-108488000000.0,15744230000.0,15812550000.0,3705000000.0,110543000000.0,5760000000.0,96652000000.0,96995000000.0,...,337411000000.0,337411000000.0,-8960000000.0,4339000000.0,33921000000.0,74194000000.0,4527000000.0,128416000000.0,123822000000.0,32695000000.0
3,ABT,941000000.0,-7091000000.0,1740000000.0,1749000000.0,-3133000000.0,7261000000.0,-2986000000.0,5935000000.0,5723000000.0,...,72467000000.0,72467000000.0,-8166000000.0,38011000000.0,6284000000.0,38810000000.0,23383000000.0,22376000000.0,14021000000.0,6284000000.0
4,ACGL,-873000000.0,-69000000.0,368700000.0,378800000.0,-5468000000.0,5749000000.0,225000000.0,5413000000.0,4443000000.0,...,62768000000.0,62768000000.0,-821000000.0,21405000000.0,1600000000.0,19355000000.0,4527000000.0,5822725000.0,4534000000.0,993000000.0


### KNN Imputer

In [32]:
# Define the imputer to be used: each missing value is filled from the 5 nearest rows,
# weighted uniformly, using the NaN-aware Euclidean distance
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

# Fit the imputer on the numeric data and impute it in a single step
df_edgar_combined_imputed = imputer.fit_transform(df_edgar_combined_numeric)

# Fail early with a clear message if the imputer returned a different shape
# (presumably possible when a feature is entirely missing - verify for the installed
# scikit-learn version); the column labels re-attached below would no longer line up.
assert df_edgar_combined_imputed.shape == df_edgar_combined_numeric.shape, \
    "Imputer changed the shape of the data; column labels would be misaligned"

# Fill rate of the imputed data and number of cells that are still missing
nan_mask = np.isnan(df_edgar_combined_imputed)
fill_rate_imputed = np.mean(~nan_mask)
print(f"Missing: {nan_mask.sum()}")

# Convert the imputed data to a dataframe.
# The original index is reused so that the axis=1 concat (which aligns rows by index label)
# pairs every imputed row with its own `sym`, even if the loaded frame's index is not 0..n-1.
df_edgar_combined_imputed_knn = pd.concat(
    [
        df_edgar_combined.sym,
        pd.DataFrame(
            df_edgar_combined_imputed,
            columns=df_edgar_combined_numeric.columns,
            index=df_edgar_combined_numeric.index,
        ),
    ],
    axis=1,
)

# Display the imputed data
display(df_edgar_combined_imputed_knn.head())

# Save the imputed data
df_edgar_combined_imputed_knn.to_pickle(data_folder + 'df_edgar_combined_imputed_knn.pkl')


Missing: 0


Unnamed: 0,sym,IncomeTaxExpenseBenefit_CY2023,NetCashProvidedByUsedInFinancingActivities_CY2023,WeightedAverageNumberOfSharesOutstandingBasic_CY2023,WeightedAverageNumberOfDilutedSharesOutstanding_CY2023,NetCashProvidedByUsedInInvestingActivities_CY2023,NetCashProvidedByUsedInOperatingActivities_CY2023,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect,ComprehensiveIncomeNetOfTax_CY2023,NetIncomeLoss_CY2023,...,Assets,LiabilitiesAndStockholdersEquity,AccumulatedOtherComprehensiveIncomeLossNetOfTax,RetainedEarningsAccumulatedDeficit,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents,StockholdersEquity,Goodwill,AssetsCurrent,LiabilitiesCurrent,CashAndCashEquivalentsAtCarryingValue
0,A,99000000.0,-930000000.0,294000000.0,296000000.0,-310000000.0,1772000000.0,537000000.0,1260000000.0,1240000000.0,...,10856000000.0,10856000000.0,-337000000.0,1090000000.0,1674000000.0,6214000000.0,3963000000.0,4203000000.0,1958000000.0,1671000000.0
1,AAL,299000000.0,-3206000000.0,653612000.0,719669000.0,-502000000.0,3803000000.0,95000000.0,513000000.0,822000000.0,...,64384000000.0,64384000000.0,-4877000000.0,-8001000000.0,703000000.0,-5500000000.0,4091000000.0,14542000000.0,24993000000.0,3873860000.0
2,AAPL,16741000000.0,-108488000000.0,15744230000.0,15812550000.0,3705000000.0,110543000000.0,5760000000.0,96652000000.0,96995000000.0,...,337411000000.0,337411000000.0,-8960000000.0,4339000000.0,33921000000.0,74194000000.0,41102220000.0,128416000000.0,123822000000.0,32695000000.0
3,ABT,941000000.0,-7091000000.0,1740000000.0,1749000000.0,-3133000000.0,7261000000.0,-2986000000.0,5935000000.0,5723000000.0,...,72467000000.0,72467000000.0,-8166000000.0,38011000000.0,6284000000.0,38810000000.0,23383000000.0,22376000000.0,14021000000.0,6284000000.0
4,ACGL,-873000000.0,-69000000.0,368700000.0,378800000.0,-5468000000.0,5749000000.0,225000000.0,5413000000.0,4443000000.0,...,62768000000.0,62768000000.0,-821000000.0,21405000000.0,1600000000.0,19355000000.0,9063140000.0,16259110000.0,13230530000.0,993000000.0


### Iterative Imputer

In [61]:
# Define the imputer to be used: each feature with missing values is modelled with a
# BayesianRidge regression on up to 5 other features, starting from a mean fill,
# for at most 10 rounds; random_state is fixed so the posterior sampling is repeatable
imputer = IterativeImputer(
    estimator=BayesianRidge(),
    n_nearest_features=5,
    initial_strategy='mean',
    max_iter=10,
    sample_posterior=True,
    tol=0.001,
    missing_values=np.nan,
    imputation_order='ascending',
    verbose=1,
    random_state=seed,
)

# Fit the imputer on the data
# NOTE(review): fit() and transform() are intentionally kept as two calls. With
# sample_posterior=True a single fit_transform() may yield different imputed values
# than the recorded run - confirm before merging them.
imputer.fit(df_edgar_combined_numeric)

# Impute the data
df_edgar_combined_imputed = imputer.transform(df_edgar_combined_numeric)

# Fail early with a clear message if the imputer returned a different shape
# (presumably possible when a feature is entirely missing - verify for the installed
# scikit-learn version); the column labels re-attached below would no longer line up.
assert df_edgar_combined_imputed.shape == df_edgar_combined_numeric.shape, \
    "Imputer changed the shape of the data; column labels would be misaligned"

# Fill rate of the imputed data and number of cells that are still missing
nan_mask = np.isnan(df_edgar_combined_imputed)
fill_rate_imputed = np.mean(~nan_mask)
print(f"Missing: {nan_mask.sum()}")

# Convert the imputed data to a dataframe.
# The original index is reused so that the axis=1 concat (which aligns rows by index label)
# pairs every imputed row with its own `sym`, even if the loaded frame's index is not 0..n-1.
df_edgar_combined_imputed_iterative = pd.concat(
    [
        df_edgar_combined.sym,
        pd.DataFrame(
            df_edgar_combined_imputed,
            columns=df_edgar_combined_numeric.columns,
            index=df_edgar_combined_numeric.index,
        ),
    ],
    axis=1,
)

# Display the imputed data
display(df_edgar_combined_imputed_iterative.head())

# Save the imputed data
df_edgar_combined_imputed_iterative.to_pickle(data_folder + 'df_edgar_combined_imputed_iterative.pkl')


[IterativeImputer] Completing matrix with shape (428, 42)
[IterativeImputer] Completing matrix with shape (428, 42)
Missing: 0


Unnamed: 0,sym,IncomeTaxExpenseBenefit_CY2023,NetCashProvidedByUsedInFinancingActivities_CY2023,WeightedAverageNumberOfSharesOutstandingBasic_CY2023,WeightedAverageNumberOfDilutedSharesOutstanding_CY2023,NetCashProvidedByUsedInInvestingActivities_CY2023,NetCashProvidedByUsedInOperatingActivities_CY2023,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect,ComprehensiveIncomeNetOfTax_CY2023,NetIncomeLoss_CY2023,...,Assets,LiabilitiesAndStockholdersEquity,AccumulatedOtherComprehensiveIncomeLossNetOfTax,RetainedEarningsAccumulatedDeficit,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents,StockholdersEquity,Goodwill,AssetsCurrent,LiabilitiesCurrent,CashAndCashEquivalentsAtCarryingValue
0,A,99000000.0,-930000000.0,294000000.0,296000000.0,-310000000.0,1772000000.0,537000000.0,1260000000.0,1240000000.0,...,10856000000.0,10856000000.0,-337000000.0,1090000000.0,1674000000.0,6214000000.0,3963000000.0,4203000000.0,1958000000.0,1671000000.0
1,AAL,299000000.0,-3206000000.0,653612000.0,719669000.0,-502000000.0,3803000000.0,95000000.0,513000000.0,822000000.0,...,64384000000.0,64384000000.0,-4877000000.0,-8001000000.0,703000000.0,-5500000000.0,4091000000.0,14542000000.0,24993000000.0,16853930000.0
2,AAPL,16741000000.0,-108488000000.0,15744230000.0,15812550000.0,3705000000.0,110543000000.0,5760000000.0,96652000000.0,96995000000.0,...,337411000000.0,337411000000.0,-8960000000.0,4339000000.0,33921000000.0,74194000000.0,20218730000.0,128416000000.0,123822000000.0,32695000000.0
3,ABT,941000000.0,-7091000000.0,1740000000.0,1749000000.0,-3133000000.0,7261000000.0,-2986000000.0,5935000000.0,5723000000.0,...,72467000000.0,72467000000.0,-8166000000.0,38011000000.0,6284000000.0,38810000000.0,23383000000.0,22376000000.0,14021000000.0,6284000000.0
4,ACGL,-873000000.0,-69000000.0,368700000.0,378800000.0,-5468000000.0,5749000000.0,225000000.0,5413000000.0,4443000000.0,...,62768000000.0,62768000000.0,-821000000.0,21405000000.0,1600000000.0,19355000000.0,8669049000.0,8799913000.0,13035260000.0,993000000.0


In [38]:
# Preview the original (un-imputed) data again, as a reference for the imputed frame shown next
df_edgar_combined.head()

Unnamed: 0,sym,IncomeTaxExpenseBenefit_CY2023,NetCashProvidedByUsedInFinancingActivities_CY2023,WeightedAverageNumberOfSharesOutstandingBasic_CY2023,WeightedAverageNumberOfDilutedSharesOutstanding_CY2023,NetCashProvidedByUsedInInvestingActivities_CY2023,NetCashProvidedByUsedInOperatingActivities_CY2023,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect,ComprehensiveIncomeNetOfTax_CY2023,NetIncomeLoss_CY2023,...,Assets,LiabilitiesAndStockholdersEquity,AccumulatedOtherComprehensiveIncomeLossNetOfTax,RetainedEarningsAccumulatedDeficit,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents,StockholdersEquity,Goodwill,AssetsCurrent,LiabilitiesCurrent,CashAndCashEquivalentsAtCarryingValue
0,A,99000000.0,-930000000.0,294000000.0,296000000.0,-310000000.0,1772000000.0,537000000.0,1260000000.0,1240000000.0,...,10856000000.0,10856000000.0,-337000000.0,1090000000.0,1674000000.0,6214000000.0,3963000000.0,4203000000.0,1958000000.0,1671000000.0
1,AAL,299000000.0,-3206000000.0,653612000.0,719669000.0,-502000000.0,3803000000.0,95000000.0,513000000.0,822000000.0,...,64384000000.0,64384000000.0,-4877000000.0,-8001000000.0,703000000.0,-5500000000.0,4091000000.0,14542000000.0,24993000000.0,
2,AAPL,16741000000.0,-108488000000.0,15744230000.0,15812550000.0,3705000000.0,110543000000.0,5760000000.0,96652000000.0,96995000000.0,...,337411000000.0,337411000000.0,-8960000000.0,4339000000.0,33921000000.0,74194000000.0,,128416000000.0,123822000000.0,32695000000.0
3,ABT,941000000.0,-7091000000.0,1740000000.0,1749000000.0,-3133000000.0,7261000000.0,-2986000000.0,5935000000.0,5723000000.0,...,72467000000.0,72467000000.0,-8166000000.0,38011000000.0,6284000000.0,38810000000.0,23383000000.0,22376000000.0,14021000000.0,6284000000.0
4,ACGL,-873000000.0,-69000000.0,368700000.0,378800000.0,-5468000000.0,5749000000.0,225000000.0,5413000000.0,4443000000.0,...,62768000000.0,62768000000.0,-821000000.0,21405000000.0,1600000000.0,19355000000.0,,,,993000000.0


In [40]:
# Preview the KNN-imputed data for a side-by-side comparison with the original rows
df_edgar_combined_imputed_knn.head()

Unnamed: 0,sym,IncomeTaxExpenseBenefit_CY2023,NetCashProvidedByUsedInFinancingActivities_CY2023,WeightedAverageNumberOfSharesOutstandingBasic_CY2023,WeightedAverageNumberOfDilutedSharesOutstanding_CY2023,NetCashProvidedByUsedInInvestingActivities_CY2023,NetCashProvidedByUsedInOperatingActivities_CY2023,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect,ComprehensiveIncomeNetOfTax_CY2023,NetIncomeLoss_CY2023,...,Assets,LiabilitiesAndStockholdersEquity,AccumulatedOtherComprehensiveIncomeLossNetOfTax,RetainedEarningsAccumulatedDeficit,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents,StockholdersEquity,Goodwill,AssetsCurrent,LiabilitiesCurrent,CashAndCashEquivalentsAtCarryingValue
0,A,99000000.0,-930000000.0,294000000.0,296000000.0,-310000000.0,1772000000.0,537000000.0,1260000000.0,1240000000.0,...,10856000000.0,10856000000.0,-337000000.0,1090000000.0,1674000000.0,6214000000.0,3963000000.0,4203000000.0,1958000000.0,1671000000.0
1,AAL,299000000.0,-3206000000.0,653612000.0,719669000.0,-502000000.0,3803000000.0,95000000.0,513000000.0,822000000.0,...,64384000000.0,64384000000.0,-4877000000.0,-8001000000.0,703000000.0,-5500000000.0,4091000000.0,14542000000.0,24993000000.0,3873860000.0
2,AAPL,16741000000.0,-108488000000.0,15744230000.0,15812550000.0,3705000000.0,110543000000.0,5760000000.0,96652000000.0,96995000000.0,...,337411000000.0,337411000000.0,-8960000000.0,4339000000.0,33921000000.0,74194000000.0,41102220000.0,128416000000.0,123822000000.0,32695000000.0
3,ABT,941000000.0,-7091000000.0,1740000000.0,1749000000.0,-3133000000.0,7261000000.0,-2986000000.0,5935000000.0,5723000000.0,...,72467000000.0,72467000000.0,-8166000000.0,38011000000.0,6284000000.0,38810000000.0,23383000000.0,22376000000.0,14021000000.0,6284000000.0
4,ACGL,-873000000.0,-69000000.0,368700000.0,378800000.0,-5468000000.0,5749000000.0,225000000.0,5413000000.0,4443000000.0,...,62768000000.0,62768000000.0,-821000000.0,21405000000.0,1600000000.0,19355000000.0,9063140000.0,16259110000.0,13230530000.0,993000000.0


In [31]:
# TODO: Evaluate each imputer by training a model on its imputed dataset and measuring the model's performance.
# Compare the results across the imputation methods used above (constant, mean, median, KNN and iterative).