In [1]:
import pandas as pd

In [2]:
csv = pd.read_csv("../complete_cash_flow_statements.csv",keep_default_na=True,na_values=["None", "none", "NaN", "-"],low_memory=False,dtype={"company_id": str})
csv["company_id"] = csv["company_id"].astype(str)

In [3]:
df = csv
bad_ids = df[~df["company_id"].str.fullmatch(r"\d{4}")]
print("Bad company_ids:")
print(bad_ids['company_id'])

Bad company_ids:
64      03008
65      03008
66      03008
67      03008
68      03008
        ...  
9657    03039
9658    03039
9659    03039
9660    03039
9661    03039
Name: company_id, Length: 378, dtype: object


In [4]:
# Ensure company_id is string
csv["company_id"] = csv["company_id"].astype(str)

# Detect bad IDs (not exactly 4 digits)
bad_mask = ~csv["company_id"].str.fullmatch(r"\d{4}")

# Print them
print("Bad company_ids:")
print(csv.loc[bad_mask, "company_id"])

# Fix only the bad ones
csv.loc[bad_mask, "company_id"] = csv.loc[bad_mask, "company_id"].str.zfill(4)

Bad company_ids:
64      03008
65      03008
66      03008
67      03008
68      03008
        ...  
9657    03039
9658    03039
9659    03039
9660    03039
9661    03039
Name: company_id, Length: 378, dtype: object


In [5]:
# Ensure company_id is string type
csv["company_id"] = csv["company_id"].astype(str)

# Find malformed IDs that are not 4-digit strings
bad_ids = csv[~csv["company_id"].str.fullmatch(r"\d{4}")]

# Report bad IDs
print("Bad company_ids:")
print(bad_ids["company_id"])

Bad company_ids:
64      03008
65      03008
66      03008
67      03008
68      03008
        ...  
9657    03039
9658    03039
9659    03039
9660    03039
9661    03039
Name: company_id, Length: 378, dtype: object


In [6]:
df = csv

In [7]:
import pandas as pd

# Step 1: Separate Value and YoY rows
df_value = df[df["Year/Type"].str.contains("Value", na=False)].copy()
df_yoy = df[df["Year/Type"].str.contains("YoY %", na=False)].copy()

# Step 2: Extract full date from "Year/Type" (e.g., '31 Mar 2024')
df_value["Fiscal Date"] = df_value["Year/Type"].str.extract(r"(\d{1,2} \w{3} \d{4})")
df_yoy["Fiscal Date"] = df_yoy["Year/Type"].str.extract(r"(\d{1,2} \w{3} \d{4})")

# Step 3: Merge on 'company_id' and 'Fiscal Date'
merged = pd.merge(
    df_value,
    df_yoy,
    on=["company_id", "Fiscal Date"],
    suffixes=("", "_yoy"),
    how="left"
)

# Drop helper columns
merged = merged.drop(columns=["Year/Type", "Year/Type_yoy"])
merged = merged.rename(columns={"Fiscal Date": "Fiscal Date"})

merged.head()

Unnamed: 0,company_id,Cash Receipts,Cash Payments,Net Income/Starting Line,Depreciation/Depletion,Depreciation,Amortization,Amortization of Acquisition Costs,Amortization of Intangibles,Deferred Taxes,...,Unearned Premiums_yoy,Insurance Reserves_yoy,"Investment Securities, Gains/Losses_yoy",Sale/Issuance of Common/Preferred_yoy,Loans Origination - Operating_yoy,Loan Loss Provision_yoy,"Loans, Gains/Losses_yoy",Foreclosed Real Estate_yoy,Deposits_yoy,Repurchase/Retirement of Preferred_yoy
0,5274,,,121.86,7.3,7.3,2.44,,2.44,,...,,,,,,,,,,
1,5274,,,61.43,8.53,8.53,2.31,,2.31,,...,,,,,,,,,,
2,5274,,,97.17,8.02,8.02,1.88,,1.88,,...,,,,,,,,,,
3,5274,,,177.35,7.63,7.63,1.53,,1.53,,...,,,,,,,,,,
4,5274,,,95.75,7.39,7.39,1.7,,1.7,,...,,,,,,,,,,


In [8]:
# Find all _yoy columns
yoy_cols = [col for col in merged.columns if col.endswith('_yoy')]

# Identify _yoy columns that do NOT contain any '%' values
cols_to_drop = [
    col for col in yoy_cols
    if not merged[col].astype(str).str.contains('%', na=False).any()
]

# Drop them
merged = merged.drop(columns=cols_to_drop)

print(f"✅ Dropped {len(cols_to_drop)} _yoy columns without '%' signs.")

✅ Dropped 13 _yoy columns without '%' signs.


In [9]:
len(merged.columns)

155

In [10]:
merged

Unnamed: 0,company_id,Cash Receipts,Cash Payments,Net Income/Starting Line,Depreciation/Depletion,Depreciation,Amortization,Amortization of Acquisition Costs,Amortization of Intangibles,Deferred Taxes,...,Cash Interest Paid_yoy,Cash Taxes Paid_yoy,Foreign Exchange Effects_yoy,Deferred Policy Acquisition Costs_yoy,Reinsurance Receivable_yoy,Reinsurance Payable_yoy,"Investment Securities, Gains/Losses_yoy",Loan Loss Provision_yoy,"Loans, Gains/Losses_yoy",Foreclosed Real Estate_yoy
0,5274,,,121.86,7.30,7.30,2.44,,2.44,,...,-24.26%,10.38%,,,,,,,,
1,5274,,,61.43,8.53,8.53,2.31,,2.31,,...,-59.45%,-26.52%,,,,,,,,
2,5274,,,97.17,8.02,8.02,1.88,,1.88,,...,-23.04%,-13.62%,,,,,,,,
3,5274,,,177.35,7.63,7.63,1.53,,1.53,,...,103.61%,-14.58%,,,,,,,,
4,5274,,,95.75,7.39,7.39,1.70,,1.70,,...,116.07%,67.81%,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4838,5292,,,18.78,21.27,21.27,,,,,...,-60.00%,-35.45%,-252.11%,,,,,,,
4839,5292,,,67.88,18.29,18.29,,,,,...,-16.67%,0.99%,0.00%,,,,,,,
4840,5292,,,139.74,15.58,15.58,,,,,...,-45.45%,70.85%,0.00%,,,,,,,
4841,5292,,,114.79,12.71,12.71,,,,,...,-63.33%,48.93%,,,,,,,,


In [None]:
bursa_registration = pd.read_csv("../matched_companies_from_ssm.csv",dtype={"company_id": str,"companyNo": str})
bursa_registration["companyNo"] = (
    bursa_registration["companyNo"]
    .astype(str)
    .str.replace(r"\.0$", "", regex=True)
    .str.strip()
)
bursa_registration.head()



Unnamed: 0,company_name,name_db,companyNo,oldCompanyNo,company_type
0,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,Company
1,99 SPEED MART RETAIL HOLDINGS BERHAD,99 SPEED MART RETAIL HOLDINGS BERHAD,202301017784.0,1511706-T,Company
2,A-RANK BERHAD,A-RANK ASSOCIATES SDN. BHD.,199701029773.0,445272-P,Company
3,ABLE GLOBAL BERHAD,ABLE GLOBAL (MM2H) SDN. BHD.,199501013751.0,342952-P,Company
4,ABLEGROUP BERHAD,ABLEGROUP BERHAD,200401015685.0,654188-H,Company


In [12]:
company_id = pd.read_csv("../../../list_bursa_ids/bursa_company_list.csv", dtype={"company_id": str})
company_id["company_id"] = company_id["company_id"].str.strip().str.zfill(4)


In [13]:
merge = pd.merge(bursa_registration,company_id,on="company_name",how="inner")
merge

Unnamed: 0,company_name,name_db,companyNo,oldCompanyNo,company_type,company_id
0,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,Company,5250
1,99 SPEED MART RETAIL HOLDINGS BERHAD,99 SPEED MART RETAIL HOLDINGS BERHAD,202301017784.0,1511706-T,Company,5326
2,A-RANK BERHAD,A-RANK ASSOCIATES SDN. BHD.,199701029773.0,445272-P,Company,7214
3,ABLE GLOBAL BERHAD,ABLE GLOBAL (MM2H) SDN. BHD.,199501013751.0,342952-P,Company,7167
4,ABLEGROUP BERHAD,ABLEGROUP BERHAD,200401015685.0,654188-H,Company,7086
...,...,...,...,...,...,...
1073,TOTAL DYNAMIC HOLDINGS BERHAD,TOTAL DYNAMIC HOLDINGS BERHAD,202201036425.0,1482122-A,Company,03058
1074,TP TEC HOLDING BERHAD,TP TEC HOLDING BERHAD,202401020280.0,1566129-V,Company,03061
1075,TSIC BERHAD,TSIC BERHAD,202301036647.0,1530570-U,Company,03062
1076,UCI RESOURCES BERHAD,UCI RESOURCES BERHAD,202001015323.0,1371643-X,Company,03043


In [14]:
len(merge["company_id"].drop_duplicates())

1078

In [15]:
merge_copy = merge.copy()

In [16]:
merge = merge.drop(columns="company_type")
merge = merge.rename(columns={
    "company_name": "company_name_bursa",
    "name_db": "company_name_api",
    "companyNo": "registration_number",
    "oldCompanyNo": "old_registration_number"
})
# merge.to_csv("bursa_company_registra.csv", index=False)

In [17]:
merge

Unnamed: 0,company_name_bursa,company_name_api,registration_number,old_registration_number,company_id
0,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250
1,99 SPEED MART RETAIL HOLDINGS BERHAD,99 SPEED MART RETAIL HOLDINGS BERHAD,202301017784.0,1511706-T,5326
2,A-RANK BERHAD,A-RANK ASSOCIATES SDN. BHD.,199701029773.0,445272-P,7214
3,ABLE GLOBAL BERHAD,ABLE GLOBAL (MM2H) SDN. BHD.,199501013751.0,342952-P,7167
4,ABLEGROUP BERHAD,ABLEGROUP BERHAD,200401015685.0,654188-H,7086
...,...,...,...,...,...
1073,TOTAL DYNAMIC HOLDINGS BERHAD,TOTAL DYNAMIC HOLDINGS BERHAD,202201036425.0,1482122-A,03058
1074,TP TEC HOLDING BERHAD,TP TEC HOLDING BERHAD,202401020280.0,1566129-V,03061
1075,TSIC BERHAD,TSIC BERHAD,202301036647.0,1530570-U,03062
1076,UCI RESOURCES BERHAD,UCI RESOURCES BERHAD,202001015323.0,1371643-X,03043


In [18]:
merged

Unnamed: 0,company_id,Cash Receipts,Cash Payments,Net Income/Starting Line,Depreciation/Depletion,Depreciation,Amortization,Amortization of Acquisition Costs,Amortization of Intangibles,Deferred Taxes,...,Cash Interest Paid_yoy,Cash Taxes Paid_yoy,Foreign Exchange Effects_yoy,Deferred Policy Acquisition Costs_yoy,Reinsurance Receivable_yoy,Reinsurance Payable_yoy,"Investment Securities, Gains/Losses_yoy",Loan Loss Provision_yoy,"Loans, Gains/Losses_yoy",Foreclosed Real Estate_yoy
0,5274,,,121.86,7.30,7.30,2.44,,2.44,,...,-24.26%,10.38%,,,,,,,,
1,5274,,,61.43,8.53,8.53,2.31,,2.31,,...,-59.45%,-26.52%,,,,,,,,
2,5274,,,97.17,8.02,8.02,1.88,,1.88,,...,-23.04%,-13.62%,,,,,,,,
3,5274,,,177.35,7.63,7.63,1.53,,1.53,,...,103.61%,-14.58%,,,,,,,,
4,5274,,,95.75,7.39,7.39,1.70,,1.70,,...,116.07%,67.81%,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4838,5292,,,18.78,21.27,21.27,,,,,...,-60.00%,-35.45%,-252.11%,,,,,,,
4839,5292,,,67.88,18.29,18.29,,,,,...,-16.67%,0.99%,0.00%,,,,,,,
4840,5292,,,139.74,15.58,15.58,,,,,...,-45.45%,70.85%,0.00%,,,,,,,
4841,5292,,,114.79,12.71,12.71,,,,,...,-63.33%,48.93%,,,,,,,,


In [19]:
merged["company_id"].nunique()

1033

In [20]:
merge["company_id"] = merge["company_id"].str.strip()
merged["company_id"] = merged["company_id"].str.strip()

In [21]:
merge["company_id"] = merge["company_id"].astype(str)
merged["company_id"] = merged["company_id"].astype(str)
final_merge = merge.merge(merged, on="company_id", how="inner")

In [22]:
final_merge['company_id'].nunique()

1033

In [23]:
(final_merge.isnull().mean() * 100).sort_values(ascending=False).to_frame("missing_%").style.background_gradient(cmap='Reds')

Unnamed: 0,missing_%
Insurance Reserves,100.0
Reported Cash from Operating Activities,100.0
Repurchase/Retirement of Preferred,100.0
Deposits,100.0
Loans Origination - Operating,100.0
Sale/Issuance of Common/Preferred,100.0
Unearned Premiums,100.0
Loss Adjustment,100.0
Accounting Change,100.0
Reported Cash from Financing Activities,100.0


In [24]:
df=final_merge

In [25]:
df

Unnamed: 0,company_name_bursa,company_name_api,registration_number,old_registration_number,company_id,Cash Receipts,Cash Payments,Net Income/Starting Line,Depreciation/Depletion,Depreciation,...,Cash Interest Paid_yoy,Cash Taxes Paid_yoy,Foreign Exchange Effects_yoy,Deferred Policy Acquisition Costs_yoy,Reinsurance Receivable_yoy,Reinsurance Payable_yoy,"Investment Securities, Gains/Losses_yoy",Loan Loss Provision_yoy,"Loans, Gains/Losses_yoy",Foreclosed Real Estate_yoy
0,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,3058.75,-2881.78,,,,...,16.71%,55.88%,,,,,,,,
1,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,4358.27,-3984.39,,,,...,-18.73%,-45.51%,,,,,,,,
2,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,4010.94,-3459.32,,,,...,-14.89%,-19.42%,,,,,,,,
3,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,3041.35,-2705.03,,,,...,-28.92%,-43.84%,,,,,,,,
4,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,2732.29,-2454.78,,,,...,-157.40%,-33.04%,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4838,UCI RESOURCES BERHAD,UCI RESOURCES BERHAD,202001015323.0,1371643-X,03043,,,2.26,1.22,1.22,...,-15.00%,-68.94%,,,,,,,,
4839,UNI WALL APS HOLDINGS BERHAD,UNI WALL APS HOLDINGS BERHAD,201801007506.0,1269520-X,03017,,,4.55,0.99,0.99,...,-44.66%,-81.67%,0.00%,,,,,,,
4840,UNI WALL APS HOLDINGS BERHAD,UNI WALL APS HOLDINGS BERHAD,201801007506.0,1269520-X,03017,,,-12.35,1.42,1.42,...,128.89%,-72.48%,0.00%,,,,,,,
4841,UNI WALL APS HOLDINGS BERHAD,UNI WALL APS HOLDINGS BERHAD,201801007506.0,1269520-X,03017,,,5.59,0.89,0.89,...,57.89%,938.10%,0.00%,,,,,,,


In [26]:
df["Fiscal Date"]

0       31 Dec 2024
1       31 Dec 2023
2       31 Dec 2022
3       31 Dec 2021
4       31 Dec 2020
           ...     
4838    31 Dec 2020
4839    30 Jun 2024
4840    30 Jun 2023
4841    31 Dec 2021
4842    31 Dec 2020
Name: Fiscal Date, Length: 4843, dtype: object

In [27]:
# ── Your DataFrame ──
df = df.copy()
df.columns = df.columns.str.replace("-", " ").str.strip().str.strip().str.lower().str.replace(" ", "_").str.replace("-", "_")
df

Unnamed: 0,company_name_bursa,company_name_api,registration_number,old_registration_number,company_id,cash_receipts,cash_payments,net_income/starting_line,depreciation/depletion,depreciation,...,cash_interest_paid_yoy,cash_taxes_paid_yoy,foreign_exchange_effects_yoy,deferred_policy_acquisition_costs_yoy,reinsurance_receivable_yoy,reinsurance_payable_yoy,"investment_securities,_gains/losses_yoy",loan_loss_provision_yoy,"loans,_gains/losses_yoy",foreclosed_real_estate_yoy
0,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,3058.75,-2881.78,,,,...,16.71%,55.88%,,,,,,,,
1,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,4358.27,-3984.39,,,,...,-18.73%,-45.51%,,,,,,,,
2,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,4010.94,-3459.32,,,,...,-14.89%,-19.42%,,,,,,,,
3,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,3041.35,-2705.03,,,,...,-28.92%,-43.84%,,,,,,,,
4,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,2732.29,-2454.78,,,,...,-157.40%,-33.04%,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4838,UCI RESOURCES BERHAD,UCI RESOURCES BERHAD,202001015323.0,1371643-X,03043,,,2.26,1.22,1.22,...,-15.00%,-68.94%,,,,,,,,
4839,UNI WALL APS HOLDINGS BERHAD,UNI WALL APS HOLDINGS BERHAD,201801007506.0,1269520-X,03017,,,4.55,0.99,0.99,...,-44.66%,-81.67%,0.00%,,,,,,,
4840,UNI WALL APS HOLDINGS BERHAD,UNI WALL APS HOLDINGS BERHAD,201801007506.0,1269520-X,03017,,,-12.35,1.42,1.42,...,128.89%,-72.48%,0.00%,,,,,,,
4841,UNI WALL APS HOLDINGS BERHAD,UNI WALL APS HOLDINGS BERHAD,201801007506.0,1269520-X,03017,,,5.59,0.89,0.89,...,57.89%,938.10%,0.00%,,,,,,,


In [28]:
df.columns

Index(['company_name_bursa', 'company_name_api', 'registration_number',
       'old_registration_number', 'company_id', 'cash_receipts',
       'cash_payments', 'net_income/starting_line', 'depreciation/depletion',
       'depreciation',
       ...
       'cash_interest_paid_yoy', 'cash_taxes_paid_yoy',
       'foreign_exchange_effects_yoy', 'deferred_policy_acquisition_costs_yoy',
       'reinsurance_receivable_yoy', 'reinsurance_payable_yoy',
       'investment_securities,_gains/losses_yoy', 'loan_loss_provision_yoy',
       'loans,_gains/losses_yoy', 'foreclosed_real_estate_yoy'],
      dtype='object', length=159)

In [29]:
import re
df.columns = (
    df.columns
    .str.replace("-", " ", regex=False)        # Replace hyphens with spaces
    .str.replace("/", " ", regex=False) 
    .str.strip()
    .str.lower()
    .str.replace(r"[^\w\s]", "", regex=True)   # Remove non-word characters like . , &
    .str.replace(r"\s+", "_", regex=True)      # Convert spaces to single underscore
    .str.replace(r"_+", "_", regex=True)       # Collapse multiple underscores to one
)

In [30]:
df

Unnamed: 0,company_name_bursa,company_name_api,registration_number,old_registration_number,company_id,cash_receipts,cash_payments,net_income_starting_line,depreciation_depletion,depreciation,...,cash_interest_paid_yoy,cash_taxes_paid_yoy,foreign_exchange_effects_yoy,deferred_policy_acquisition_costs_yoy,reinsurance_receivable_yoy,reinsurance_payable_yoy,investment_securities_gains_losses_yoy,loan_loss_provision_yoy,loans_gains_losses_yoy,foreclosed_real_estate_yoy
0,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,3058.75,-2881.78,,,,...,16.71%,55.88%,,,,,,,,
1,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,4358.27,-3984.39,,,,...,-18.73%,-45.51%,,,,,,,,
2,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,4010.94,-3459.32,,,,...,-14.89%,-19.42%,,,,,,,,
3,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,3041.35,-2705.03,,,,...,-28.92%,-43.84%,,,,,,,,
4,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,2732.29,-2454.78,,,,...,-157.40%,-33.04%,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4838,UCI RESOURCES BERHAD,UCI RESOURCES BERHAD,202001015323.0,1371643-X,03043,,,2.26,1.22,1.22,...,-15.00%,-68.94%,,,,,,,,
4839,UNI WALL APS HOLDINGS BERHAD,UNI WALL APS HOLDINGS BERHAD,201801007506.0,1269520-X,03017,,,4.55,0.99,0.99,...,-44.66%,-81.67%,0.00%,,,,,,,
4840,UNI WALL APS HOLDINGS BERHAD,UNI WALL APS HOLDINGS BERHAD,201801007506.0,1269520-X,03017,,,-12.35,1.42,1.42,...,128.89%,-72.48%,0.00%,,,,,,,
4841,UNI WALL APS HOLDINGS BERHAD,UNI WALL APS HOLDINGS BERHAD,201801007506.0,1269520-X,03017,,,5.59,0.89,0.89,...,57.89%,938.10%,0.00%,,,,,,,


In [31]:
# Check for duplicate column names
duplicates = df.columns[df.columns.duplicated()]
print("Duplicate columns:", duplicates.tolist())
df = df.loc[:, ~df.columns.duplicated()]

Duplicate columns: []


In [32]:
df.columns.tolist()

['company_name_bursa',
 'company_name_api',
 'registration_number',
 'old_registration_number',
 'company_id',
 'cash_receipts',
 'cash_payments',
 'net_income_starting_line',
 'depreciation_depletion',
 'depreciation',
 'amortization',
 'amortization_of_acquisition_costs',
 'amortization_of_intangibles',
 'deferred_taxes',
 'non_cash_items',
 'discontinued_operations',
 'equity_in_net_earnings_loss',
 'unusual_items',
 'other_non_cash_items',
 'changes_in_working_capital',
 'other_assets_liabilities_net',
 'other_assets',
 'other_liabilities',
 'accounts_receivable',
 'inventories',
 'accounts_payable',
 'payable_accrued',
 'other_operating_cash_flow',
 'cash_from_operating_activities',
 'capital_expenditures',
 'purchase_acquisition_of_intangibles',
 'purchase_of_fixed_assets',
 'other_investing_cash_flow_items_total',
 'sale_maturity_of_investment',
 'purchase_of_investments',
 'sale_of_intangible_assets',
 'sale_of_business',
 'investment_net',
 'acquisition_of_business',
 'sale_of

In [33]:
# Get null ratios sorted
null_ratios = df.isnull().mean().sort_values(ascending=False)

# Convert to DataFrame with formatted percentage
summary_df = pd.DataFrame({
    "column": null_ratios.index,
    "null_ratio": (null_ratios * 100).map("{:.2f}%".format)
})

# # Save to CSV
summary_df.to_csv("../../column_percentages/balance_null_ratio_summary.csv", index=False)
summary_df.head()

Unnamed: 0,column,null_ratio
insurance_reserves,insurance_reserves,100.00%
reported_cash_from_operating_activities,reported_cash_from_operating_activities,100.00%
repurchase_retirement_of_preferred,repurchase_retirement_of_preferred,100.00%
deposits,deposits,100.00%
loans_origination_operating,loans_origination_operating,100.00%


In [34]:
# desired_columns = [
#     "old_registration_number",
#     "other_current_assets_total",
#     "total_current_assets",
#     "other_current_liabilities_total",
#     "total_current_liabilities",
#     "other_long_term_liabilities",
#     "retained_earnings_accumulated_deficit",
#     "total_equity",
#     "fiscal_date",
#     "registration_number"
# ]
# df = df[[col for col in desired_columns if col in df.columns]]

In [35]:
df.isnull().sum()

company_name_bursa                           0
company_name_api                           121
registration_number                        121
old_registration_number                    121
company_id                                   0
                                          ... 
reinsurance_payable_yoy                   4830
investment_securities_gains_losses_yoy    4754
loan_loss_provision_yoy                   4790
loans_gains_losses_yoy                    4788
foreclosed_real_estate_yoy                4840
Length: 159, dtype: int64

In [36]:
df

Unnamed: 0,company_name_bursa,company_name_api,registration_number,old_registration_number,company_id,cash_receipts,cash_payments,net_income_starting_line,depreciation_depletion,depreciation,...,cash_interest_paid_yoy,cash_taxes_paid_yoy,foreign_exchange_effects_yoy,deferred_policy_acquisition_costs_yoy,reinsurance_receivable_yoy,reinsurance_payable_yoy,investment_securities_gains_losses_yoy,loan_loss_provision_yoy,loans_gains_losses_yoy,foreclosed_real_estate_yoy
0,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,3058.75,-2881.78,,,,...,16.71%,55.88%,,,,,,,,
1,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,4358.27,-3984.39,,,,...,-18.73%,-45.51%,,,,,,,,
2,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,4010.94,-3459.32,,,,...,-14.89%,-19.42%,,,,,,,,
3,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,3041.35,-2705.03,,,,...,-28.92%,-43.84%,,,,,,,,
4,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,2732.29,-2454.78,,,,...,-157.40%,-33.04%,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4838,UCI RESOURCES BERHAD,UCI RESOURCES BERHAD,202001015323.0,1371643-X,03043,,,2.26,1.22,1.22,...,-15.00%,-68.94%,,,,,,,,
4839,UNI WALL APS HOLDINGS BERHAD,UNI WALL APS HOLDINGS BERHAD,201801007506.0,1269520-X,03017,,,4.55,0.99,0.99,...,-44.66%,-81.67%,0.00%,,,,,,,
4840,UNI WALL APS HOLDINGS BERHAD,UNI WALL APS HOLDINGS BERHAD,201801007506.0,1269520-X,03017,,,-12.35,1.42,1.42,...,128.89%,-72.48%,0.00%,,,,,,,
4841,UNI WALL APS HOLDINGS BERHAD,UNI WALL APS HOLDINGS BERHAD,201801007506.0,1269520-X,03017,,,5.59,0.89,0.89,...,57.89%,938.10%,0.00%,,,,,,,


In [37]:
df = df.dropna(subset=["registration_number"])

In [38]:
df['registration_number'].nunique()

1007

In [39]:
df.isnull().sum()

company_name_bursa                           0
company_name_api                             0
registration_number                          0
old_registration_number                      0
company_id                                   0
                                          ... 
reinsurance_payable_yoy                   4709
investment_securities_gains_losses_yoy    4638
loan_loss_provision_yoy                   4672
loans_gains_losses_yoy                    4672
foreclosed_real_estate_yoy                4719
Length: 159, dtype: int64

In [40]:
df

Unnamed: 0,company_name_bursa,company_name_api,registration_number,old_registration_number,company_id,cash_receipts,cash_payments,net_income_starting_line,depreciation_depletion,depreciation,...,cash_interest_paid_yoy,cash_taxes_paid_yoy,foreign_exchange_effects_yoy,deferred_policy_acquisition_costs_yoy,reinsurance_receivable_yoy,reinsurance_payable_yoy,investment_securities_gains_losses_yoy,loan_loss_provision_yoy,loans_gains_losses_yoy,foreclosed_real_estate_yoy
0,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,3058.75,-2881.78,,,,...,16.71%,55.88%,,,,,,,,
1,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,4358.27,-3984.39,,,,...,-18.73%,-45.51%,,,,,,,,
2,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,4010.94,-3459.32,,,,...,-14.89%,-19.42%,,,,,,,,
3,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,3041.35,-2705.03,,,,...,-28.92%,-43.84%,,,,,,,,
4,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701.0,1058531-W,5250,2732.29,-2454.78,,,,...,-157.40%,-33.04%,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4838,UCI RESOURCES BERHAD,UCI RESOURCES BERHAD,202001015323.0,1371643-X,03043,,,2.26,1.22,1.22,...,-15.00%,-68.94%,,,,,,,,
4839,UNI WALL APS HOLDINGS BERHAD,UNI WALL APS HOLDINGS BERHAD,201801007506.0,1269520-X,03017,,,4.55,0.99,0.99,...,-44.66%,-81.67%,0.00%,,,,,,,
4840,UNI WALL APS HOLDINGS BERHAD,UNI WALL APS HOLDINGS BERHAD,201801007506.0,1269520-X,03017,,,-12.35,1.42,1.42,...,128.89%,-72.48%,0.00%,,,,,,,
4841,UNI WALL APS HOLDINGS BERHAD,UNI WALL APS HOLDINGS BERHAD,201801007506.0,1269520-X,03017,,,5.59,0.89,0.89,...,57.89%,938.10%,0.00%,,,,,,,


In [41]:
from sqlalchemy import create_engine
import pandas as pd
from dotenv import load_dotenv
import os

# ── Load environment variables ──
load_dotenv()

# ── Read DB credentials ──
user = os.getenv("PG_USER")
password = os.getenv("PG_PASSWORD")
host = os.getenv("PG_HOST")
port = os.getenv("PG_PORT")
database = os.getenv("PG_DATABASE")

# ── Create connection string ──
connection_url = f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(connection_url)

# ── Your DataFrame ──
df = df.copy()

# ── Push to DB ──
df.to_sql("public_complete_cash_flow", engine, schema="public", index=False, if_exists="replace")

print("✅ Uploaded to 'public_complete_cash_flow' with normalized column names.")

✅ Uploaded to 'public_complete_cash_flow' with normalized column names.
