In [66]:
import pandas as pd
from pandas.errors import SettingWithCopyWarning
import warnings
# Silence pandas' SettingWithCopyWarning for every cell that runs after this one.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [67]:
# Load the balance-sheet export. company_id is read as text, and "None",
# "none", "NaN" and "-" count as missing on top of pandas' default markers.
csv = pd.read_csv(
    "../complete_balance_sheets.csv",
    dtype={"company_id": str},
    na_values=["None", "none", "NaN", "-"],
    keep_default_na=True,
    low_memory=False,
)
# Note: astype(str) turns a missing company_id into the literal string "nan".
company_id_text = csv["company_id"].astype(str)
csv["company_id"] = company_id_text

In [68]:
# df is another name for the loaded frame (same object as csv, not a copy).
df = csv
# Rows whose company_id is anything other than exactly four digits,
# collected for inspection.
has_four_digits = df["company_id"].str.fullmatch(r"\d{4}")
bad_ids = df.loc[~has_four_digits]

In [69]:
# Ensure company_id is string (a missing id becomes the literal "nan")
csv["company_id"] = csv["company_id"].astype(str)

# Detect bad IDs (not exactly 4 digits)
bad_mask = ~csv["company_id"].str.fullmatch(r"\d{4}")

# Only all-digit ids can have lost leading zeros, so only those are padded.
# Zero-filling every bad id would also rewrite non-numeric values, e.g. the
# "nan" placeholder would become the bogus id "0nan".
pad_mask = bad_mask & csv["company_id"].str.fullmatch(r"\d+")

# Fix only the numeric bad ones
csv.loc[pad_mask, "company_id"] = csv.loc[pad_mask, "company_id"].str.zfill(4)

In [70]:
# Point df at the frame whose ids were just padded (same object as csv, not a copy).
df = csv

In [71]:
import pandas as pd

# Step 1: Separate Value and YoY rows
df_value = df[df["Year/Type"].str.contains("Value", na=False)].copy()
df_yoy = df[df["Year/Type"].str.contains("YoY %", na=False)].copy()

# Step 2: Extract full date from "Year/Type" (e.g., '31 Mar 2024')
# expand=False returns a Series, so the assignment is a plain column write.
date_pattern = r"(\d{1,2} \w{3} \d{4})"
df_value["Fiscal Date"] = df_value["Year/Type"].str.extract(date_pattern, expand=False)
df_yoy["Fiscal Date"] = df_yoy["Year/Type"].str.extract(date_pattern, expand=False)

# pandas matches null merge keys against each other, so a YoY row without a
# parsable date would be attached to every undated Value row of the same
# company. Undated YoY rows cannot be aligned to a period, so drop them;
# the left join still keeps every Value row.
df_yoy = df_yoy.dropna(subset=["Fiscal Date"])

# Step 3: Merge on 'company_id' and 'Fiscal Date'
merged = pd.merge(
    df_value,
    df_yoy,
    on=["company_id", "Fiscal Date"],
    suffixes=("", "_yoy"),
    how="left"
)

# Drop helper columns
merged = merged.drop(columns=["Year/Type", "Year/Type_yoy"])

In [72]:
# Every column that came from the YoY side of the merge.
yoy_cols = [name for name in merged.columns if name.endswith('_yoy')]

# Collect the _yoy columns in which no cell, rendered as text, contains '%'.
cols_to_drop = []
for name in yoy_cols:
    as_text = merged[name].astype(str)
    has_percent = as_text.str.contains('%', na=False).any()
    if not has_percent:
        cols_to_drop.append(name)

# Remove those columns from the merged frame.
merged = merged.drop(columns=cols_to_drop)

In [73]:
# Load the SSM-matched company list with the id columns read as text.
bursa_registration = pd.read_csv(
    "../matched_companies_from_ssm.csv",
    dtype={"company_id": str, "companyNo": str},
)

# Normalise companyNo: cast to text, remove a trailing ".0", then trim whitespace.
company_no = bursa_registration["companyNo"].astype(str)
company_no = company_no.str.replace(r"\.0$", "", regex=True)
bursa_registration["companyNo"] = company_no.str.strip()

bursa_registration.head()


Unnamed: 0,company_name,name_db,companyNo,oldCompanyNo,company_type,match_score
0,7-ELEVEN MALAYSIA HOLDINGS BERHAD,7-ELEVEN MALAYSIA HOLDINGS BERHAD,201301028701,1058531-W,Company,100.0
1,99 SPEED MART RETAIL HOLDINGS BERHAD,99 SPEED MART RETAIL HOLDINGS BERHAD,202301017784,1511706-T,Company,100.0
2,A-RANK BERHAD,A-RANK BERHAD,200301031200,633621-X,Company,100.0
3,ABLE GLOBAL BERHAD,ABLE GLOBAL BERHAD,200001029963,532570-V,Company,100.0
4,ABLEGROUP BERHAD,ABLEGROUP BERHAD,200401015685,654188-H,Company,100.0


In [74]:
# Load the Bursa company list; ids are read as text, trimmed and
# left-padded with zeros to at least four characters.
company_id = pd.read_csv(
    "../../../list_bursa_ids/bursa_company_list.csv",
    dtype={"company_id": str},
)
trimmed_ids = company_id["company_id"].str.strip()
company_id["company_id"] = trimmed_ids.str.zfill(4)


In [75]:
# Keep only companies present in both tables, matched on company_name.
merge = bursa_registration.merge(company_id, how="inner", on="company_name")

In [76]:
# Drop company_type and give the remaining SSM columns their final names.
renamed_columns = {
    "company_name": "company_name_bursa",
    "name_db": "company_name_api",
    "companyNo": "registration_number",
    "oldCompanyNo": "old_registration_number",
}
merge = merge.drop(columns="company_type").rename(columns=renamed_columns)

In [77]:
# Trim surrounding whitespace from the join key on both frames.
for frame in (merge, merged):
    frame["company_id"] = frame["company_id"].str.strip()

In [78]:
# Cast the join key to text on both sides, then keep only the companies
# that appear in both the registration table and the balance-sheet table.
merge["company_id"], merged["company_id"] = (
    merge["company_id"].astype(str),
    merged["company_id"].astype(str),
)
final_merge = pd.merge(merge, merged, how="inner", on="company_id")

In [79]:
# From here on df is the joined registration + balance-sheet frame.
# This binds a second name to final_merge; it does not copy it.
df=final_merge

In [80]:
## Column Name cleanup

# First pass: hyphens become spaces, outer whitespace is trimmed, names are
# lower-cased and each remaining space becomes an underscore.
first_pass = (
    df.columns
    .str.replace("-", " ")
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

# Second pass: slashes become spaces, then names are trimmed and lower-cased.
slash_free = first_pass.str.replace("/", " ", regex=False).str.strip().str.lower()

# Finally remove punctuation (anything that is neither a word character nor
# whitespace), turn whitespace runs into one underscore and collapse
# repeated underscores.
df.columns = (
    slash_free
    .str.replace(r"[^\w\s]", "", regex=True)
    .str.replace(r"\s+", "_", regex=True)
    .str.replace(r"_+", "_", regex=True)
)

In [81]:
# Normalize columns again for names like investments_
# Removing punctuation can leave an underscore dangling at either end of a
# name (for example when a name ended in a space followed by punctuation).
# Underscores are word characters, so the regex steps alone never remove
# them; collapse repeats and strip them from the edges explicitly.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(r"[^\w\s]", "", regex=True)
    .str.replace(r"\s+", "_", regex=True)
    .str.replace(r"_+", "_", regex=True)
    .str.strip("_")
)


In [82]:
# Report column names that occur more than once, then keep only the first
# occurrence of each name.
repeated = df.columns.duplicated()
duplicates = df.columns[repeated]
if len(duplicates) > 0:
    print("Duplicate columns:", duplicates.tolist())
df = df.loc[:, ~repeated]

Duplicate columns: ['intangibles_net', 'goodwill_net', 'other_current_liabilities_total', 'intangibles_net_yoy', 'goodwill_net_yoy']


In [83]:
# Remove every *_yoy column so the null ratios below cover value columns only.
yoy_columns = df.columns[df.columns.str.endswith('_yoy')].tolist()
df = df.drop(columns=yoy_columns)

In [84]:
# Share of missing values per column across the whole frame, highest first.
null_ratios = df.isna().mean().sort_values(ascending=False)

# Two-column summary: column name and its missing share as a percentage string.
formatted_ratios = (null_ratios * 100).map(lambda pct: "{:.2f}%".format(pct))
summary_df = pd.DataFrame(
    {"column": null_ratios.index, "null_ratio": formatted_ratios}
)

# Write the summary to disk without the index.
summary_df.to_csv("../../column_percentages/balance_null_ratio_summary.csv", index=False)

summary_df.head()

Unnamed: 0,column,null_ratio
other_islamic_deposits,other_islamic_deposits,100.00%
long_term_debt_maturing_in_year_10,long_term_debt_maturing_in_year_10,100.00%
reported_return_on_equity,reported_return_on_equity,100.00%
islamic_receivables,islamic_receivables,100.00%
islamic_section_supplemental,islamic_section_supplemental,100.00%


In [85]:
# Columns that feed the total_assets calculation. Each one is coerced to
# numeric below; the mapping is name -> same name.
column_map = {
    name: name
    for name in (
        "total_current_assets",
        "property_plant_equipment_total_net",
        "intangibles_net",
        "long_term_investments",
        "note_receivable_long_term",
        "other_long_term_assets",
    )
}
cols = [
    "total_current_assets",
    "long_term_investments",
    "note_receivable_long_term",
    "other_long_term_assets",
    "intangibles_net",
]

for col in column_map.values():
    if col in df.columns:
        # Strip thousands separators, parse as numbers, and treat anything
        # unparsable or missing as 0.
        without_commas = df[col].astype(str).str.replace(",", "")
        df[col] = pd.to_numeric(without_commas, errors="coerce").fillna(0)
        continue
    # Column absent from the data: report it and create it filled with 0.
    print(f"❌ Missing column: {col}")
    df[col] = 0

# Registration numbers are stored as text.
for id_column in ("registration_number", "old_registration_number"):
    df[id_column] = df[id_column].astype(str)


In [86]:
# ------------- Total Asset Calculation Bursa

# Pick the source for the "other long term assets" component of the fallback
# total. Prefer "other_long_term_assets_total" when the frame has it;
# otherwise keep whatever "other_long_term_assets" already holds, and only
# default to 0 when neither column exists. Overwriting an existing column
# with 0 would silently understate the computed total. The values are
# converted to numbers in the component loop below.
if "other_long_term_assets_total" in df.columns:
    df["other_long_term_assets"] = df["other_long_term_assets_total"]
elif "other_long_term_assets" not in df.columns:
    df["other_long_term_assets"] = 0

# Columns kept in the final output. Names that are absent from df are
# skipped by the filter at the end of this cell.
# NOTE(review): nothing in this cell creates "fiscal_year" - confirm whether
# it is expected to come from the source data.
final_columns = [
    "registration_number",
    "old_registration_number",
    "company_id",
    "fiscal_date",
    "fiscal_year",
    "total_liabilities",
    "retained_earnings_accumulated_deficit",
    "total_equity",
    "total_assets"  # scraped value, filled from the components only when missing
]

# 1. Asset components summed to build the fallback total_assets
asset_components = [
    "total_current_assets",
    "long_term_investments",
    "note_receivable_long_term",
    "other_long_term_assets",
    "intangibles_net",
    "property_plant_equipment_total_net",
    "goodwill_net"
]

# 2. Ensure all component columns exist and are numeric
for col in asset_components:
    if col not in df.columns:
        df[col] = 0  # fallback if missing
    # Remove thousands separators and padding; unparsable or missing -> 0
    df[col] = (
        df[col]
        .astype(str)
        .str.replace(",", "", regex=False)
        .str.strip()
        .pipe(pd.to_numeric, errors="coerce")
        .fillna(0)
    )

# 3. Compute total_assets from components (fallback only)
computed_total_assets = df[asset_components].sum(axis=1)

# 4. Clean scraped total_assets; unparsable values become NaN so that the
#    fallback can fill them
# in the 15 companies, only 2 had these the rest were computed but accurate to bursa
df["total_assets"] = (
    df["total_assets"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .pipe(pd.to_numeric, errors="coerce")
)

# 5. Only fill in missing scraped total_assets
df["total_assets"] = df["total_assets"].fillna(computed_total_assets)

# Convert fiscal_date to datetime for sorting (unparsable dates become NaT)
df["fiscal_date"] = pd.to_datetime(df["fiscal_date"], format="%d %b %Y", errors="coerce")

# Sort descending by fiscal_date within each company_id
df = df.sort_values(by=["company_id", "fiscal_date"], ascending=[True, False])

# Convert back to string format expected by DB
df["fiscal_date"] = df["fiscal_date"].dt.strftime("%d %b %Y")

# Filter only those that exist in df
df = df[[col for col in final_columns if col in df.columns]]

In [87]:
# Share of missing values per remaining column, highest first.
null_ratios = df.isna().mean().sort_values(ascending=False)

# Two-column summary: column name and its missing share as a percentage string.
formatted_ratios = (null_ratios * 100).map(lambda pct: "{:.2f}%".format(pct))
summary_df = pd.DataFrame(
    {"column": null_ratios.index, "null_ratio": formatted_ratios}
)

# Write the summary to disk without the index.
summary_df.to_csv("../../column_percentages/bal_portal_null_ratio_summary.csv", index=False)
summary_df.head()

Unnamed: 0,column,null_ratio
total_liabilities,total_liabilities,0.39%
retained_earnings_accumulated_deficit,retained_earnings_accumulated_deficit,0.39%
total_equity,total_equity,0.29%
registration_number,registration_number,0.00%
old_registration_number,old_registration_number,0.00%


In [88]:
# Worked example: how total assets is built from its components
#
#      8,626.02   Current Assets
#   +    792.50   PPE Net
#   +     15.48   Intangibles
#   + 17,812.08   LT Investments
#   +    263.29   Other LT Assets
#   = 27,509.37
#   +     94.43   Note Receivable - Long Term
#   = 27,603.81   Total Assets (the components above sum to 27,603.80;
#                 the 0.01 gap is rounding in the displayed figures)
#
# So the columns needed are:
#   - total_current_assets
#   - property_plant_equipment_total_net
#   - intangibles_net
#   - long_term_investments
#   - other_long_term_assets
#   - note_receivable_long_term