In [1]:
# !pip install requests beautifulsoup4 pandas

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_year(year, timeout=30):
    """Scrape the MPOB daily local-delivered CPO price table for one year.

    Downloads the yearly page, finds the header row (the first row whose
    text contains "Date") to obtain the month labels, then walks every
    13-cell row whose first cell is a day number and emits one record per
    populated month cell.

    Args:
        year: Value placed in the ``tahun`` query parameter and stored in
            the ``year`` column of every record.
        timeout: Seconds passed to ``requests.get`` before giving up.

    Returns:
        pandas.DataFrame with columns ``year``, ``month``, ``day`` and
        ``price``. ``day`` is a zero-padded string and ``price`` is the raw
        cell text with thousands separators and ``*`` markers removed (it
        may therefore be a non-numeric code such as "PH" or "NT"). The
        frame is empty, but still has these columns, when no cell is
        populated.

    Raises:
        requests.HTTPError: The server answered with an error status.
        ValueError: The page has no ``<tbody>`` or no month header row.
    """
    url = f"https://bepi.mpob.gov.my/admin2/price_local_daily_view_cpo_msia.php?more=Y&jenis=1Y&tahun={year}"
    # A timeout keeps a stalled connection from hanging the cell forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors rather than parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    table = soup.find("tbody")
    if table is None:
        raise ValueError(f"Price table not found for year {year}")
    rows = table.find_all("tr")

    # Extract month headers: first row that has cells and mentions "Date".
    header_row = next(
        (row for row in rows if row.find_all("td") and "Date" in row.get_text()),
        None,
    )
    if header_row is None:
        raise ValueError(f"Month headers not found for year {year}")

    months = [td.get_text(strip=True) for td in header_row.find_all("td")[1:]]

    data = []
    for row in rows:
        cols = row.find_all("td")
        # Data rows have 13 cells: the day number, then one cell per header column.
        if len(cols) != 13:
            continue
        day_text = cols[0].get_text(strip=True)
        if not day_text.isdigit():
            continue
        day = day_text.zfill(2)
        for i, value_td in enumerate(cols[1:]):
            # Remove thousands separators and any "*" characters. The page
            # legend lists "**" as the marker for a price as at 8.30 AM;
            # presumably it is attached to the number, which would make the
            # later float() conversion fail - TODO confirm against a live page.
            value = (
                value_td.get_text(strip=True)
                .replace(",", "")
                .replace("*", "")
                .strip()
            )
            if value not in ['-', '']:
                data.append({
                    "year": year,
                    "month": months[i],
                    "day": day,
                    "price": value
                })

    # Explicit columns keep the schema stable even when nothing was scraped.
    return pd.DataFrame(data, columns=["year", "month", "day", "price"])

# Scrape every year of interest and stack the yearly frames into one table.
years = [2023, 2024, 2025]
all_data = pd.concat(
    [scrape_year(target_year) for target_year in years],
    ignore_index=True,
)

# Build a real date from day / month / year, drop combinations that do not
# parse (they become NaT under errors="coerce"), keep only date and price in
# chronological order, and turn every price except the "PH" / "NT" codes
# into a float.
all_data = (
    all_data
    .assign(
        date=lambda frame: pd.to_datetime(
            frame["day"] + " " + frame["month"] + " " + frame["year"].astype(str),
            errors="coerce",
            dayfirst=True,
        )
    )
    .dropna(subset=["date"])
    .loc[:, ["date", "price"]]
    .sort_values("date")
    .assign(
        price=lambda frame: frame["price"].apply(
            lambda raw: raw if raw in ["PH", "NT"] else float(raw)
        )
    )
)

# Persist the cleaned table without the positional index.
all_data.to_csv("cpo_daily_prices.csv", index=False)

print("✅ Scraping complete. Data saved to 'cpo_daily_prices.csv'.")


✅ Scraping complete. Data saved to 'cpo_daily_prices.csv'.


Crude Palm Oil (Local Delivered)<br>
Minyak Sawit Mentah (Hantaran Tempatan)

Note : <br>
     1) All prices are weighted average / Semua harga dalam purata wajaran <br>
     2) Prices are revised at 8.30 am & 4.30 pm / Harga akan dikemaskini pada jam 8.30 pagi & 4.30 petang. <br>

Legend : <br>
         ** - Price as at 8.30 AM / Harga adalah sehingga 8.30 pagi.<br>
         NT - No Trade / Tiada Urusniaga<br>
         PH - Public Holiday / Cuti Am<br>
<br>
Last update : 1/08/2025   4.30 pm


In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import os

def scrape_today(year, timeout=30):
    """Scrape the MPOB daily CPO price table for ``year`` as dated prices.

    Despite the name, every populated day of the requested year is
    returned, not just the current day.

    Args:
        year: Value placed in the ``tahun`` query parameter and used as the
            year component of every date.
        timeout: Seconds passed to ``requests.get`` before giving up.

    Returns:
        pandas.DataFrame with columns ``date`` (datetime64) and ``price``.
        ``price`` is a float, except for the codes "PH" and "NT", which are
        kept as strings. Day/month combinations that do not parse as a date
        are dropped. The frame is empty when no cell is populated.

    Raises:
        requests.HTTPError: The server answered with an error status.
        ValueError: The page has no ``<tbody>`` or no month header row, or
            a cell holds text that is neither "PH"/"NT" nor a number.
    """
    url = f"https://bepi.mpob.gov.my/admin2/price_local_daily_view_cpo_msia.php?more=Y&jenis=1Y&tahun={year}"
    # A timeout keeps a stalled connection from hanging the cell forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors rather than parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    table = soup.find("tbody")
    if table is None:
        raise ValueError(f"Price table not found for year {year}")
    rows = table.find_all("tr")

    # Month labels come from the first row that has cells and mentions "Date".
    header_row = next(
        (row for row in rows if row.find_all("td") and "Date" in row.get_text()),
        None,
    )
    if header_row is None:
        # Without month labels every months[i] lookup below would fail with
        # an unhelpful IndexError, so report the real problem instead.
        raise ValueError(f"Month headers not found for year {year}")
    months = [td.get_text(strip=True) for td in header_row.find_all("td")[1:]]

    data = []
    for row in rows:
        cols = row.find_all("td")
        # Data rows have 13 cells: the day number, then one cell per header column.
        if len(cols) != 13:
            continue
        day_text = cols[0].get_text(strip=True)
        if not day_text.isdigit():
            continue
        day = day_text.zfill(2)
        for i, value_td in enumerate(cols[1:]):
            # Remove thousands separators and any "*" characters. The page
            # legend lists "**" as the marker for a price as at 8.30 AM;
            # presumably it is attached to the number, which would make
            # float() fail below - TODO confirm against a live page.
            value = (
                value_td.get_text(strip=True)
                .replace(",", "")
                .replace("*", "")
                .strip()
            )
            if value not in ['-', '']:
                data.append({
                    "year": year,
                    "month": months[i],
                    "day": day,
                    "price": value
                })

    # Explicit columns keep the column lookups below valid for an empty scrape.
    df = pd.DataFrame(data, columns=["year", "month", "day", "price"])
    df = (
        df.assign(
            date=pd.to_datetime(
                df["day"] + " " + df["month"] + " " + df["year"].astype(str),
                errors='coerce',
                dayfirst=True,
            )
        )
        # Unparseable day/month combinations became NaT; discard them.
        .dropna(subset=["date"])
        .assign(
            price=lambda frame: frame["price"].apply(
                lambda x: x if x in ["PH", "NT"] else float(x)
            )
        )
    )
    return df[["date", "price"]]

def prices_are_equal(price1, price2):
    """Return True when two price values represent the same price.

    Prices may arrive as floats (freshly scraped) or as strings (read back
    from a CSV column that also holds codes such as "PH" / "NT"). When both
    values parse as numbers they are compared numerically, so "4126",
    "4126.00" and 4126.0 all match. Otherwise the values are compared by
    their string form.

    Args:
        price1: First price; a number, a numeric string or a code string.
        price2: Second price, same kinds of value as ``price1``.

    Returns:
        bool: True if the prices are considered equal, False otherwise.
    """
    def _as_number(price):
        # None for anything float() cannot parse (codes, None, ...).
        try:
            return float(price)
        except (TypeError, ValueError):
            return None

    number1 = _as_number(price1)
    number2 = _as_number(price2)

    if number1 is not None and number2 is not None:
        # NaN never equals itself; treat two missing prices as equal so a
        # blank cell is not reported as a change on every run.
        if number1 != number1 and number2 != number2:
            return True
        return number1 == number2

    # At least one side is non-numeric: fall back to comparing the text.
    return str(price1) == str(price2)

def update_csv():
    """Merge this year's scraped prices into ``cpo_daily_prices.csv``.

    Does nothing when the local hour is before 10. Otherwise it scrapes the
    current year, overwrites stored prices that differ from the scraped
    ones, appends dates that are not stored yet, rewrites the CSV sorted by
    date, and prints one line per changed or added date.

    Returns:
        None.
    """
    # Sample the clock once so the hour check and the year agree.
    now = datetime.now()
    if now.hour < 10:
        return  # Run only after 10 AM

    df_scraped = scrape_today(now.year).set_index("date")
    csv_file = "cpo_daily_prices.csv"

    if os.path.exists(csv_file):
        df_existing = pd.read_csv(csv_file, parse_dates=["date"])
    else:
        df_existing = pd.DataFrame(columns=["date", "price"])

    df_existing = df_existing.set_index("date")
    # A CSV holding only numbers is read as float64; storing a "PH" / "NT"
    # code into such a column relies on implicit upcasting, which pandas
    # has deprecated. An object column accepts both floats and codes.
    df_existing["price"] = df_existing["price"].astype(object)

    changes = []

    # Detect updates (date exists and price changed)
    common_dates = df_scraped.index.intersection(df_existing.index)
    for date in common_dates:
        new_price = df_scraped.loc[date, "price"]
        old_price = df_existing.loc[date, "price"]
        # Only record and store the value when it really differs.
        if not prices_are_equal(old_price, new_price):
            df_existing.loc[date, "price"] = new_price
            changes.append((date.date(), old_price, new_price))

    # Detect new additions
    new_dates = df_scraped.index.difference(df_existing.index)
    for date in new_dates:
        new_price = df_scraped.loc[date, "price"]
        df_existing.loc[date] = new_price
        # None as the old price marks a newly added date.
        changes.append((date.date(), None, new_price))

    # Save updated file
    df_existing.sort_index().reset_index().to_csv(csv_file, index=False)

    # Print only actual changes
    for date, old, new in changes:
        print(f"{date} updated: {old} → {new}")

# Run the incremental update. update_csv() returns without doing anything
# when the local hour is before 10.
update_csv()

2025-07-15 updated: 126.0 → 4126.0
