In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
import pyarrow 

def get_data(tanggal, timeout=30):
    """Fetch the raw HTML price table for one date from the Siskaperbapo site.

    Args:
        tanggal: Value sent as the ``tanggal`` form field of the POST request.
        timeout: Seconds to wait for the server before giving up. ``requests``
            waits indefinitely when no timeout is given, so a stalled
            connection would otherwise block the calling thread forever.

    Returns:
        The response body as ``bytes`` when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, timeout, or connection error).
    """
    url = "https://siskaperbapo.jatimprov.go.id/harga/tabel.nodesign/"
    payload = {
        "tanggal": tanggal,
        "kabkota": "probolinggokab",
        "pasar": ""
    }

    try:
        response = requests.post(url, data=payload, timeout=timeout)
    except requests.RequestException as exc:
        # Treat transport failures like a bad status: report and return None
        # so a single bad day does not abort the whole scrape.
        print(f"Failed to fetch data for date: {tanggal} ({exc})")
        return None

    if response.status_code == 200:
        return response.content

    print(f"Failed to fetch data for date: {tanggal}")
    return None

def parse_data(html_content, date):
    """Extract header names and row values from one day's HTML price table.

    Args:
        html_content: HTML markup containing the price table.
        date: ``datetime``-like object; formatted as ``YYYY-MM-DD`` into a
            marker row placed before that day's data.

    Returns:
        A ``(headers, rows)`` tuple. ``headers`` is the stripped text of every
        ``<th>`` in the first table. ``rows`` starts with two synthetic rows
        (the date marker and the ``01 / BERAS`` category row) followed by one
        list of stripped ``<td>`` texts per ``<tr>`` after the first.

    Raises:
        ValueError: If the markup contains no ``<table>`` element.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table')
    date_label = date.strftime("%Y-%m-%d")

    if table is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on the next line.
        raise ValueError(f"No <table> found in response for date: {date_label}")

    headers = [th.text.strip() for th in table.find_all('th')]

    # Pad the synthetic rows to the header width instead of assuming 7 columns.
    padding = [""] * max(len(headers) - 2, 0)

    # NOTE(review): the first category row is hard-coded, presumably because
    # the [1:] slice below drops it -- confirm against the page markup.
    rows = [
        ["Tanggal", date_label] + padding,
        ["01", "BERAS"] + padding,
    ]

    for tr in table.find_all('tr')[1:]:
        rows.append([td.text.strip() for td in tr.find_all('td')])

    return headers, rows

def create_dataframe(headers, rows):
    """Build a DataFrame with ``headers`` as column labels and ``rows`` as records."""
    return pd.DataFrame(data=rows, columns=headers)

def process_date(date):
    """Download and parse the price table for a single day.

    Args:
        date: ``datetime``-like object identifying the day to fetch.

    Returns:
        A DataFrame built from the parsed table, or ``None`` when the
        download produced no content.
    """
    page = get_data(date.strftime("%Y-%m-%d"))
    if not page:
        return None

    column_names, table_rows = parse_data(page, date)
    return create_dataframe(column_names, table_rows)

# Inclusive date range to scrape.
start_date = datetime(2023, 4, 24)
end_date = datetime(2023, 5, 24)

day_count = (end_date - start_date).days + 1
dates = [start_date + timedelta(days=offset) for offset in range(day_count)]

# Fetch the days concurrently. Results are read back in submission order,
# so the combined table keeps the days in chronological order.
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_date, current_date) for current_date in dates]
    frames = [future.result() for future in futures]

# Days whose download failed come back as None and are skipped.
frames = [df for df in frames if df is not None]

# Concatenate once instead of growing the frame inside the loop, which
# re-copies all earlier rows on every iteration.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Use a 1-based row number named "No" as the index.
all_data.index += 1
all_data.index.name = "No"

start_date_formatted = start_date.strftime('%Y-%m-%d')
end_date_formatted = end_date.strftime('%Y-%m-%d')
print(f"Harga Rata-Rata Kabupaten Probolinggo di Tingkat Konsumen Tanggal {start_date_formatted} s/d {end_date_formatted}")
print("Pasar: Pasar Dringu, Pasar Leces, Pasar Semampir")
print()
print(all_data)

output_file = "hasil_probolinggo_24april-24mei.parquet"
all_data.to_parquet(output_file)


Harga Rata-Rata Kabupaten Probolinggo di Tingkat Konsumen Tanggal 2023-04-24 s/d 2023-05-24
Pasar: Pasar Dringu, Pasar Leces, Pasar Semampir

           NO           NAMA BAHAN POKOK SATUAN HARGA KEMARIN HARGA SEKARANG  \
No                                                                             
1     Tanggal                 2023-04-24                                       
2          01                      BERAS                                       
3                        - Beras Premium     kg        11.967         11.967   
4                         - Beras Medium     kg        10.567         10.567   
5          02                       GULA                                       
...       ...                        ...    ...           ...            ...   
2724             - Pupuk KCL Non Subsidi     Kg         7.800          7.800   
2725             - Pupuk NPK Non Subsidi     Kg        12.167         12.167   
2726           - Pupuk SP 35 Non Subsidi     Kg         5.

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor
import pyarrow 

def get_data(tanggal, timeout=30):
    """Fetch the raw HTML price table for one date from the Siskaperbapo site.

    Args:
        tanggal: Value sent as the ``tanggal`` form field of the POST request.
        timeout: Seconds to wait for the server before giving up. ``requests``
            waits indefinitely when no timeout is given, so a stalled
            connection would otherwise block the calling thread forever.

    Returns:
        The response body as ``bytes`` when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, timeout, or connection error).
    """
    url = "https://siskaperbapo.jatimprov.go.id/harga/tabel.nodesign/"
    payload = {
        "tanggal": tanggal,
        "kabkota": "probolinggokab",
        "pasar": ""
    }

    try:
        response = requests.post(url, data=payload, timeout=timeout)
    except requests.RequestException as exc:
        # Treat transport failures like a bad status: report and return None
        # so a single bad day does not abort the whole scrape.
        print(f"Failed to fetch data for date: {tanggal} ({exc})")
        return None

    if response.status_code == 200:
        return response.content

    print(f"Failed to fetch data for date: {tanggal}")
    return None

def parse_data(html_content, date):
    """Extract header names and row values from one day's HTML price table.

    Args:
        html_content: HTML markup containing the price table.
        date: ``datetime``-like object; formatted as ``YYYY-MM-DD`` into a
            marker row placed before that day's data.

    Returns:
        A ``(headers, rows)`` tuple. ``headers`` is the stripped text of every
        ``<th>`` in the first table. ``rows`` starts with two synthetic rows
        (the date marker and the ``01 / BERAS`` category row) followed by one
        list of stripped ``<td>`` texts per ``<tr>`` after the first.

    Raises:
        ValueError: If the markup contains no ``<table>`` element.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table')
    date_label = date.strftime("%Y-%m-%d")

    if table is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on the next line.
        raise ValueError(f"No <table> found in response for date: {date_label}")

    headers = [th.text.strip() for th in table.find_all('th')]

    # Pad the synthetic rows to the header width instead of assuming 7 columns.
    padding = [""] * max(len(headers) - 2, 0)

    # NOTE(review): the first category row is hard-coded, presumably because
    # the [1:] slice below drops it -- confirm against the page markup.
    rows = [
        ["Tanggal", date_label] + padding,
        ["01", "BERAS"] + padding,
    ]

    for tr in table.find_all('tr')[1:]:
        rows.append([td.text.strip() for td in tr.find_all('td')])

    return headers, rows

def create_dataframe(headers, rows):
    """Build a DataFrame with ``headers`` as column labels and ``rows`` as records."""
    return pd.DataFrame(data=rows, columns=headers)

def process_date(date):
    """Download and parse the price table for a single day.

    Args:
        date: ``datetime``-like object identifying the day to fetch.

    Returns:
        A DataFrame built from the parsed table, or ``None`` when the
        download produced no content.
    """
    page = get_data(date.strftime("%Y-%m-%d"))
    if not page:
        return None

    column_names, table_rows = parse_data(page, date)
    return create_dataframe(column_names, table_rows)

# Inclusive date range to scrape.
start_date = datetime(2023, 4, 24)
end_date = datetime(2023, 5, 24)

day_count = (end_date - start_date).days + 1
dates = [start_date + timedelta(days=offset) for offset in range(day_count)]

# Fetch the days concurrently. Results are read back in submission order,
# so the combined table keeps the days in chronological order.
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_date, current_date) for current_date in dates]
    frames = [future.result() for future in futures]

# Days whose download failed come back as None and are skipped.
frames = [df for df in frames if df is not None]

# Concatenate once instead of growing the frame inside the loop, which
# re-copies all earlier rows on every iteration.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Use a 1-based row number named "No" as the index.
all_data.index += 1
all_data.index.name = "No"

start_date_formatted = start_date.strftime('%Y-%m-%d')
end_date_formatted = end_date.strftime('%Y-%m-%d')
print(f"Harga Rata-Rata Kabupaten Probolinggo di Tingkat Konsumen Tanggal {start_date_formatted} s/d {end_date_formatted}")
print("Pasar: Pasar Dringu, Pasar Leces, Pasar Semampir")
print()
print(all_data)

output_file = "hasil_probolinggo_24april-24mei.parquet"
all_data.to_parquet(output_file)


KeyboardInterrupt: 