In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import concurrent.futures
import pyarrow as pa
import pyarrow.parquet as pq

# URL for the request
url = 'https://siskaperbapo.jatimprov.go.id/harga/tabel.nodesign/'

# Function to format date
def format_date(date):
    return date.strftime("%Y-%m-%d")

# Function to make POST request and extract data
def get_data_for_date(date):
    payload = {
        'tanggal': format_date(date),
        'kabkota': 'surabayakota',
        'pasar': ''
    }
    response = requests.post(url, data=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table')
    if table:
        data = []
        no = 1
        for row in table.find_all('tr'):
            cols = row.find_all('td')
            cols = [col.text.strip() for col in cols]
            if cols:
                if len(cols) < 7:  # If the row doesn't have enough columns, add empty values
                    cols = [''] * (7 - len(cols)) + cols
                cols[0] = str(no)  # Set the value for the 'NO' column
                no += 1
                data.append(cols)
        return data
    else:
        return []

# Start and end dates for the range
start_date = datetime(2023, 4, 24)
end_date = datetime(2024, 4, 24)

# Data container
all_data = []

# Function to fetch data for a single date and append to all_data
def fetch_data_for_date(date):
    data_for_date = get_data_for_date(date)
    if data_for_date:
        all_data.extend(data_for_date)

# Use ThreadPoolExecutor to perform requests in parallel
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit tasks for each date in the range
    date_range = [start_date + timedelta(days=i) for i in range((end_date - start_date).days + 1)]
    executor.map(fetch_data_for_date, date_range)

# Convert data into DataFrame, excluding the title row
df = pd.DataFrame(all_data[1:], columns=all_data[0])

# Add header manually
header = ['NO', 'NAMA BAHAN POKOK', 'SATUAN', 'HARGA KEMARIN', 'HARGA SEKARANG', 'PERUBAHAN (Rp)', 'PERUBAHAN (%)']
df.columns = header

# Save DataFrame as Parquet
parquet_file = 'harga_surabaya.parquet'
table = pa.Table.from_pandas(df)
pq.write_table(table, parquet_file)

# Add header for the table
table_header = "Harga Rata-Rata Kota Suarabaya di Tingkat Konsumen Tanggal 24 April 2023- 24 April 2024\n"
table_header += "NO\tNAMA BAHAN POKOK\tSATUAN\tHARGA KEMARIN\tHARGA SEKARANG\tPERUBAHAN (Rp)\tPERUBAHAN (%)\n"
print(table_header)

# Display the DataFrame
print(df.to_string(header=False, index=False, justify='left'))

print("\nJumlah baris yang berhasil ditarik:", len(df))
print("Jumlah kolom yang berhasil ditarik:", len(df.columns))


Harga Rata-Rata Kota Suarabaya di Tingkat Konsumen Tanggal 24 April 2023- 24 April 2024
NO	NAMA BAHAN POKOK	SATUAN	HARGA KEMARIN	HARGA SEKARANG	PERUBAHAN (Rp)	PERUBAHAN (%)

 2                      - Beras Premium         kg  13.208  13.375     167   1,26%
 3                       - Beras Medium         kg  11.617  11.617       0   0,00%
 4                                 GULA                                           
 5                 - Gula Kristal Putih         kg  13.167  13.083     -83  -0,63%
 6                        MINYAK GORENG                                           
 7                - Minyak Goreng Curah         kg  16.000  15.833    -167  -1,04%
 8      - Minyak Goreng Kemasan Premium    1 liter  19.250  19.667     417   2,16%
 9    - Minyak Goreng Kemasan Sederhana    1 Liter  15.000  15.000       0   0,00%
10            - Minyak Goreng MINYAKITA    1 Liter  14.000  14.000       0   0,00%
11                               DAGING                                        