## CRAWLING DATA HARGA KONSUMEN KABUPATEN PACITAN
> Penarikan data Harga Konsumen (Area) pada 1 tahun terakhir menggunakan metode MultiThread

Data di ambil dari  ```https://siskaperbapo.jatimprov.go.id/harga/tabel.nodesign/``` <br>

Penarikan data menggunakan Method POST<br>
dengan menggunakan Payload/data 
```
{
    tanggal: 2024-04-30
    kabkota: pacitankab
    pasar: 
}
```



In [1]:
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse
import requests
import pandas as pd
from datetime import datetime, timedelta
import pyarrow as pa
import pyarrow.parquet as pq

class MultiThreadedCrawler:
    def __init__(self, seed_url):
        self.seed_url = seed_url
        self.root_url = '{}://{}'.format(urlparse(self.seed_url).scheme, urlparse(self.seed_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=5)
        self.scraped_pages = set([])
        self.crawl_queue = Queue()
        self.data = []  # Initialize an empty list to store scraped data

    def parse_links(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        Anchor_Tags = soup.find_all('a', href=True)
        for link in Anchor_Tags:
            url = link['href']
            try:
                res = requests.get(url, timeout=(3, 30))
                return res
            except requests.RequestException:
                return

    def scrape_page(self, url, tanggal):
        try:
            res = requests.post(urljoin(self.seed_url, url), data={'tanggal': tanggal, 'kabkota': 'pacitankab', 'pasar': ''})
            return res, tanggal
        except requests.RequestException as e:
            print("Request failed:", e)
            return None, tanggal

    def post_scrape_callback(self, res, tanggal):
        try:
            if res:
                soup = BeautifulSoup(res.text, 'html.parser')
                table = soup.find('table', {'class': 'table table-bordered table-hover table-condensed'})
                if table:
                    header = [th.text.strip() for th in table.find_all('th')]
                    data = []
                    for row in table.find_all('tr')[1:]:
                        cells = [td.text.strip() for td in row.find_all('td')]
                        if cells:
                            cells.append(tanggal)  # Tambahkan tanggal ke setiap baris data
                            data.append(cells)
                    df = pd.DataFrame(data, columns=header + ['Tanggal'])
                    df['NO'] = df['NO'].ffill()   # Fill empty 'NO' cells with previous non-empty value
                    self.data.append(df)  # Append the DataFrame to the list
        except Exception as e:
            print("Error during callback:", e)

    def run_web_crawler(self):
        today = datetime.now()
        for i in range(365):
            tanggal = (today - timedelta(days=i)).strftime('%Y-%m-%d')  # Perulangan mundur
            target_url = '/harga/tabel.nodesign/'
            try:
                res, tanggal = self.scrape_page(target_url, tanggal)
                self.post_scrape_callback(res, tanggal)
            except requests.RequestException as e:
                print("Request failed:", e)

    def display_data(self):
        if self.data:
            today = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(f"Harga Rata-Rata Kabupaten Pacitan di Tingkat Konsumen")
            print("Pasar : Pasar Minulyo, Pasar Arjowinangun, Pasar Arjosari")
            for df in self.data:
                print(f"Tanggal: {df['Tanggal'].iloc[0]}")
                print(df.drop(columns=['Tanggal']).to_string(index=False, justify='center', col_space=2, line_width=120))
                print("\n---\n")  # Pisahkan setiap hasil dengan tulisan ---
            # Simpan data ke file Parquet
            self.save_to_parquet()

    def save_to_parquet(self):
        if self.data:
            # Gabungkan semua data dalam satu DataFrame
            concatenated_df = pd.concat(self.data, ignore_index=True)
            # Simpan DataFrame ke file Parquet
            file_path = '../Output_Crawling/output_harga_kabPacitan.parquet'
            pq.write_table(pa.Table.from_pandas(concatenated_df), file_path)


# Menggunakan kelas MultiThreadedCrawler dengan URL seed yang diinginkan
if __name__ == "__main__":
    crawler = MultiThreadedCrawler('https://siskaperbapo.jatimprov.go.id/harga/')
    crawler.run_web_crawler()
    crawler.display_data()  # Call the display_data method to print the scraped data


Harga Rata-Rata Kabupaten Pacitan di Tingkat Konsumen
Pasar : Pasar Minulyo, Pasar Arjowinangun, Pasar Arjosari
Tanggal: 2024-04-30
NO           NAMA BAHAN POKOK             SATUAN   HARGA KEMARIN HARGA SEKARANG PERUBAHAN (Rp) PERUBAHAN (%)
                        - Beras Premium         kg     14.500        14.500              0          0,00%   
                         - Beras Medium         kg     12.267        12.267              0          0,00%   
02                                 GULA                                                                     
                   - Gula Kristal Putih         kg     18.333        18.333              0          0,00%   
03                        MINYAK GORENG                                                                     
                  - Minyak Goreng Curah         kg     17.000        17.000              0          0,00%   
        - Minyak Goreng Kemasan Premium    1 liter     18.167        18.167              0          0,00%