<a href="https://colab.research.google.com/github/ananghw/project_simotandi/blob/main/WebScrapping_Simotandi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
%pip install tabula-py
%pip install JPype1
import pandas as pd
import tabula
from urllib.parse import urljoin
import os
base_url = 'https://simotandi.pertanian.go.id'
page_url = 'https://simotandi.pertanian.go.id/datatabular_prov.php?no=57'

output_folder = 'hasil_scraping'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

kolom_header = [
    'No',
    'Provinsi/Kabupaten/Kota',
    'Bera',
    'Penggenangan',
    'Tanam (1-15 HST)',
    'Vegetatif 1 (16-30 HST)',
    'Vegetatif 2 (31-40 HST)',
    'Max. Vegetatif (41-54 HST)',
    'Generatif 1 (55-71 HST)',
    'Generatif 2 (72-110 HST)',
    'Panen',
    'Standing Crop',
    'Luas Baku Sawah (Ha)'
]

print(f"Mengambil halaman dari: {page_url}")
try:
    response = requests.get(page_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    pdf_links = soup.find_all('a', href=lambda href: href and '.pdf' in href)

    if not pdf_links:
        print("Tidak ada link PDF yang ditemukan di halaman ini.")
    else:
        total_pdfs = len(pdf_links)
        print(f"Ditemukan {total_pdfs} link PDF. Memulai proses ekstraksi...")

        for i, link_tag in enumerate(pdf_links):
            print(f"\n--- Memproses PDF {i+1}/{total_pdfs} ---")

            relative_link = link_tag['href']
            full_pdf_url = urljoin(base_url, relative_link)
            print(f"URL PDF: {full_pdf_url}")

            pdf_filename = os.path.basename(relative_link).replace('.pdf', '')
            output_excel_file = os.path.join(output_folder, f'tabel_dari_{pdf_filename}.xlsx')

            try:
                print("Mengekstrak tabel dari PDF...")
                tables = tabula.read_pdf(full_pdf_url, pages='all', lattice=True, multiple_tables=True, pandas_options={'header': None})

                if not tables:
                    print("Tidak ada tabel yang bisa diekstrak dari PDF ini.")
                else:
                    print(f"Berhasil! Ditemukan {len(tables)} tabel. Membersihkan dan menggabungkan...")

                    cleaned_tables_list = []

                    for j, df in enumerate(tables):
                        if df.empty:
                            continue

                        start_row_index = pd.to_numeric(df.iloc[:, 0], errors='coerce').first_valid_index()

                        if start_row_index is not None:
                            df_cleaned = df.loc[start_row_index:].reset_index(drop=True)

                            if len(df_cleaned.columns) == len(kolom_header):
                                df_cleaned.columns = kolom_header
                                # Masukkan tabel yang sudah bersih ke dalam list
                                cleaned_tables_list.append(df_cleaned)
                            else:
                                print(f"  Peringatan: Jumlah kolom di Tabel_{j+1} tidak cocok dengan header. Dilewati.")
                        else:
                            print(f"  Peringatan: Tidak ditemukan baris data numerik di Tabel_{j+1}. Dilewati.")

                    if cleaned_tables_list:
                        combined_df = pd.concat(cleaned_tables_list, ignore_index=True)

                        combined_df.to_excel(output_excel_file, index=False)
                        print(f"Semua tabel telah digabung dan disimpan ke: {output_excel_file}")
                    else:
                        print("Tidak ada tabel yang valid untuk digabungkan setelah proses pembersihan.")

            except Exception as e:
                print(f"GAGAL memproses PDF ini. Error: {e}")

except requests.exceptions.RequestException as e:
    print(f"Gagal mengakses URL halaman web: {e}")

print("\n\n--- Proses Selesai ---")

Mengambil halaman dari: https://simotandi.pertanian.go.id/datatabular_prov.php?no=57
Ditemukan 34 link PDF. Memulai proses ekstraksi...

--- Memproses PDF 1/34 ---
URL PDF: https://simotandi.pertanian.go.id/pdf/571104b571b7a13c5a05b7d5694c1e8af306.pdf
Mengekstrak tabel dari PDF...
Berhasil! Ditemukan 7 tabel. Membersihkan dan menggabungkan...
Semua tabel telah digabung dan disimpan ke: hasil_scraping/tabel_dari_571104b571b7a13c5a05b7d5694c1e8af306.xlsx

--- Memproses PDF 2/34 ---
URL PDF: https://simotandi.pertanian.go.id/pdf/571227552720a7e5efdc4adea1338f805220.pdf
Mengekstrak tabel dari PDF...
Berhasil! Ditemukan 10 tabel. Membersihkan dan menggabungkan...
Semua tabel telah digabung dan disimpan ke: hasil_scraping/tabel_dari_571227552720a7e5efdc4adea1338f805220.xlsx

--- Memproses PDF 3/34 ---
URL PDF: https://simotandi.pertanian.go.id/pdf/571356fd3896aaec780f1275d599f5fc416f.pdf
Mengekstrak tabel dari PDF...
Berhasil! Ditemukan 5 tabel. Membersihkan dan menggabungkan...
Semua tabel 

In [2]:
#CHECKING THE SAMPLE XLSX
excel_file_path = "/content/hasil_scraping/tabel_dari_571104b571b7a13c5a05b7d5694c1e8af306.xlsx"
df_sample = pd.read_excel(excel_file_path)
display(df_sample.head())

print(f"Number of rows: {df_sample.shape[0]}")
print(f"Number of columns: {df_sample.shape[1]}")

Unnamed: 0,No,Provinsi/Kabupaten/Kota,Bera,Penggenangan,Tanam (1-15 HST),Vegetatif 1 (16-30 HST),Vegetatif 2 (31-40 HST),Max. Vegetatif (41-54 HST),Generatif 1 (55-71 HST),Generatif 2 (72-110 HST),Panen,Standing Crop,Luas Baku Sawah (Ha)
0,1,Aceh,74.988,16.344,18.0,15.951,12.391,18.769,17.068,13.864,26.228,96.043,214.729
1,2,Aceh Barat,3.643,1.192,932.0,672.0,329.0,746.0,785.0,477.0,1.348,3.941,10.192
2,3,Arongan Lambalek,375.0,109.0,28.0,23.0,22.0,138.0,215.0,50.0,217.0,476.0,1.188
3,4,Bubon,220.0,83.0,73.0,88.0,56.0,59.0,45.0,12.0,85.0,333.0,733.0
4,5,Johan Pahlawan,98.0,24.0,28.0,20.0,14.0,30.0,14.0,20.0,111.0,126.0,363.0


Number of rows: 313
Number of columns: 13


In [3]:
#COMBINE ALL THE DATA EXTRACTED
import glob

print("\nMenggabungkan seluruh file Excel menjadi satu file...")

# Cari semua file Excel di folder output
excel_files = glob.glob(os.path.join(output_folder, 'tabel_dari_*.xlsx'))

combined_all_df = []

for file in excel_files:
    try:
        df = pd.read_excel(file)
        df['Source File'] = os.path.basename(file)  # Tambahkan kolom sumber
        combined_all_df.append(df)
    except Exception as e:
        print(f"Gagal membaca {file}: {e}")

if combined_all_df:
    final_combined_df = pd.concat(combined_all_df, ignore_index=True)
    combined_output_file = os.path.join(output_folder, 'gabungan_semua_tabel.xlsx')
    final_combined_df.to_excel(combined_output_file, index=False)
    print(f"Berhasil menggabungkan semua file ke: {combined_output_file}")
else:
    print("Tidak ada file yang berhasil digabung.")

#CHECKING THE SAMPLE XLSX
excel_file_path = "/content/hasil_scraping/gabungan_semua_tabel.xlsx"
df_sample = pd.read_excel(excel_file_path)
display(df_sample.head())

print(f"Number of rows: {df_sample.shape[0]}")
print(f"Number of columns: {df_sample.shape[1]}")


Menggabungkan seluruh file Excel menjadi satu file...
Berhasil menggabungkan semua file ke: hasil_scraping/gabungan_semua_tabel.xlsx


Unnamed: 0,No,Provinsi/Kabupaten/Kota,Bera,Penggenangan,Tanam (1-15 HST),Vegetatif 1 (16-30 HST),Vegetatif 2 (31-40 HST),Max. Vegetatif (41-54 HST),Generatif 1 (55-71 HST),Generatif 2 (72-110 HST),Panen,Standing Crop,Luas Baku Sawah (Ha),Source File
0,1,Maluku Utara,4.49,700,810,826,711,1.153,940,789,2.982,5.229,13.508,tabel_dari_5782329bff59db4b644faac7068039cd858...
1,2,Halmahera Barat,434.0,36,62,47,43,98.0,142,62,174.0,454.0,1.124,tabel_dari_5782329bff59db4b644faac7068039cd858...
2,3,Ibu,20.0,2,10,8,4,3.0,7,3,2.0,35.0,61.0,tabel_dari_5782329bff59db4b644faac7068039cd858...
3,4,Ibu Selatan,47.0,2,3,4,1,3.0,7,6,32.0,24.0,110.0,tabel_dari_5782329bff59db4b644faac7068039cd858...
4,5,Jailolo,122.0,11,23,19,22,57.0,62,33,65.0,216.0,417.0,tabel_dari_5782329bff59db4b644faac7068039cd858...


Number of rows: 7541
Number of columns: 14


In [4]:
# prompt: save the gabungan_semua_tabel into google sheet

# Install gspread and google-auth
!pip install --upgrade gspread google-auth

from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
import pandas as pd

# Authenticate and create a client
creds, _ = default()
gc = gspread.authorize(creds)

# Name of the sheet you want to save to
spreadsheet_name = 'Your Target Spreadsheet Name'
worksheet_name = 'Sheet1' # Or any other sheet name

try:
    # Try to open the spreadsheet by name
    spreadsheet = gc.open(spreadsheet_name)
except gspread.SpreadsheetNotFound:
    # If not found, create a new one
    print(f"Spreadsheet '{spreadsheet_name}' not found. Creating a new one...")
    spreadsheet = gc.create(spreadsheet_name)
    print(f"Spreadsheet '{spreadsheet_name}' created.")

# Select the worksheet. If it doesn't exist, create it (gspread v5+ handles this)
try:
    worksheet = spreadsheet.worksheet(worksheet_name)
except gspread.WorksheetNotFound:
    print(f"Worksheet '{worksheet_name}' not found. Creating a new one...")
    worksheet = spreadsheet.add_worksheet(title=worksheet_name, rows="100", cols="20") # Adjust rows/cols as needed
    print(f"Worksheet '{worksheet_name}' created.")


# Assuming 'final_combined_df' is the DataFrame you want to save
# If it's not already defined, you need to ensure the preceding code
# successfully creates the 'final_combined_df' DataFrame.

if 'final_combined_df' in locals():
    print(f"Saving data to Google Sheet '{spreadsheet_name}' in worksheet '{worksheet_name}'...")

    # Clear existing data in the sheet
    worksheet.clear()

    # Convert DataFrame to a list of lists (including header)
    data_to_upload = [final_combined_df.columns.values.tolist()] + final_combined_df.values.tolist()

    # Upload the data
    worksheet.update(values=data_to_upload)

    print("Data successfully uploaded to Google Sheet.")
else:
    print("Error: 'final_combined_df' DataFrame was not created. Please ensure the preceding code ran successfully.")



Saving data to Google Sheet 'Your Target Spreadsheet Name' in worksheet 'Sheet1'...
Data successfully uploaded to Google Sheet.
