In [1]:
#downloaded pdf file on 30 June 2023

#Download every PDF file linked from the government website page "https://kkp.go.id/djprl/kkhl/page/2107-sk-penetapan",
#where the management plans of MPAs in Indonesia are uploaded and publicly available
 

#!/usr/bin/env python
# coding: utf-8

# In[4]:


import os #provides a way to use operating system-dependent functionality, such as working with files and directories.

#This defines a function named extract_url_pdf that takes two arguments: input_url (the URL from which to extract PDF links) and folder_path (the path where the downloaded PDFs and Excel file will be stored). The os.getcwd() function provides the current working directory as a default value for folder_path if it's not provided when the function is called
def extract_url_pdf(input_url,folder_path=os.getcwd()):
    """Download every PDF linked from a web page and log the links to Excel.

    The page at ``input_url`` is fetched and parsed, every ``<a>`` element
    whose ``href`` ends in ``.pdf`` is downloaded into ``folder_path``, and an
    Excel workbook (sheet ``"Output"``) listing the link text, the href and
    the local file name of each downloaded PDF is written to the same folder.

    Args:
        input_url: URL of the page to scan for PDF links. Relative hrefs are
            resolved against this URL.
        folder_path: Destination folder for the PDFs and the Excel file. It is
            created (including missing parent folders) when it does not exist.
            Defaults to the working directory at the time the function was
            defined.

    Raises:
        requests.RequestException: If the page at ``input_url`` itself cannot
            be fetched. A failure on an individual PDF is reported with a
            message and that link is skipped instead.
    """
    import os
    import requests #module to allow the script to send HTTP requests
    from urllib.parse import urljoin #joining URLs
    from bs4 import BeautifulSoup #a library for parsing HTML and XML documents
    import pandas as pd #library for data manipulation and analysis
    import datetime #a module for working with dates and times

    #An empty folder_path falls back to the current working directory.
    folder_path = folder_path or os.getcwd()

    #If there is no such folder, create it together with any missing parent folders.
    os.makedirs(folder_path, exist_ok=True)

    #Fetch the page that will be scraped for PDF links and stop early on an HTTP error,
    #so that an error page is never parsed as if it were the real listing.
    response = requests.get(input_url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser") #extract links and other information from the webpage

    #Lists for the text, href attribute and file name of every downloaded PDF link.
    #counter keeps track of the number of files extracted.
    link_text = []
    link_href = []
    link_file = []
    counter = 0

    #Iterate over all the <a> elements with an href attribute ending in .pdf in the parsed HTML.
    for link in soup.select("a[href$='.pdf']"):
        href = link['href']
        #Name the pdf file using the last portion of its link.
        pdf_name = href.split('/')[-1]

        #urljoin() combines the page URL with a relative link. A failed download is skipped
        #so that an HTTP error body is not saved to disk under a .pdf name.
        try:
            pdf_response = requests.get(urljoin(input_url, href), timeout=60)
            pdf_response.raise_for_status()
        except requests.RequestException as error:
            print("Skipping", href, "- download failed:", error)
            continue

        with open(os.path.join(folder_path, pdf_name), 'wb') as f:
            f.write(pdf_response.content)

        #Record the link only once its file has actually been written.
        link_text.append(str(link.text))
        link_href.append(href)
        link_file.append(pdf_name)
        counter += 1
        print(counter, "-Files Extracted from URL named ",pdf_name)

    #Build the table of extracted link information.
    table_dict = {"Text":link_text,"Url_Link":link_href,"File Name":link_file}
    df = pd.DataFrame(table_dict)

    #The timestamp makes the Excel file name unique per run.
    time_stamp = datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')

    print("Creating an Excel file with Name of FIle, Url Link and Link Text...")

    new_excel_file = os.path.join(folder_path,"Excel_Output_"+time_stamp+".xlsx")

    #The context manager saves and closes the workbook on exit
    #(ExcelWriter.save() no longer exists in pandas 2.x).
    with pd.ExcelWriter(new_excel_file, engine='openpyxl') as writer:
        df.to_excel(writer,sheet_name="Output")

    #The script has completed its tasks.
    print("All Pdf files downloaded and Excel File Created")

1 -Files Extracted from URL named  5307b-2022kepmen-kp31.pdf
2 -Files Extracted from URL named  KEPMENKP%202014%2038%20RPZ%20Pieh.pdf
3 -Files Extracted from URL named  Dokumen%20Final%20Pieh.pdf
4 -Files Extracted from URL named  bb5a0-2022kepmen-kp30.pdf
5 -Files Extracted from URL named  KEPMENKP%202014%2053%20RPZ%20Anambas.pdf
6 -Files Extracted from URL named  Dokumen%20Final%20Anambas.pdf
7 -Files Extracted from URL named  27224-2022kepmen-kp34.pdf
8 -Files Extracted from URL named  KEPMENKP%202014%2057%20RPZ%20Gili%20Matra.pdf
9 -Files Extracted from URL named  Dokumen%20Final%20Gili%20Matra.pdf
10 -Files Extracted from URL named  KEPMENKP%202014%205%20Sawu.pdf
11 -Files Extracted from URL named  KEPMENKP%202014%206%20RPZ%20Laut%20Sawu.pdf
12 -Files Extracted from URL named  62100-2022kepmen-kp35.pdf
13 -Files Extracted from URL named  KEPMENKP%202014%2059%20RPZ%20Kapoposang.pdf
14 -Files Extracted from URL named  Dokumen%20Final%20Kapoposang.pdf
15 -Files Extracted from URL nam

115 -Files Extracted from URL named  RPZ%20Lembata%20NTT.pdf
116 -Files Extracted from URL named  352c3-kepmen-nomor-96-tahun-2021.pdf
117 -Files Extracted from URL named  RPZ%20Sikka%20NTT.pdf
118 -Files Extracted from URL named  7d7d6-kepmen-89-tahun-2020-edit-hm-.pdf
119 -Files Extracted from URL named  RPZ%20Kayong%20Utara.pdf
120 -Files Extracted from URL named  4db65-kepmen-91-tahun-2020-edit-hm-.pdf
121 -Files Extracted from URL named  RPZ%20Kendawangan.pdf
122 -Files Extracted from URL named  4cff1-kepmen-90-tahun-2020-edit-hm-.pdf
123 -Files Extracted from URL named  RPZ%20Pulau%20Randayan%20Bengkayang.pdf
124 -Files Extracted from URL named  d0235-kepmen-92-tahun-2020-edit-hm-.pdf
125 -Files Extracted from URL named  RPZ%20Kubu%20Raya.pdf
126 -Files Extracted from URL named  4ae14-kepmen-93-tahun-2020-edit-hm-.pdf
127 -Files Extracted from URL named  RPZ%20Paloh.pdf
128 -Files Extracted from URL named  4d2b0-24-kepmen-kp-2019-ttg-kawasan-koservasi-gosong-senggora....pdf
129 -

27 -Files Extracted from URL named  05455-2022kepmen-kp36.pdf
28 -Files Extracted from URL named  KEPMENKP%202014%2062%20RPZ%20Padaido.pdf
29 -Files Extracted from URL named  Dokumen%20Final%20Padaido.pdf
30 -Files Extracted from URL named  KEPMENKP%202022%2049%20KK%20Mahakam.pdf
31 -Files Extracted from URL named  DF%20Mahakam.pdf
32 -Files Extracted from URL named  3e8cb-57-kepmen-kp-2013-ttg-kawasan-konservasi-perairan-pesisir.......pdf
33 -Files Extracted from URL named  Dokumen%20RPZ%20KKD%20Pulau%20Weh.pdf
34 -Files Extracted from URL named  dcbaa-76-kepmen-kp-2020-ttg-kkpd-tp-aceh-jaya-edit-hm-10-juni-2020-.pdf
35 -Files Extracted from URL named  RPZ%20Aceh%20Jaya.pdf
36 -Files Extracted from URL named  2b4da-77-kepmen-kp-2020-ttg-kkpd-tp-aceh-tamiang-edit-hm-3-juni-2020-otentifikasi.pdf
37 -Files Extracted from URL named  RPZ%20Aceh%20Tamiang.pdf
38 -Files Extracted from URL named  e63e8-78-kepmen-kp-2020-oke.pdf
39 -Files Extracted from URL named  RPZ%20Aceh%20Besar.pdf
40 -Fi

140 -Files Extracted from URL named  KEPMENKP%202019%2057%20KKD%20Tatoareng%20(Sulut).pdf
141 -Files Extracted from URL named  RPZ%20KKD%20Tatoareng%20(Sulut).pdf
142 -Files Extracted from URL named  DF%20KK%20Teluk%20Gorontalo.pdf
143 -Files Extracted from URL named  e4a95-50-kepmen-kp-2019-ttg-kaw.konservasi-donggala-buol.....pdf
144 -Files Extracted from URL named  RPZ%20Doboto%20(Sulteng).pdf
145 -Files Extracted from URL named  054f7-51-kepmen-kp-2019-ttg-kaw.konservasi-parigi-moutong-poso.....pdf
146 -Files Extracted from URL named  RPZ%20Parigi%20Muotong.pdf
147 -Files Extracted from URL named  21ab9-52-kepmen-kp-2019-ttg-kaw.konservasi-morowali-morowali-utara.....pdf
148 -Files Extracted from URL named  RPZ%20Morowali.pdf
149 -Files Extracted from URL named  83936-53-kepmen-kp-2019-ttg-kaw-konservasi-banggai-banggai-laut.....pdf
150 -Files Extracted from URL named  RPZ%20Banggai%20Dalaka.pdf
151 -Files Extracted from URL named  720d0-22-kepmen-kp-2021.pdf
152 -Files Extracted f

In [1]:
#Check the readability of the downloaded PDFs (MPA management plans)

#These lines import the warnings module and then use warnings.filterwarnings() to suppress user warnings specifically from the "PyPDF2" module. This is done to prevent warning messages from being displayed during the execution of the code.
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="PyPDF2")

#importing two modules with operating system functionality and working with PDF files
import os
import PyPDF2

#This block defines a function named check_pdf_readability that takes a file_path as an argument.
def check_pdf_readability(file_path):
    """Return True when the PDF at ``file_path`` opens and has at least one page.

    Any exception raised while opening or parsing the file (missing file,
    corrupt data, parser error, ...) is treated as "not readable" and yields
    False.
    """
    try:
        with open(file_path, 'rb') as file:
            # Prefer the current PyPDF2 API: the legacy PdfFileReader/numPages
            # names raise in PyPDF2 3.x, which the except below would turn into
            # "not readable" for every single file.
            if hasattr(PyPDF2, "PdfReader"):
                num_pages = len(PyPDF2.PdfReader(file).pages)
            else:
                # Older PyPDF2 releases only offer the legacy names.
                num_pages = PyPDF2.PdfFileReader(file).numPages
            return num_pages > 0  # If the number of pages is greater than 0, the file is readable
    except Exception:
        return False  # If an error occurs while reading the file, it is not readable

def check_all_pdfs_readability(folder_path):
    """Print, for every PDF in ``folder_path``, whether it can be read.

    Entries are selected by a case-insensitive ``.pdf`` suffix and checked
    with ``check_pdf_readability``. Nothing is returned.
    """
    # Keep only the directory entries whose name ends in ".pdf" (any letter case).
    pdf_files = list(filter(lambda name: name.lower().endswith('.pdf'), os.listdir(folder_path)))

    for pdf_file in pdf_files:
        if not check_pdf_readability(os.path.join(folder_path, pdf_file)):
            print(f"The PDF file '{pdf_file}' is not readable or an error occurred while reading.")
            continue
        print(f"The PDF file '{pdf_file}' is readable.")

# Folder holding the downloaded PDFs; replace the placeholder with your own directory path.
folder_path = "YourDirectoryPath/Desktop/Analysis_ch1/Automated_MPAMP"
# Report, one line per PDF in that folder, whether the file is readable.
check_all_pdfs_readability(folder_path)

The PDF file '02c77-48-kepmen-kp-2021.pdf' is not readable or an error occurred while reading.
The PDF file '05455-2022kepmen-kp36.pdf' is not readable or an error occurred while reading.
The PDF file '054f7-51-kepmen-kp-2019-ttg-kaw.konservasi-parigi-moutong-poso.....pdf' is not readable or an error occurred while reading.
The PDF file '05533-kep-29-men-2012-ttg-penetapan-kawasan-konservasi-ujungnegoro.pdf' is not readable or an error occurred while reading.
The PDF file '05e15-79-kepmen-kp-2020-ttg-teluk-berau-teluk-nusalasi-papua-barat-20-juli-2020.pdf' is not readable or an error occurred while reading.
The PDF file '0d9fc-kepmen-nomor-94-tahun-2021.pdf' is not readable or an error occurred while reading.
The PDF file '0f3b6-20-kepmen-kp-2020.pdf' is not readable or an error occurred while reading.
The PDF file '16b13-kepmen-86-tahun-2020-tentang-kawasan-konservasi-sumut-edit-hm-.pdf' is not readable or an error occurred while reading.
The PDF file '21ab9-52-kepmen-kp-2019-ttg-kaw.

The PDF file 'DF%20KK%20Perlang%20Ketugar%20Babel.pdf' is readable.
The PDF file 'DF%20KK%20Pulau%20Panjang.pdf' is readable.
The PDF file 'DF%20KK%20Teluk%20Gorontalo.pdf' is readable.
The PDF file 'DF%20KK%20Tuing%20Babel.pdf' is readable.
The PDF file 'DF%20KK%20Way%20Kambas.pdf' is readable.
The PDF file 'DF%20Lipan%20Rakit.pdf' is readable.
The PDF file 'DF%20Liukang%20Tangaya.pdf' is readable.
The PDF file 'DF%20Liukang%20Tupabbiring.pdf' is readable.
The PDF file 'DF%20Mahakam.pdf' is readable.
The PDF file 'DF%20Ngambur.pdf' is readable.
The PDF file 'DF%20PULAU%20PANJANG%20JEPARA.pdf' is readable.
The PDF file 'Dokumen%20Final%20Anambas.pdf' is readable.
The PDF file 'Dokumen%20Final%20Aru.pdf' is readable.
The PDF file 'Dokumen%20Final%20Gili%20Matra.pdf' is readable.
The PDF file 'Dokumen%20Final%20Kapoposang.pdf' is readable.
The PDF file 'Dokumen%20Final%20Laut%20Banda.pdf' is readable.
The PDF file 'Dokumen%20Final%20Padaido.pdf' is readable.
The PDF file 'Dokumen%20Final

In [7]:
#or we can also try the code below, with additional information about the total number of 
#PDF files, the number of readable PDF files, and the number of non-readable PDF files.

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="PyPDF2")

import os
import PyPDF2

def check_pdf_readability(file_path):
    """Return True when the PDF at ``file_path`` opens and has at least one page.

    Any exception raised while opening or parsing the file (missing file,
    corrupt data, parser error, ...) is treated as "not readable" and yields
    False.
    """
    try:
        with open(file_path, 'rb') as file:
            # Prefer the current PyPDF2 API: the legacy PdfFileReader/numPages
            # names raise in PyPDF2 3.x, which the except below would turn into
            # "not readable" for every single file.
            if hasattr(PyPDF2, "PdfReader"):
                num_pages = len(PyPDF2.PdfReader(file).pages)
            else:
                # Older PyPDF2 releases only offer the legacy names.
                num_pages = PyPDF2.PdfFileReader(file).numPages
            return num_pages > 0  # If the number of pages is greater than 0, the file is readable
    except Exception:
        return False  # If an error occurs while reading the file, it is not readable
def check_all_pdfs_readability(folder_path):
    """Report the readability of every PDF in ``folder_path`` and print totals.

    Entries are selected by a case-insensitive ``.pdf`` suffix and checked
    with ``check_pdf_readability``. One line is printed per file, followed by
    the total, readable and non-readable counts. Nothing is returned.
    """
    pdf_files = list(filter(lambda name: name.lower().endswith('.pdf'), os.listdir(folder_path)))
    num_readable = 0

    for pdf_file in pdf_files:
        if check_pdf_readability(os.path.join(folder_path, pdf_file)):
            num_readable += 1
            print(f"The PDF file '{pdf_file}' is readable.")
        else:
            print(f"The PDF file '{pdf_file}' is not readable or an error occurred while reading.")

    # Every checked file is either readable or not, so the rest are non-readable.
    num_non_readable = len(pdf_files) - num_readable

    print(f"Total PDF files: {len(pdf_files)}")
    print(f"Readable PDF files: {num_readable}")
    print(f"Non-readable PDF files: {num_non_readable}")

# Example usage: replace the placeholder with the folder that holds the downloaded PDFs.
folder_path = "YourDirectoryPath/Desktop/Analysis_ch1/Automated_MPAMP"
# Prints one line per PDF, then the total / readable / non-readable counts.
check_all_pdfs_readability(folder_path)

The PDF file '02c77-48-kepmen-kp-2021.pdf' is not readable or an error occurred while reading.
The PDF file '05455-2022kepmen-kp36.pdf' is not readable or an error occurred while reading.
The PDF file '054f7-51-kepmen-kp-2019-ttg-kaw.konservasi-parigi-moutong-poso.....pdf' is not readable or an error occurred while reading.
The PDF file '05533-kep-29-men-2012-ttg-penetapan-kawasan-konservasi-ujungnegoro.pdf' is not readable or an error occurred while reading.
The PDF file '05e15-79-kepmen-kp-2020-ttg-teluk-berau-teluk-nusalasi-papua-barat-20-juli-2020.pdf' is not readable or an error occurred while reading.
The PDF file '0d9fc-kepmen-nomor-94-tahun-2021.pdf' is not readable or an error occurred while reading.
The PDF file '0f3b6-20-kepmen-kp-2020.pdf' is not readable or an error occurred while reading.
The PDF file '16b13-kepmen-86-tahun-2020-tentang-kawasan-konservasi-sumut-edit-hm-.pdf' is not readable or an error occurred while reading.
The PDF file '21ab9-52-kepmen-kp-2019-ttg-kaw.

The PDF file 'DF%20Lipan%20Rakit.pdf' is readable.
The PDF file 'DF%20Liukang%20Tangaya.pdf' is readable.
The PDF file 'DF%20Liukang%20Tupabbiring.pdf' is readable.
The PDF file 'DF%20Mahakam.pdf' is readable.
The PDF file 'DF%20Ngambur.pdf' is readable.
The PDF file 'DF%20PULAU%20PANJANG%20JEPARA.pdf' is readable.
The PDF file 'Dokumen%20Final%20Anambas.pdf' is readable.
The PDF file 'Dokumen%20Final%20Aru.pdf' is readable.
The PDF file 'Dokumen%20Final%20Gili%20Matra.pdf' is readable.
The PDF file 'Dokumen%20Final%20Kapoposang.pdf' is readable.
The PDF file 'Dokumen%20Final%20Laut%20Banda.pdf' is readable.
The PDF file 'Dokumen%20Final%20Padaido.pdf' is readable.
The PDF file 'Dokumen%20Final%20Pieh.pdf' is readable.
The PDF file 'Dokumen%20Final%20Raja%20Ampat.pdf' is readable.
The PDF file 'Dokumen%20Final%20Waigeo.pdf' is readable.
The PDF file 'Dokumen%20RPZ%20KKD%20Pulau%20Weh.pdf' is readable.
The PDF file 'e4a95-50-kepmen-kp-2019-ttg-kaw.konservasi-donggala-buol.....pdf' is no

In [8]:
#clean up a folder by removing non-readable PDF files, 
#thus ensuring that only readable PDF files remain in the folder. 
#It uses the check_pdf_readability function to determine if a PDF is readable, and if not, it deletes the file using os.remove().

import os
import PyPDF2

def check_pdf_readability(file_path):
    """Return True when the PDF at ``file_path`` opens and has at least one page.

    Any exception raised while opening or parsing the file (missing file,
    corrupt data, parser error, ...) is treated as "not readable" and yields
    False.
    """
    try:
        with open(file_path, 'rb') as file:
            # Prefer the current PyPDF2 API: the legacy PdfFileReader/numPages
            # names raise in PyPDF2 3.x, which the except below would turn into
            # "not readable" for every single file -- and this result is used
            # to decide which files get deleted.
            if hasattr(PyPDF2, "PdfReader"):
                num_pages = len(PyPDF2.PdfReader(file).pages)
            else:
                # Older PyPDF2 releases only offer the legacy names.
                num_pages = PyPDF2.PdfFileReader(file).numPages
            return num_pages > 0  # If the number of pages is greater than 0, the file is readable
    except Exception:
        return False  # If an error occurs while reading the file, it is not readable
def delete_non_readable_pdfs(folder_path):
    """Delete every PDF in ``folder_path`` that ``check_pdf_readability`` rejects.

    Entries are selected by a case-insensitive ``.pdf`` suffix. Each rejected
    file is removed with ``os.remove`` and reported; the total number of PDFs
    seen and the number deleted are printed at the end. Nothing is returned.
    """
    pdf_files = list(filter(lambda name: name.lower().endswith('.pdf'), os.listdir(folder_path)))
    num_deleted = 0

    for pdf_file in pdf_files:
        target = os.path.join(folder_path, pdf_file)
        # Readable files are left untouched.
        if check_pdf_readability(target):
            continue

        os.remove(target)
        num_deleted += 1
        print(f"The non-readable PDF file '{pdf_file}' has been deleted.")

    print(f"Total PDF files: {len(pdf_files)}")
    print(f"Non-readable PDF files deleted: {num_deleted}")

# Example usage: replace the placeholder with the folder that holds the downloaded PDFs.
folder_path = "YourDirectoryPath/Desktop/Analysis_ch1/Automated_MPAMP"
# WARNING: files judged non-readable are removed from disk with os.remove().
delete_non_readable_pdfs(folder_path)

The non-readable PDF file '02c77-48-kepmen-kp-2021.pdf' has been deleted.
The non-readable PDF file '05455-2022kepmen-kp36.pdf' has been deleted.
The non-readable PDF file '054f7-51-kepmen-kp-2019-ttg-kaw.konservasi-parigi-moutong-poso.....pdf' has been deleted.
The non-readable PDF file '05533-kep-29-men-2012-ttg-penetapan-kawasan-konservasi-ujungnegoro.pdf' has been deleted.
The non-readable PDF file '05e15-79-kepmen-kp-2020-ttg-teluk-berau-teluk-nusalasi-papua-barat-20-juli-2020.pdf' has been deleted.
The non-readable PDF file '0d9fc-kepmen-nomor-94-tahun-2021.pdf' has been deleted.
The non-readable PDF file '0f3b6-20-kepmen-kp-2020.pdf' has been deleted.
The non-readable PDF file '16b13-kepmen-86-tahun-2020-tentang-kawasan-konservasi-sumut-edit-hm-.pdf' has been deleted.
The non-readable PDF file '21ab9-52-kepmen-kp-2019-ttg-kaw.konservasi-morowali-morowali-utara.....pdf' has been deleted.
The non-readable PDF file '22%20%20%20%20%20%20KEPMEN-KP%202018.pdf' has been deleted.
The no

##Rename the PDF files in a given folder using the names stored in a column of an xlsx/csv file

import os
import pandas as pd

def _safe_pdf_stem(raw_name):
    """Turn a spreadsheet cell into a file-system-safe file stem ('' if unusable)."""
    import re

    # Empty spreadsheet cells are read back by pandas as NaN.
    if pd.isna(raw_name):
        return ""
    # Collapse whitespace runs first: link text can carry newlines and tabs.
    stem = " ".join(str(raw_name).split())
    # Replace path separators and other characters that are invalid in file names.
    stem = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", stem)
    # Trailing dots/spaces are dropped so the stem joins cleanly with ".pdf".
    return stem.rstrip(". ")


def rename_pdfs_with_csv(folder_path, csv_file_path):
    """Rename the PDFs in ``folder_path`` using a file-name -> text table.

    The table must have the columns ``"File Name"`` (current file name) and
    ``"Text"`` (desired name without extension). A path ending in ``.csv`` is
    read with ``pandas.read_csv``; anything else is read as an Excel workbook
    from its ``"Output"`` sheet.

    Args:
        folder_path: Folder containing the PDF files to rename.
        csv_file_path: Path to the CSV or Excel table described above.

    PDFs without a table entry, or whose entry is empty, are left unchanged.
    Desired names are sanitised for use as file names, and a numeric suffix
    (``_1``, ``_2``, ...) is appended when the target name is already taken.
    Progress and totals are printed; nothing is returned.
    """
    pdf_files = [file for file in os.listdir(folder_path) if file.lower().endswith('.pdf')]
    num_renamed = 0

    if str(csv_file_path).lower().endswith(".csv"):
        df = pd.read_csv(csv_file_path)
    else:
        df = pd.read_excel(csv_file_path, sheet_name="Output")
    filename_mapping = dict(zip(df["File Name"], df["Text"]))

    for pdf_file in pdf_files:
        if pdf_file not in filename_mapping:
            continue

        new_file_name = _safe_pdf_stem(filename_mapping[pdf_file])
        if not new_file_name:
            print(f"Skipping '{pdf_file}': no usable new name in the table.")
            continue

        file_path = os.path.join(folder_path, pdf_file)
        new_file_path = os.path.join(folder_path, new_file_name + ".pdf")

        # Already carries the desired name: renaming would only add a "_1" suffix.
        if new_file_path == file_path:
            continue

        # Handle conflicts by appending a unique identifier
        count = 1
        while os.path.exists(new_file_path):
            new_file_path = os.path.join(folder_path, f"{new_file_name}_{count}.pdf")
            count += 1

        os.rename(file_path, new_file_path)
        num_renamed += 1
        print(f"The PDF file '{pdf_file}' has been renamed to '{os.path.basename(new_file_path)}'.")

    print(f"Total PDF files: {len(pdf_files)}")
    print(f"PDF files renamed: {num_renamed}")

# Example usage: replace the placeholder paths with your own locations.
# Raw strings (r"...") keep the Windows backslashes from being read as escape sequences.
folder_path = r"E:\YourDirectoryPath\Desktop\Analysis_ch1\Automated_MPAMP"
# NOTE(review): despite the variable name this points at an .xlsx workbook -- presumably the
# "Excel_Output_<timestamp>.xlsx" file written by the download step; confirm the timestamp matches your run.
csv_file_path = r"E:\YourDirectoryPath\Desktop\Analysis_ch1\Automated_MPAMP\Excel_Output_2023-06-19 10-39-40.xlsx"
rename_pdfs_with_csv(folder_path, csv_file_path)

A line-by-line explanation of the code:

import os: This line imports the os module, which allows the script to use operating system-dependent functionality, like working with files and directories.

Function extract_url_pdf(input_url, folder_path=os.getcwd()): This line defines the function extract_url_pdf with two parameters: input_url and folder_path. The folder_path parameter has a default value os.getcwd() (the current working directory) if no value is provided when calling the function.

The import statements at the top of the function body bring in the required modules and names: requests, urljoin, BeautifulSoup, pandas, and datetime.

url = "https://kkp.go.id/djprl/kkhl/page/2107-sk-penetapan": This line sets the URL from which the script will extract the PDF files. However, note that the URL is hard-coded, so it might not be used directly in the function.

folder_location = "...": This line sets the path of the local folder where the downloaded PDF files will be stored. The path in this line is hard-coded, so you might want to change it according to your desired location.

response = requests.get(url): This line sends a GET request to the specified URL and stores the response in the response variable.

soup = BeautifulSoup(response.text, "html.parser"): This line creates a BeautifulSoup object soup by parsing the HTML content of the response.

The for loop over the PDF links performs the following tasks:

Iterates through all anchor tags (<a>) with a hyperlink ending in '.pdf' in the parsed HTML content.
Extracts the PDF files by downloading them using the requests.get() method.
Appends relevant information such as link texts, URLs, and file names to lists.
Prints the progress and the number of files extracted.

After the loop, the next block of code creates a pandas DataFrame df containing the extracted link texts, URLs, and file names.

The final block of code creates an Excel file from the DataFrame df and saves it in the specified folder. The Excel file's name includes the current timestamp to ensure uniqueness.

In summary, this function extracts PDF files from a given URL, downloads them to a local folder, and creates an Excel file with information about the downloaded files. Note that the function uses hard-coded values for the URL and folder location, so it may need some modifications to be more dynamic and suitable for your specific use case.