In [7]:
import requests  # Library for making HTTP requests
import re  # Library for regular expressions
import os  # Library for interacting with the operating system
import time  # Library for time-related functions
import json  # Library for working with JSON data
import shutil  # Library for file operations and manipulation
from openpyxl import Workbook

In [9]:


# Mapping of document types to their corresponding names in the World Bank API
FileNameConvertDict = {
    'icrr': 'Implementation+Completion+Report+Review',
    'icr': "Implementation+Completion+and+Results+Report",
    'ppar': "Project+Performance+Assessment+Report",
    'pad': 'Project+Appraisal+Document',
    'scd': 'Systematic+Country+Diagnostic',
    'isr': 'Implementation+Status+and+Results+Report',
    'pd': 'Program+Document',
    'esmp': 'Environmental+and+Social+Management+Plan',
    'cas': 'Country+Assistance+Strategy+Document',
    'cpf': 'Country+Partnership+Framework',
    'cpe': 'IEG+Evaluation',
    'cen': 'Country+Engagement+Note',
    'cren': 'Country+Re-engagement+Note',
    'isn': 'Interim+Strategy+Note',
    'cpe': 'Country+Program+Evaluation',
    'asa': 'Economic+%26+Sector+Work%5EEconomic+%26amp%3B+Sector+Work%5EPublications%5EPublications+%26+Research%5EPublications+%26amp%3B+Research',
    'clrr': 'CAS+Completion+Report+Review',
    'pp': 'Project+Paper',
    'pid': 'Project+Information+Document'
}


class SingleTypeBulkDownload(object):
    
    def __init__(self, input_dict, document_type_raw):
        """
        Initialize SingleTypeBulkDownload instance.

        Args:
            input_dict (dict): A dictionary containing the input parameters.
            document_type_raw (str): The raw document type.
        """
        self.root_path = input_dict['root_path']  # Set the root path for the download
        self.document_type_raw = document_type_raw  # Set the raw document type
        self.output_path = os.path.join(self.root_path, f"{self.document_type_raw}_folder")  # Create the output path for the downloaded files
        self.category_path = os.path.join(self.root_path, f"{self.document_type_raw}_category_folder")  # Create the category path for catalog files
        # Create the necessary folders if they don't exist
        for path in self.root_path, self.output_path, self.category_path:
            if not os.path.exists(path):
                os.mkdir(path)
        self.start_date = input_dict.get('start_date')  # Get the start date from the input dictionary
        self.end_date = input_dict.get('end_date')  # Get the end date from the input dictionary
        self.rows = input_dict.get('file_number')  # Get the file number from the input dictionary
        self.document_type = FileNameConvertDict.get(self.document_type_raw)  # Get the converted document type
        self.projid_list = input_dict.get('projid_list')  # Get the project ID list from the input dictionary
        self.local_time = time.strftime("%m-%d-%Y", time.localtime())  # Get the current local time
        self.download_version = input_dict.get('download_version', 'txt')  # Get the download version from the input dictionary

    def read_catalog_files(self):
        """
        Read catalog files and yield the query file dictionary and file name.
        """
        file_name_list = os.listdir(self.category_path)  # Get the list of file names in the category path directory
        file_name_list = [file_name for file_name in file_name_list if file_name.startswith(self.document_type_raw) and file_name.endswith(".txt")]  # Filter file names based on document type and extension
        for file_name in file_name_list:  # Iterate over the filtered file names
            with open(os.path.join(self.category_path, file_name), 'r', encoding='utf-8', errors='ignore') as f:  # Open the file for reading
                query_file_dict = json.loads(f.read()).get('documents')  # Parse the JSON content of the file into a dictionary
                yield (query_file_dict, file_name)  # Yield the query file dictionary and file name as a tuple
   
    def generate_catalog_for_dates(self):
        """
        Generate catalog files for the date-based method.
        """
        file_name_list = os.listdir(self.category_path)  # Get the list of file names in the category path directory
        file_name = f"{self.document_type_raw}_{self.end_date}_{self.rows}_{self.local_time}.txt"  # Generate the file name for the catalog file
        if file_name not in file_name_list:  # Check if the file name does not already exist
            query_url_form = f"http://search.worldbank.org/api/v2/wds?format=json&docty_exact={self.document_type}&lndinstr_key=1353856&lang_exact=English&end_docdt={self.end_date}&rows={self.rows}&srt=docdt&order=desc"  # Construct the query URL for the date-based method
            query = requests.get(query_url_form)  # Make a GET request to the query URL
            query_text = query.text  # Get the response text
            if len(query_text) > 1000:  # Check if the response is valid (arbitrary threshold of length > 1000)
                with open(os.path.join(self.category_path, file_name), 'w', encoding='utf-8', errors='ignore') as f:  # Open a file for writing in the category path and save the response text
                    f.write(query_text)
                
    def download_documents_for_dates(self):
        """
        Download documents using the date-based method.
        """
        print()
        print(f'Start downloading the {self.document_type_raw} data from WBG API ...')
        t1 = time.time()  # Start time of the download process
        file_name_list = os.listdir(self.output_path)  # Get the list of file names in the output path directory

        for query_dict, file_name in self.read_catalog_files():  # Iterate over the query dictionaries and file names obtained from read_catalog_files()
            doc_id_list = [
                doc_id for doc_id in query_dict.keys()
                if doc_id.startswith('D')
            ]  # Filter document IDs based on the key starting with 'D'
            for doc_id in doc_id_list:  # Iterate over the filtered document IDs
                date = query_dict.get(doc_id).get('docdt')[:10]  # Extract the date associated with the document ID
                if self.start_date <= date <= self.end_date:  # Check if the date is within the specified range
                    if self.download_version == 'txt':
                        file_name = f'{date}_{doc_id}_{self.document_type_raw}.txt'  # Construct the file name for text file
                    else:
                        file_name = f'{date}_{doc_id}_{self.document_type_raw}.pdf'  # Construct the file name for PDF file
                    if file_name not in file_name_list:  # Check if the file name is not in the list of existing file names
                        texturl = query_dict.get(doc_id).get('txturl')  # Get the URL for the text version of the document
                        pdfurl = query_dict.get(doc_id).get('pdfurl')  # Get the URL for the PDF version of the document
                        try:
                            if self.download_version == 'txt':  # Downloading the text version of the document
                                file_query = requests.get(texturl)  # Get the document content from the URL
                                file_text = file_query.text  # Extract the text content from the response
                                with open(os.path.join(self.output_path, file_name), 'w') as f:  # Open a file in write mode and save the text content
                                    f.write(file_text)
                            else:  # Downloading the PDF version of the document
                                file_query = requests.get(pdfurl)  # Get the document content from the URL
                                file_text = file_query.content  # Extract the binary content from the response
                                with open(os.path.join(self.output_path, file_name), 'wb') as f:  # Open a file in write binary mode and save the binary content
                                    f.write(file_text)
                        except:
                            pass  # Ignore any exceptions that occur during the download process
        t2 = time.time()  # End time of the download process
        print('Data downloaded successfully!')
        print(f'You can find them in the folder {self.output_path}')  

    def run(self):
        """
        Run the bulk download process based on the input parameters.
        """
        if self.projid_list == None:
            self.generate_catalog_for_dates()  # Generate catalog files for the date-based method
            self.download_documents_for_dates()  # Download documents using the date-based method
        else:
            self.generate_catalog_for_projects()  # Generate catalog files for the project-based method
            self.download_documents_for_projects()  # Download documents using the project-based method

    def __call__(self):
        """
        Call the 'run' method when the instance is called as a function.
        """
        return self.run()


def bulk_download_date(input_dict):
    """
    Perform bulk download of World Bank documents based on the input parameters.

    Args:
        input_dict (dict): A dictionary containing the input parameters.
    """
    print(os.getcwd())
    document_type_list = input_dict.get('document_type')
    for document_type in document_type_list:
        SingleTypeBulkDownload(input_dict, document_type)()
        print('-' * 50)
        

    
if __name__ == '__main__':
    
    
    download_folder_path = '/Users/adilqasin/Documents/WBG/PID_Project_Components/All Files 2000-24/2000-15' # Download Folder Path
    selected_file_type = ['pid'] # Choose from ["icrr", "icr", "ppar", "pad", "scd", "isr", "pd", "esmp", "cas", "cpf", "cpe", "cen", "cren", "isn", "cpe", "asa", "clrr", "pp"]
    selected_download_version = 'txt' # Choose from Either txt or pdf
    start_date = '2000-01-01' # Start date of the documents
    end_date = '2015-01-01' # End date of the documents 

    input_dict = {
        'root_path': download_folder_path, #"Download Folder Path"
        'document_type': 'pid', 
        'file_number': 99999999, 
        'start_date': start_date,
        'end_date': end_date,
        'download_version': selected_download_version
    }

    bulk_download_date(input_dict)   # Call the bulk_download_date function with the input parameters



# Function to extract Document ID from a filename
def extract_document_id(filename):
    pattern = r'D(\d{8})'
    match = re.search(pattern, filename)
    if match:
        return 'D' + match.group(1)
    else:
        return None


# Function to generate the desired renamed filename
def generate_renamed_filename(project_id, guid, original_filename):
    # Extract the remaining part of the original filename (excluding date and document ID)
    remaining_part = original_filename.split('_', 2)[-1]

    # Split the remaining_part using '.' as the delimiter
    filetype, extension = remaining_part.split('.', 1)

    # Generate the desired renamed filename
    new_filename = f"{project_id}_{filetype}_{guid}.{extension}"
    return new_filename


# Corrected approach to ensure correct path
selected_file_type = selected_file_type[0] if isinstance(selected_file_type, list) else selected_file_type

# Function to process JSON and rename files
def process_files(download_folder_path, selected_file_type):
    json_folder_path = f"{download_folder_path}/{selected_file_type}_category_folder"
    json_file_path = None

    # Find the JSON file in the folder
    for filename in os.listdir(json_folder_path):
        if filename.endswith(".txt"):
            json_file_path = os.path.join(json_folder_path, filename)
            break

    if json_file_path is None:
        print("No JSON file found in the JSON folder.")
        return

    # Read and parse the JSON data
    with open(json_file_path, "r") as json_file:
        json_data = json.loads(json_file.read())

    # Path to the folder containing the other files
    other_files_folder_path = f"{download_folder_path}/{selected_file_type}_folder"

    # Iterate over the list of filenames in the folder with other files
    for filename in os.listdir(other_files_folder_path):
        # Skip files that are not JSON
        if filename.endswith(".json"):
            continue
        
        # Extract Document ID from the filename
        target_doc_id = extract_document_id(filename)
        
        if target_doc_id is not None and target_doc_id in json_data["documents"]:
            # Extract "projectid" and "guid" for the target document
            target_doc_info = json_data["documents"][target_doc_id]
            project_id = target_doc_info.get("projectid", "")
            guid = target_doc_info.get("guid", "")

            # Generate the desired renamed filename
            new_filename = generate_renamed_filename(project_id, guid, filename)

            # Rename the file
            original_filepath = os.path.join(other_files_folder_path, filename)
            new_filepath = os.path.join(other_files_folder_path, new_filename)
            os.rename(original_filepath, new_filepath)


# Function to create the Excel summary
def create_excel_summary(download_folder_path, selected_file_type):
    folder_path = f"{download_folder_path}/{selected_file_type}_folder"

    ## Create a new Excel workbook
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "Downloaded Files"  # Set the sheet name

    # Set the column headers
    sheet["A1"] = "Proj.Id"
    sheet["B1"] = "File Type"

    # Initialize row counter
    row_num = 2

    # List all files in the folder
    file_list = os.listdir(folder_path)

    # Iterate through the files and extract project ID and file type
    for filename in file_list:
        if "_" in filename and "." in filename:
            parts = filename.split("_")
            project_id = parts[0]
            file_type = parts[1]
            sheet.cell(row=row_num, column=1, value=project_id)
            sheet.cell(row=row_num, column=2, value=file_type)
            row_num += 1

    # Create the output folder if it doesn't exist
    output_folder = f"{download_folder_path}"
    os.makedirs(output_folder, exist_ok=True)

    # Save the Excel workbook in the output folder
    excel_filename = os.path.join(output_folder, f"{selected_file_type}_Downloaded_Project_IDs.xlsx")
    workbook.save(excel_filename)

    print(f"Download Summary saved as '{excel_filename}'")


# Main script execution
#download_folder_path, selected_file_type = download_files()
process_files(download_folder_path, selected_file_type)
create_excel_summary(download_folder_path, selected_file_type)

print("Process completed successfully!")



/Users/adilqasin/Documents/WBG/Updated DataBricks+Local 2/Download Documents/Bulk Download

Start downloading the p data from WBG API ...
Data downloaded successfully!
You can find them in the folder /Users/adilqasin/Documents/WBG/PID_Project_Components/All Files 2000-24/2000-15/p_folder
--------------------------------------------------

Start downloading the i data from WBG API ...
Data downloaded successfully!
You can find them in the folder /Users/adilqasin/Documents/WBG/PID_Project_Components/All Files 2000-24/2000-15/i_folder
--------------------------------------------------

Start downloading the d data from WBG API ...
Data downloaded successfully!
You can find them in the folder /Users/adilqasin/Documents/WBG/PID_Project_Components/All Files 2000-24/2000-15/d_folder
--------------------------------------------------


FileNotFoundError: [Errno 2] No such file or directory: '/Users/adilqasin/Documents/WBG/PID_Project_Components/All Files 2000-24/2000-15/pid_category_folder'