In [12]:
import requests  # For HTTP requests
import re  # For regular expressions
import os  # For interacting with the operating system
import time  # For time-related operations
import json  # For handling JSON data
import shutil  # For file operations like copy, move, delete, etc.
import pandas as pd  # For data manipulation and analysis
from openpyxl import Workbook  # For creating and manipulating Excel workbooks



# Mapping of document types to their corresponding names in the World Bank API
FileNameConvertDict = {
    'icrr': 'Implementation+Completion+Report+Review',
    'icr': "Implementation+Completion+and+Results+Report",
    'ppar': "Project+Performance+Assessment+Report",
    'pad': 'Project+Appraisal+Document',
    'scd': 'Systematic+Country+Diagnostic',
    'isr': 'Implementation+Status+and+Results+Report',
    'pd': 'Program+Document',
    'esmp': 'Environmental+and+Social+Management+Plan',
    'cas': 'Country+Assistance+Strategy+Document',
    'cpf': 'Country+Partnership+Framework',
    'cpe': 'IEG+Evaluation',
    'cen': 'Country+Engagement+Note',
    'cren': 'Country+Re-engagement+Note',
    'isn': 'Interim+Strategy+Note',
    'cpe': 'Country+Program+Evaluation',
    'asa': 'Economic+%26+Sector+Work',
    'clrr': 'CAS+Completion+Report+Review',
    'pp': 'Project+Paper',
}

class SingleTypeBulkDownload(object):
    
    def __init__(self, input_dict, document_type_raw):
        """
        Initialize SingleTypeBulkDownload instance.

        Args:
            input_dict (dict): A dictionary containing the input parameters.
            document_type_raw (str): The raw document type.
        """
        self.root_path = input_dict['root_path']  # Set the root path for the download
        self.document_type_raw = document_type_raw  # Set the raw document type
        self.output_path = os.path.join(self.root_path, f"{self.document_type_raw}_folder")  # Create the output path for the downloaded files
        self.category_path = os.path.join(self.root_path, f"{self.document_type_raw}_category_folder")  # Create the category path for catalog files
        # Create the necessary folders if they don't exist
        for path in self.root_path, self.output_path, self.category_path:
            if not os.path.exists(path):
                os.mkdir(path)
        self.start_date = input_dict.get('start_date')  # Get the start date from the input dictionary
        self.end_date = input_dict.get('end_date')  # Get the end date from the input dictionary
        self.document_type = FileNameConvertDict.get(self.document_type_raw)  # Get the converted document type
        self.projid_list = input_dict.get('projid_list')  # Get the project ID list from the input dictionary
        self.local_time = time.strftime("%m-%d-%Y", time.localtime())  # Get the current local time
        self.download_version = input_dict.get('download_version', 'txt')  # Get the download version from the input dictionary

    def read_catalog_files(self):
        """
        Read catalog files and yield the query file dictionary and file name.
        """
        file_name_list = os.listdir(self.category_path)  # Get the list of file names in the category path directory
        file_name_list = [file_name for file_name in file_name_list if file_name.startswith(self.document_type_raw) and file_name.endswith(".txt")]  # Filter file names based on document type and extension
        for file_name in file_name_list:  # Iterate over the filtered file names
            with open(os.path.join(self.category_path, file_name), 'r', encoding='utf-8', errors='ignore') as f:  # Open the file for reading
                query_file_dict = json.loads(f.read()).get('documents')  # Parse the JSON content of the file into a dictionary
                yield (query_file_dict, file_name)  # Yield the query file dictionary and file name as a tuple
   
     
       
    def generate_catalog_for_projects(self):
        """
        Generate catalog files for the project-based method.
        """
        failed_requests_count = 0  # Counter for failed requests
        file_name_list = os.listdir(self.category_path)  # List of existing file names in the category path directory
        for projid in self.projid_list:  # Iterate over each project ID
            file_name = f'{self.document_type_raw}_{projid}_{self.local_time}.txt'  # Generate the file name for the catalog file
            if file_name not in file_name_list:  # Check if the file does not already exist
                query_url_form = f"http://search.worldbank.org/api/v2/wds?format=json&proid={projid}&lang_exact=English&docty_exact={self.document_type}&srt=docdt&order=desc"  # Construct the query URL for the project-based method
                try:
                    query = requests.get(query_url_form)  # Make a GET request to the query URL
                    query_text = query.text  # Get the response text

                    if len(query_text) > 1000:  # Check if the response is valid (arbitrary threshold of length > 1000)
                        with open(os.path.join(self.category_path, file_name), 'w', encoding='utf-8', errors='ignore') as f:
                            f.write(query_text)  # Write the response text to the catalog file
                except:
                    failed_requests_count += 1  # Increment the counter for failed requests
                    print(f'No response ({failed_requests_count} failed requests)')  # Print a failure message with the count
                    pass  # Continue to the next iteration if an exception occurs (ignore the exception)
   
    def download_documents_for_projects(self):
        """
        Download documents using the project-based method.
        """
        print(f'Start downloading the {self.document_type_raw} data from WBG API ...')
        t1 = time.time()  # Start time of the download process
        file_name_list = os.listdir(self.output_path)  # List of file names in the output path directory
        for query_dict, file_name in self.read_catalog_files():  # Iterate over the query dictionaries and file names obtained from read_catalog_files()
            projid_match = re.search(r"(P\d+)", file_name, flags=re.S | re.I)  # Search for the project ID pattern in the file name
            if projid_match:
                projid = projid_match.group(0)  # Extract the project ID from the match object
                doc_id_list = [doc_id for doc_id in query_dict.keys() if doc_id.startswith('D')]  # Filter document IDs based on the key starting with 'D'
                for doc_id in doc_id_list:  # Iterate over the filtered document IDs
                    try:
                        date = query_dict.get(doc_id).get('docdt')[:10]  # Extract the date associated with the document ID
                        if self.download_version == 'txt':
                            file_name = f'{self.document_type_raw}_{projid}_{date}_{doc_id}.txt'  # Construct the file name for text file
                        else:
                            file_name = f'{self.document_type_raw}_{projid}_{date}_{doc_id}.pdf'  # Construct the file name for PDF file
                        if file_name not in file_name_list:  # Check if the file name is not in the list of existing file names
                            texturl = query_dict.get(doc_id).get('txturl')  # Get the URL for the text version of the document
                            pdfurl = query_dict.get(doc_id).get('pdfurl')  # Get the URL for the PDF version of the document
                            try:
                                if self.download_version == 'txt':  # Downloading the text version of the document
                                    file_query = requests.get(texturl)  # Get the document content from the URL
                                    file_text = file_query.text  # Extract the text content from the response
                                    with open(os.path.join(self.output_path, file_name), 'w', encoding='utf-8', errors='ignore') as f:  # Open a file in write mode and save the text content
                                        f.write(file_text)
                                else:  # Downloading the PDF version of the document
                                    file_query = requests.get(pdfurl)  # Get the document content from the URL
                                    file_text = file_query.content  # Extract the binary content from the response
                                    with open(os.path.join(self.output_path, file_name), 'wb') as f:  # Open a file in write binary mode and save the binary content
                                        f.write(file_text)
                            except:
                                pass
                    except:
                        print(f'Warning! {doc_id} failed to get docdt')
                        pass
        t2 = time.time()  # End time of the download process
        print('Data downloaded successfully!')
        print(f'You can find them in the folder {self.output_path}')

    def run(self):
        """
        Run the bulk download process based on the input parameters.
        """
        if self.projid_list == None:
            self.generate_catalog_for_dates()  # Generate catalog files for the date-based method
            self.download_documents_for_dates()  # Download documents using the date-based method
        else:
            self.generate_catalog_for_projects()  # Generate catalog files for the project-based method
            self.download_documents_for_projects()  # Download documents using the project-based method

    def __call__(self):
        """
        Call the 'run' method when the instance is called as a function.
        """
        return self.run()

    

def read_project_ids_from_excel(excel_path, column_name='Proj.Id'):
    try:
        df = pd.read_excel(excel_path)
        return df[column_name].dropna().tolist()
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        return []

def bulk_download_projectID(input_dict):
    document_type_list = input_dict.get('document_type')

    if document_type_list is None:
        print("Error: 'file_type' is missing or set to None in the input dictionary.")
        return
    for document_type in document_type_list:
        if document_type not in document_type_list:
            continue
        SingleTypeBulkDownload(input_dict, document_type)()
        print('-' * 50)


if __name__ == '__main__':
    
    download_folder_path = '' # Download Folder Path
    selected_file_type = [''] # Choose from ["icrr", "icr", "ppar", "pad", "scd", "isr", "pd", "esmp", "cas", "cpf", "cpe", "cen", "cren", "isn", "cpe", "asa", "clrr", "pp"]
    download_version = 'txt' # Choose from Either txt or pdf
    excel_path = '.xlsx' # Path to the excel file containing the Project Ids to be downloaded
    project_ids = read_project_ids_from_excel(excel_path)

    input_dict = {
        'root_path': download_folder_path,
        'document_type': selected_file_type,
        'projid_list': project_ids,
        'download_version': download_version
    }

    bulk_download_projectID(input_dict)
    
    

selected_file_type = selected_file_type[0] if isinstance(selected_file_type, list) else selected_file_type

# Function to extract project ID from a file name
def extract_project_id(file_name):
    parts = file_name.split('_')
    return parts[1] if len(parts) >= 2 else None

# Function to extract project ID and GUID from a JSON file
def extract_project_id_and_guid_from_json(json_path):
    try:
        with open(json_path, 'r') as json_file:
            json_data = json.load(json_file)
            documents = json_data.get("documents", {})
            for doc_info in documents.values():
                project_id = doc_info.get("projectid")
                guid = doc_info.get("guid")
                if project_id and guid:
                    return project_id, guid
    except (FileNotFoundError, json.JSONDecodeError):
        pass
    return None, None

# Function to rename files based on project ID and GUID
def rename_actual_file(file_path, project_id, guid):
    base_name = os.path.basename(file_path)
    parts = base_name.split('_')
    if len(parts) >= 4:
        extension = parts[-1].split('.')[1]
        new_file_name = f'{project_id}_{selected_file_type}_{guid}.{extension}'
        new_path = os.path.join(os.path.dirname(file_path), new_file_name)
        os.rename(file_path, new_path)

# Function to rename all files in a folder based on project IDs and GUIDs from a JSON source
def rename_actual_files(actual_files_folder, json_files_folder):
    # Map project IDs to GUIDs from JSON files
    project_id_to_guid = {}
    for json_file in os.listdir(json_files_folder):
        if json_file.endswith(".txt"):
            json_path = os.path.join(json_files_folder, json_file)
            project_id, guid = extract_project_id_and_guid_from_json(json_path)
            if project_id and guid:
                project_id_to_guid[project_id] = guid

    # Rename files in the folder based on project ID and GUID
    for file_name in os.listdir(actual_files_folder):
        if file_name.endswith(".pdf") or file_name.endswith(".txt"):
            project_id = extract_project_id(file_name)
            if project_id in project_id_to_guid:
                guid = project_id_to_guid[project_id]
                rename_actual_file(os.path.join(actual_files_folder, file_name), project_id, guid)

# Rename files and create Excel summary
actual_files_folder = f"{download_folder_path}/{selected_file_type}_folder"
json_files_folder = f"{download_folder_path}/{selected_file_type}_category_folder"

rename_actual_files(actual_files_folder, json_files_folder)

# Function to create an Excel summary for downloaded files
def create_excel_summary(folder_path):
    # Initialize the Excel workbook and sheet
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "Downloaded Files"

    # Set column headers
    sheet["A1"] = "Proj.Id"
    sheet["B1"] = "File Type"

    # Populate the Excel sheet with file information
    row_num = 2
    for file_name in os.listdir(folder_path):
        if "_" in file_name and "." in file_name:
            parts = file_name.split("_")
            project_id = parts[0]
            file_type = parts[1]
            sheet.cell(row=row_num, column=1, value=project_id)
            sheet.cell(row=row_num, column=2, value=file_type)
            row_num += 1

    # Ensure the folder exists and create it if not
    os.makedirs(folder_path, exist_ok=True)  

    # Save the Excel workbook in the specified output folder
    workbook.save(f"{download_folder_path}/{selected_file_type}_Downloaded_Project_IDs.xlsx")

# Example folder path for creating the Excel summary

# Call the function to create the Excel summary
create_excel_summary(f"{download_folder_path}/{selected_file_type}_folder")


Error reading Excel file: [Errno 2] No such file or directory: '.xlsx'


FileNotFoundError: [Errno 2] No such file or directory: ''