# 1) Import libraries

In [33]:
import re  # # Importing the 're' module for regular expressions
import os  # Importing the os module for operating system related tasks
import pandas as pd  # Importing the pandas library for data manipulation
from PDFDataExtractor import PDFDataExtractor  # Importing the PDFDataExtractor class from the PdfDataExtractor module


# 2) Data Extraction

In [35]:
def extract_data_from_pdf_files(folder_path, output_folder, output_file):
    """
    Extract data from PDF files in the given folder and save it to an Excel file.

    Every entry of ``folder_path`` whose name ends in ``.pdf`` (any letter case)
    is opened with ``PDFDataExtractor``. One row per PDF is collected, in sorted
    file-name order, and the rows are written to ``output_folder/output_file``.

    Parameters:
    - folder_path (str): The path to the folder containing the PDF files.
    - output_folder (str): The path to the output folder (created if missing).
    - output_file (str): The name of the output Excel file.

    Returns:
    - None. When the folder contains no PDF files a message is printed and
      no Excel file is written.

    Raises:
    - OSError (e.g. FileNotFoundError): if ``folder_path`` cannot be listed.
    """
    # Sort so the row order is reproducible (os.listdir order is arbitrary),
    # and compare in lower case so files named '*.PDF' are not skipped.
    pdf_names = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith('.pdf')
    )

    # pd.concat / to_excel have nothing to work with for an empty folder;
    # report it instead of failing with "No objects to concatenate".
    if not pdf_names:
        print(f'No PDF files found in {folder_path}')
        return

    records = []

    for file_name in pdf_names:
        file_path = os.path.join(folder_path, file_name)

        # Create an instance of the PDFDataExtractor class with the PDF file path
        pdf_extractor = PDFDataExtractor(file_path)
        try:
            # The extraction calls are kept in their original order in case the
            # extractor is stateful between calls.
            country_name = pdf_extractor.extract_country_name()
            date = pdf_extractor.extract_date()
            fiscal_year = pdf_extractor.extract_fiscal_year()
            WBG_performance_text = pdf_extractor.extract_WBG_performance()
            development_outcome_text = pdf_extractor.extract_development_outcome()
            lessons_text = pdf_extractor.extract_lessons()
            summary_text = pdf_extractor.extract_executive_summary()
            assessment_text = pdf_extractor.extract_overall_assessment()
            strategic_text = pdf_extractor.extract_strategic_focus()
            assessment_CLR_text = pdf_extractor.extract_assessment_clr()
            # Additional data extraction can be added here

            # Keep only the first paragraph (text before the first blank line).
            # NOTE(review): the extractor may return None/empty when the section
            # is missing - TODO confirm; such values are passed through unchanged
            # instead of crashing in re.split.
            if assessment_CLR_text:
                first_paragraph = re.split(r'\n\n+', assessment_CLR_text)[0]
            else:
                first_paragraph = assessment_CLR_text

            # One record per PDF; key order defines the Excel column order.
            records.append({
                'Country': country_name,
                'Date': date,
                'Fiscal Year': fiscal_year,
                'Executive Summary Section': summary_text,
                'Strategic Focus section': strategic_text,
                'Development Outcome Section': development_outcome_text,
                # NOTE(review): the duplicated word "Section" is kept as-is because
                # downstream consumers may rely on this exact header.
                'WBG Performance Section Section': WBG_performance_text,
                'Assessment of CLR Section': first_paragraph,
                'Findings and Lessons Section': lessons_text,
                'Assessment and Ratings Section': assessment_text,
            })
        finally:
            # Always release the extractor, even when an extraction step fails
            pdf_extractor.close()

    # Build the combined table once from all records
    df_combined = pd.DataFrame(records)

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Write the combined DataFrame to an Excel file in the output folder
    output_path = os.path.join(output_folder, output_file)
    df_combined.to_excel(output_path, index=False)

    print(f'Data written to {output_path}')


# Example usage: set the three values below, then run the extraction
folder_path = '/Users/adilqasin/Documents/Munib Projects/1 project/new'  # folder holding the source PDF files
output_folder = '/Users/adilqasin/Documents/Munib Projects/1 project/output'  # folder that receives the Excel file
output_file = "tem.xlsx"  # name of the Excel file to write
extract_data_from_pdf_files(
    folder_path=folder_path,
    output_folder=output_folder,
    output_file=output_file,
)

Data written to /Users/adilqasin/Documents/Munib Projects/1 project/output/tem.xlsx


# Comments 

Please make sure that `PDFDataExtractor.py` is in the same directory as this notebook; otherwise the import in the first cell will fail.