In [8]:
import requests
import pdfplumber
import pandas as pd
from io import BytesIO

# Download PDF from URL
url = "https://hdr.undp.org/system/files/documents/global-report-document/hdr2023-24overviewen.pdf"
response = requests.get(url)
pdfFile = BytesIO(response.content) # To avoid saving files to disk

# List out the column headers from the PDF table
customColumns = [
    "Rank", "HDI Rank (2021)", "Country", "HDI", "IHDI", "Loss due to inequality (%)",
    "GDI", "GDI Group", "GII", "GII Rank", "Maternal mortality ratio",
    "Adolescent birth rate", "Female seats in parliament (%)",
    "Female secondary education (%)", "Male secondary education (%)",
    "Female labour force participation (%)", "Male labour force participation (%)",
    "Multidimensional Poverty Index (MPI)", "Intensity of deprivation (%)",
    "Planetary pressures–adjusted HDI", "Difference from HDI"
]

# List the pages you want to extract the tables from
pagesToExtract = list(range(41, 47))  # i.e. pages 41 to 46

# Create a list to store all extracted DataFrames
allDataFrames = []

# Loop through each page to extract tables
with pdfplumber.open(pdfFile) as pdf:
    for pageNum in pagesToExtract:
        page = pdf.pages[pageNum - 1]
        tables = page.extract_tables() # Call the extract_tables() method to look for tables

        print(f"\n Page {pageNum} — {len(tables)} tables found")

        for i, table in enumerate(tables):
            df = pd.DataFrame(table) # Convert table into Pandas DataFrame

            # Skip the first row of the table 
            df = df[1:]

            # Only keep as many columns as defined and assign headers
            df = df.iloc[:, :len(customColumns)] # Use Pandas int location method to select data
            df.columns = customColumns[:df.shape[1]]

            # Replace any missing values with NA
            df.replace(["..", "—", "", None], pd.NA, inplace=True)

            # Add to the list of Dataframes
            allDataFrames.append(df)

# Concatenate all tables into one
final_df = pd.concat(allDataFrames, ignore_index=True)

# Uncomment if you want to see all the rows
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_rows', None)
# display(final_df)

# Show the first few rows
display(final_df.head(20))

# Optional: save to CSV
# final_df.to_csv("hdr_combined_table.csv", index=False)


 Page 41 — 1 tables found

 Page 42 — 1 tables found

 Page 43 — 1 tables found

 Page 44 — 1 tables found

 Page 45 — 0 tables found

 Page 46 — 0 tables found


Unnamed: 0,Rank,HDI Rank (2021),Country,HDI,IHDI,Loss due to inequality (%),GDI,GDI Group,GII,GII Rank,Maternal mortality ratio,Adolescent birth rate,Female seats in parliament (%),Female secondary education (%)
0,1,Switzerland,0.967,0.891,7.9,0.971,2,0.018,3.0,,,,0.826,14.6
1,2,Norway,0.966,0.903,6.5,0.986,1,0.012,2.0,,,,0.808,16.4
2,3,Iceland,0.959,0.91,5.1,0.975,1,0.039,9.0,,,,0.806,16.0
3,4,"Hong Kong, China (SAR)",0.956,0.84,12.1,0.972,2,,,,,,,
4,5,Denmark,0.952,0.898,5.7,0.981,1,0.009,1.0,,,,0.839,11.9
5,5,Sweden,0.952,0.878,7.8,0.983,1,0.023,4.0,,,,0.839,11.9
6,7,Germany,0.95,0.881,7.3,0.966,2,0.071,19.0,,,,0.833,12.3
7,7,Ireland,0.95,0.886,6.7,0.991,1,0.072,20.0,,,,0.814,14.3
8,9,Singapore,0.949,0.825,13.1,0.991,1,0.036,8.0,,,,0.745,21.5
9,10,Australia,0.946,0.86,9.1,0.978,1,0.063,17.0,,,,0.763,19.3
