In [65]:
import pdfplumber
import pandas as pd

with pdfplumber.open('mcs2024.pdf') as pdf:
    # iterate over each page
    p0 = pdf.pages[33]
    text = p0.extract_text()

In [66]:
text

'30\nABRASIVES (MANUFACTURED)\n(Fused aluminum oxide, silicon carbide, and metallic abrasives)\n(Data in metric tons unless otherwise specified)\nDomestic Production and Use: In 2023, fused aluminum oxide was produced by two companies at three plants in\nthe United States and Canada. Production of crude fused aluminum oxide had an estimated value of $3.4 million.\nSilicon carbide was produced by two companies at two plants in the United States. Production of crude silicon carbide\nhad an estimated value of about $28 million. Metallic abrasives were produced by 11 companies in eight States.\nProduction of metallic abrasives had an estimated value of about $130 million, and metallic abrasive shipments were\nvalued at $140 million. Bonded and coated abrasive products accounted for most abrasive uses of fused aluminum\noxide and silicon carbide. Metallic abrasives are used primarily for steel shot and grit and cut wire shot, which are\nused for sandblasting, peening, and stonecutting appli

## Extract the elements between parenthesis

In [67]:
import pdfplumber
import re

# Define the correct path to the PDF file
pdf_path = "mcs2024.pdf"

# Function to extract text between parentheses after the title and before the next heading
def extract_relevant_text(text):
    # Find the text between 'ABRASIVES (MANUFACTURED)' and 'Domestic Production and Use:'
    pattern = re.compile(r'ABRASIVES \(MANUFACTURED\)(.*)Domestic Production and Use:', re.DOTALL)
    match = pattern.search(text)
    if match:
        relevant_text = match.group(1)
        # Extract text within parentheses and remove parentheses
        parentheses_text = re.findall(r'\((.*?)\)', relevant_text)
        return parentheses_text
    return []

# Open the PDF file and extract text from page 33
with pdfplumber.open(pdf_path) as pdf:
    page = pdf.pages[33]  # Page 33 is index 32
    text = page.extract_text()

# Extract the relevant text
parentheses_text = extract_relevant_text(text)
print(parentheses_text)


['Fused aluminum oxide, silicon carbide, and metallic abrasives', 'Data in metric tons unless otherwise specified']


## Extract the table

In [123]:
import pdfplumber
import re

# Define the correct path to the PDF file
pdf_path = "mcs2024.pdf"

# Function to extract text between the specified headings
def extract_text_between_headings(text, start_heading, end_heading):
    # Find the text between start_heading and end_heading
    pattern = re.compile(re.escape(start_heading) + r'(.*?)' + re.escape(end_heading), re.DOTALL)
    match = pattern.search(text)
    if match:
        return match.group(1).strip()
    return ""

# Open the PDF file and extract text from page 33
with pdfplumber.open(pdf_path) as pdf:
    page = pdf.pages[56]  # Page 33 is index 32
    text = page.extract_text()

# Extract the text between 'ABRASIVES (MANUFACTURED)' and 'Domestic Production and Use:'
start_heading = "World Refinery Production and Reserves"
# start_heading = "World Smelter Production and Capacity"
end_heading = "World Resources"
relevant_text = extract_text_between_headings(text, start_heading, end_heading)

# relevant_text = f"""
# {relevant_text}
# """

In [128]:
tab_count = text.count('\t')

tab_count

# text

0

## Ignore

In [44]:
import re
import pandas as pd

# Sample text containing the table
text = """
: Fused aluminum oxide capacity data for Austria were revised based on company reports.
Fused aluminum oxidee Silicon carbidee
2022 2023 2022 2023
United States — — 40,000 40,000
United States and Canada 60,000 60,000 — —
Australia 50,000 50,000 — —
Austria 90,000 90,000 — —
Brazil 50,000 50,000 40,000 40,000
China 800,000 800,000 450,000 450,000
France 40,000 40,000 20,000 20,000
Germany 80,000 80,000 35,000 35,000
India 40,000 40,000 5,000 5,000
Japan 15,000 15,000 60,000 60,000
Mexico — — 45,000 45,000
Norway — — 80,000 80,000
Venezuela — — 30,000 30,000
Other countries 80,000 80,000 200,000 200,000
World total (rounded) 1,310,000 1,300,000 1,000,000 1,000,000
"""

# Regular expression to match the table header and data
header_pattern = re.compile(r'(.*\n)+\d{4} \d{4} \d{4} \d{4}\n')
data_pattern = re.compile(r'(\d{4} \d{4} \d{4} \d{4}\n)(.*?)(?=\n\n|\Z)', re.DOTALL)

# Find the header in the text
header_match = header_pattern.search(text)
if header_match:
    header_text = header_match.group(0)
    # Extract potential headers from the matched text
    headers = header_text.strip().split('\n')[-1].split()
    print(f'headers: {headers}')
else:
    headers = []

# Generate column names
if not headers:
    headers = [f"Col {i+1}" for i in range(5)]  # Assuming there are always 5 columns

# Find the table data in the text
data_match = data_pattern.search(text)

print(f'data_match: {data_match}')
if data_match:
    table_text = data_match.group(2)
    # Split the rows
    rows = table_text.strip().split('\n')

    # Prepare the data for the DataFrame
    data = []
    for row in rows:
        # Split the row by multiple spaces to separate the region and numeric values
        row_data = re.split(r'\s{2,}', row.strip())
        if len(row_data) == 5:
            data.append(row_data)
        else:
            # Handle cases where splitting might not have worked as expected
            # Try to split by first part being region and rest being numbers
            parts = row.rsplit(' ', 4)
            if len(parts) == 5:
                data.append(parts)

    # Create the DataFrame
    df = pd.DataFrame(data, columns=["1", "2", "3", "4", "5"])

    # Output the DataFrame
    df_output = df
else:
    df_output = None

df_output

headers: ['2022', '2023', '2022', '2023']
data_match: <re.Match object; span=(128, 673), match='2022 2023 2022 2023\nUnited States — — 40,000 40,>


Unnamed: 0,1,2,3,4,5
0,United States,—,—,40000,40000
1,United States and Canada,60000,60000,—,—
2,Australia,50000,50000,—,—
3,Austria,90000,90000,—,—
4,Brazil,50000,50000,40000,40000
5,China,800000,800000,450000,450000
6,France,40000,40000,20000,20000
7,Germany,80000,80000,35000,35000
8,India,40000,40000,5000,5000
9,Japan,15000,15000,60000,60000


In [47]:
import re
import pandas as pd

# Sample text containing the table
text = """
: Fused aluminum oxide capacity data for Austria were revised based on company reports.
Fused aluminum oxidee Silicon carbidee
2022 2023 2022 2023
United States — — 40,000 40,000
United States and Canada 60,000 60,000 — —
Australia 50,000 50,000 — —
Austria 90,000 90,000 — —
Brazil 50,000 50,000 40,000 40,000
China 800,000 800,000 450,000 450,000
France 40,000 40,000 20,000 20,000
Germany 80,000 80,000 35,000 35,000
India 40,000 40,000 5,000 5,000
Japan 15,000 15,000 60,000 60,000
Mexico — — 45,000 45,000
Norway — — 80,000 80,000
Venezuela — — 30,000 30,000
Other countries 80,000 80,000 200,000 200,000
World total (rounded) 1,310,000 1,300,000 1,000,000 1,000,000
"""

# Regular expression to match the table header and data
header_pattern = re.compile(r'(\w+[\w\s]*\n)+\d{4} \d{4} \d{4} \d{4}\n')
data_pattern = re.compile(r'(\d{4} \d{4} \d{4} \d{4}\n)(.*?)(?=\n\n|\Z)', re.DOTALL)

# Find the header in the text
header_match = header_pattern.search(text)
if header_match:
    header_text = header_match.group(0)
    # Extract potential headers from the matched text
    headers = header_text.strip().split('\n')[-1].split()
    print(f'headers: {headers}')
else:
    headers = []

# Generate column names
if not headers:
    headers = [f"Col {i+1}" for i in range(5)]  # Assuming there are always 5 columns

# Find the table data in the text
data_match = data_pattern.search(text)

if data_match:
    table_text = data_match.group(2)
    # Split the rows
    rows = table_text.strip().split('\n')

    # Prepare the data for the DataFrame
    data = []
    for row in rows:
        # Split the row by multiple spaces to separate the region and numeric values
        row_data = re.split(r'\s{2,}', row.strip())
        if len(row_data) == 5:
            data.append(row_data)
        else:
            # Handle cases where splitting might not have worked as expected
            # Try to split by first part being region and rest being numbers
            parts = row.rsplit(' ', 4)
            if len(parts) == 5:
                data.append(parts)

    # Create the DataFrame
    df = pd.DataFrame(data, columns=["Region"] + headers[:4])

    # Output the DataFrame
    df_output = df
else:
    df_output = None

df_output

headers: ['2022', '2023', '2022', '2023']


Unnamed: 0,Region,2022,2023,2022.1,2023.1
0,United States,—,—,40000,40000
1,United States and Canada,60000,60000,—,—
2,Australia,50000,50000,—,—
3,Austria,90000,90000,—,—
4,Brazil,50000,50000,40000,40000
5,China,800000,800000,450000,450000
6,France,40000,40000,20000,20000
7,Germany,80000,80000,35000,35000
8,India,40000,40000,5000,5000
9,Japan,15000,15000,60000,60000


## Best version but not really generalized

In [73]:
import re
import pandas as pd

# Sample text containing the table
text = """
: Fused aluminum oxide capacity data for Austria were revised based on company reports.
Fused aluminum oxidee Silicon carbidee
2022 2023 2022 2023
United States — — 40,000 40,000
United States and Canada 60,000 60,000 — —
Australia 50,000 50,000 — —
Austria 90,000 90,000 — —
Brazil 50,000 50,000 40,000 40,000
China 800,000 800,000 450,000 450,000
France 40,000 40,000 20,000 20,000
Germany 80,000 80,000 35,000 35,000
India 40,000 40,000 5,000 5,000
Japan 15,000 15,000 60,000 60,000
Mexico — — 45,000 45,000
Norway — — 80,000 80,000
Venezuela — — 30,000 30,000
Other countries 80,000 80,000 200,000 200,000
World total (rounded) 1,310,000 1,300,000 1,000,000 1,000,000
"""

# Regular expression to match the table header and data
header_pattern = re.compile(r'(.*\n){3}\d{4} \d{4} \d{4} \d{4}\n')
data_pattern = re.compile(r'(\d{4} \d{4} \d{4} \d{4}\n)(.*?)(?=\n\n|\Z)', re.DOTALL)

# Find the header in the text
header_match = header_pattern.search(text)
if header_match:
    header_lines = header_match.group(0).strip().split('\n')
    # Extract the two header lines
    headers_1 = header_lines[1].split(' ', 3)
    headers_2 = header_lines[2].split()
    # Manually combine the correct parts of the header lines
    headers = ["Region"]
    for i in range(len(headers_2)):
        if i < 2:
            headers.append(f"{headers_1[0]} {headers_1[1]} {headers_1[2]} {headers_2[i]}")
        else:
            headers.append(f"{headers_1[3]} {headers_2[i]}")
else:
    headers = []

# Find the table data in the text
data_match = data_pattern.search(text)

if data_match:
    table_text = data_match.group(2)
    # Split the rows
    rows = table_text.strip().split('\n')

    # Prepare the data for the DataFrame
    data = []
    for row in rows:
        # Split the row by multiple spaces to separate the region and numeric values
        row_data = re.split(r'\s{2,}', row.strip())
        if len(row_data) == 5:
            data.append(row_data)
        else:
            # Handle cases where splitting might not have worked as expected
            # Try to split by first part being region and rest being numbers
            parts = row.rsplit(' ', 4)
            if len(parts) == 5:
                data.append(parts)

    # Create the DataFrame
    df = pd.DataFrame(data, columns=headers)

    # Output the DataFrame
    df_output = df
else:
    df_output = None

df_output

Unnamed: 0,Region,Fused aluminum oxidee 2022,Fused aluminum oxidee 2023,Silicon carbidee 2022,Silicon carbidee 2023
0,United States,—,—,40000,40000
1,United States and Canada,60000,60000,—,—
2,Australia,50000,50000,—,—
3,Austria,90000,90000,—,—
4,Brazil,50000,50000,40000,40000
5,China,800000,800000,450000,450000
6,France,40000,40000,20000,20000
7,Germany,80000,80000,35000,35000
8,India,40000,40000,5000,5000
9,Japan,15000,15000,60000,60000


## Test test

In [97]:
print(relevant_text)


: Capacity data for China and the United States were revised based on
company and Government reports.
Smelter production Yearend capacity
2022 2023e 2022 2023e
United States 861 750 1,640 1,360
Australia 1,510 1,500 1,730 1,730
Bahrain 1,600 1,600 1,600 1,600
Brazil 811 1,100 1,280 1,280
Canada 2,770 3,000 3,270 3,270
China 40,200 41,000 44,300 45,000
Iceland e720 730 880 880
India e4,100 4,100 4,060 4,060
Malaysia e900 980 1,080 1,080
Norway e1,400 1,300 1,460 1,460
Russia 3,720 3,800 4,080 4,080
United Arab Emirates 2,650 2,700 2,790 2,790
Other countries 7,110 7,000 10,300 10,000
World total (rounded) 68,400 70,000 78,500 79,000



In [96]:
import re
import pandas as pd

# Sample text containing the table
text = relevant_text
print(f'▶️ text: {text}')


# Regular expression to match the table header and data
header_pattern = re.compile(r'(.*\n){3}\d{4} \d{4} \d{4} \d{4}\n')
data_pattern = re.compile(r'(\d{4} \d{4} \d{4} \d{4}\n)(.*?)(?=\n\n|\Z)', re.DOTALL)

# Find the header in the text
header_match = header_pattern.search(text)
print(f' ▶️ header_match: {header_match}')
if header_match:
    header_lines = header_match.group(0).strip().split('\n')
    print(f' ▶️ header_lines: {header_lines}')
    # Extract the two header lines
    headers_1 = header_lines[1].split(' ', 3)
    headers_2 = header_lines[2].split()
    # Manually combine the correct parts of the header lines
    headers = ["Region"]
    for i in range(len(headers_2)):
        if i < 2:
            headers.append(f"{headers_1[0]} {headers_1[1]} {headers_1[2]} {headers_2[i]}")
        else:
            headers.append(f"{headers_1[3]} {headers_2[i]}")
    
    print(f' ▶️ headers: {headers}')
else:
    headers = []

# Find the table data in the text
data_match = data_pattern.search(text)

if data_match:
    table_text = data_match.group(2)
    # Split the rows
    rows = table_text.strip().split('\n')

    # Prepare the data for the DataFrame
    data = []
    for row in rows:
        # Split the row by multiple spaces to separate the region and numeric values
        row_data = re.split(r'\s{2,}', row.strip())
        if len(row_data) == 5:
            data.append(row_data)
        else:
            # Handle cases where splitting might not have worked as expected
            # Try to split by first part being region and rest being numbers
            parts = row.rsplit(' ', 4)
            if len(parts) == 5:
                data.append(parts)

    # Create the DataFrame
    df = pd.DataFrame(data, columns=headers)

    # Output the DataFrame
    df_output = df
else:
    df_output = None

df_output

▶️ text: 
: Capacity data for China and the United States were revised based on
company and Government reports.
Smelter production Yearend capacity
2022 2023e 2022 2023e
United States 861 750 1,640 1,360
Australia 1,510 1,500 1,730 1,730
Bahrain 1,600 1,600 1,600 1,600
Brazil 811 1,100 1,280 1,280
Canada 2,770 3,000 3,270 3,270
China 40,200 41,000 44,300 45,000
Iceland e720 730 880 880
India e4,100 4,100 4,060 4,060
Malaysia e900 980 1,080 1,080
Norway e1,400 1,300 1,460 1,460
Russia 3,720 3,800 4,080 4,080
United Arab Emirates 2,650 2,700 2,790 2,790
Other countries 7,110 7,000 10,300 10,000
World total (rounded) 68,400 70,000 78,500 79,000

 ▶️ header_match: None


## test a more flexible regex search

In [120]:
relevant_text

'\n:\nRefinery productione Reserves9\n2022 2023\nUnited States1 10212 220 Quantitative estimates of reserves\nAustralia 10328 380 were not available. The cadmium\nBulgaria 340 340 content of typical zinc ores\nCanada 1,800 1,800 averages about 0.03%. See the\nChina 8,700 9,000 Zinc chapter for zinc reserves.\nGermany 320 —\nJapan 1,800 1,800\nKazakhstan 1,000 1,000\nKorea, Republic of 4,000 4,000\nMexico 101,170 1,100\nNetherlands 574 750\nNorway 420 380\nPeru 10460 790\nPoland 250 230\nRussia 1,000 1,000\nUzbekistan 220 220\nWorld total (rounded) 22,600 23,000\n'

In [116]:
import re
import pandas as pd

# Sample texts
text1 = """
: Fused aluminum oxide capacity data for Austria were revised based on company reports.
Fused aluminum oxidee Silicon carbidee
2022 2023 2022 2023
United States — — 40,000 40,000
United States and Canada 60,000 60,000 — —
Australia 50,000 50,000 — —
Austria 90,000 90,000 — —
Brazil 50,000 50,000 40,000 40,000
China 800,000 800,000 450,000 450,000
France 40,000 40,000 20,000 20,000
Germany 80,000 80,000 35,000 35,000
India 40,000 40,000 5,000 5,000
Japan 15,000 15,000 60,000 60,000
Mexico — — 45,000 45,000
Norway — — 80,000 80,000
Venezuela — — 30,000 30,000
Other countries 80,000 80,000 200,000 200,000
World total (rounded) 1,310,000 1,300,000 1,000,000 1,000,000
"""

text2 = """
: Capacity data for China and the United States were revised based on
company and Government reports.
Smelter production Yearend capacity
2022 2023e 2022 2023e
United States 861 750 1,640 1,360
Australia 1,510 1,500 1,730 1,730
Bahrain 1,600 1,600 1,600 1,600
Brazil 811 1,100 1,280 1,280
Canada 2,770 3,000 3,270 3,270
China 40,200 41,000 44,300 45,000
Iceland e720 730 880 880
India e4,100 4,100 4,060 4,060
Malaysia e900 980 1,080 1,080
Norway e1,400 1,300 1,460 1,460
Russia 3,720 3,800 4,080 4,080
United Arab Emirates 2,650 2,700 2,790 2,790
Other countries 7,110 7,000 10,300 10,000
World total (rounded) 68,400 70,000 78,500 79,000
"""

text3 = ''':
Cement productione Clinker capacitye
2022 2023 2022 2023
United States (includes Puerto Rico) 93,000 91,000 100,000 100,000
Brazil 64,000 63,000 60,000 60,000
China 2,100,000 2,100,000 2,000,000 2,000,000
Egypt 46,000 50,000 60,000 60,000
India 380,000 410,000 290,000 300,000
Indonesia 64,000 62,000 79,000 79,000
Iran 59,000 65,000 81,000 81,000
Japan 53,000 50,000 54,000 54,000
Korea, Republic of 51,000 50,000 62,000 62,000
Mexico 50,000 50,000 42,000 42,000
Russia 61,000 57,000 80,000 80,000
Saudi Arabia 52,000 53,000 75,000 75,000
Turkey 74,000 79,000 92,000 92,000
Vietnam 120,000 110,000 100,000 110,000
Other countries (rounded) 850,000 850,000 600,000 600,000
World total (rounded) 4,100,000 4,100,000 3,800,000 3,800,000'''

text4 = ''':
Refinery productione Reserves9
2022 2023
United States1 10212 220 Quantitative estimates of reserves
Australia 10328 380 were not available. The cadmium
Bulgaria 340 340 content of typical zinc ores
Canada 1,800 1,800 averages about 0.03%. See the
China 8,700 9,000 Zinc chapter for zinc reserves.
Germany 320 —
Japan 1,800 1,800
Kazakhstan 1,000 1,000
Korea, Republic of 4,000 4,000
Mexico 101,170 1,100
Netherlands 574 750
Norway 420 380
Peru 10460 790
Poland 250 230
Russia 1,000 1,000
Uzbekistan 220 220
World total (rounded) 22,600 23,000'''

def process_text(text):
    # Regular expression to match the table header and data
    header_pattern = re.compile(r'(.*\n){2,3}\d{4} \d{4}.*\d{4} \d{4}[a-zA-Z]*\n')
    data_pattern = re.compile(r'(\d{4} \d{4}.*\d{4} \d{4}[a-zA-Z]*\n)(.*?)(?=\n\n|\Z)', re.DOTALL)

    # Find the header in the text
    header_match = header_pattern.search(text)
    if header_match:
        header_lines = header_match.group(0).strip().split('\n')
        print(f"header_lines: {header_lines}")
        # Extract the two header lines
        headers_1 = header_lines[-2].split(' ', 3)
        headers_2 = header_lines[-1].split()
        # Manually combine the correct parts of the header lines
        headers = ["Region"]
        for i in range(len(headers_2)):
            if i < len(headers_1):
                headers.append(f"{headers_1[i]} {headers_2[i]}")
            else:
                headers.append(f"{headers_1[-1]} {headers_2[i]}")
    else:
        headers = []

    # Find the table data in the text
    data_match = data_pattern.search(text)

    if data_match:
        table_text = data_match.group(2)
        # Split the rows
        rows = table_text.strip().split('\n')

        # Prepare the data for the DataFrame
        data = []
        for row in rows:
            # Split the row by multiple spaces to separate the region and numeric values
            row_data = re.split(r'\s{2,}', row.strip())
            if len(row_data) == 5:
                data.append(row_data)
            else:
                # Handle cases where splitting might not have worked as expected
                # Try to split by first part being region and rest being numbers
                parts = row.rsplit(' ', 4)
                if len(parts) == 5:
                    data.append(parts)

        # Create the DataFrame
        df = pd.DataFrame(data, columns=headers)

        # Output the DataFrame
        return df
    else:
        return None

# Process both texts
df_output1 = process_text(text1)
df_output2 = process_text(text2)
df_output3 = process_text(text3)
df_output4 = process_text(text4)

# Display the DataFrames
print("DataFrame 1:")
df_output1

header_lines: [': Fused aluminum oxide capacity data for Austria were revised based on company reports.', 'Fused aluminum oxidee Silicon carbidee', '2022 2023 2022 2023']
header_lines: [': Capacity data for China and the United States were revised based on', 'company and Government reports.', 'Smelter production Yearend capacity', '2022 2023e 2022 2023e']
header_lines: [':', 'Cement productione Clinker capacitye', '2022 2023 2022 2023']
DataFrame 1:


Unnamed: 0,Region,Fused 2022,aluminum 2023,oxidee 2022,Silicon carbidee 2023
0,United States,—,—,40000,40000
1,United States and Canada,60000,60000,—,—
2,Australia,50000,50000,—,—
3,Austria,90000,90000,—,—
4,Brazil,50000,50000,40000,40000
5,China,800000,800000,450000,450000
6,France,40000,40000,20000,20000
7,Germany,80000,80000,35000,35000
8,India,40000,40000,5000,5000
9,Japan,15000,15000,60000,60000


In [100]:
print("\nDataFrame 2:")
df_output2


DataFrame 2:


Unnamed: 0,Region,Smelter 2022,production 2023e,Yearend 2022,capacity 2023e
0,United States,861,750,1640,1360
1,Australia,1510,1500,1730,1730
2,Bahrain,1600,1600,1600,1600
3,Brazil,811,1100,1280,1280
4,Canada,2770,3000,3270,3270
5,China,40200,41000,44300,45000
6,Iceland,e720,730,880,880
7,India,"e4,100",4100,4060,4060
8,Malaysia,e900,980,1080,1080
9,Norway,"e1,400",1300,1460,1460


In [110]:
print("DataFrame 3:")
df_output3

DataFrame 3:


Unnamed: 0,Region,Cement 2022,productione 2023,Clinker 2022,capacitye 2023
0,United States (includes Puerto Rico),93000,91000,100000,100000
1,Brazil,64000,63000,60000,60000
2,China,2100000,2100000,2000000,2000000
3,Egypt,46000,50000,60000,60000
4,India,380000,410000,290000,300000
5,Indonesia,64000,62000,79000,79000
6,Iran,59000,65000,81000,81000
7,Japan,53000,50000,54000,54000
8,"Korea, Republic of",51000,50000,62000,62000
9,Mexico,50000,50000,42000,42000


In [117]:
# Display the DataFrames
print("DataFrame 4:")
df_output4

DataFrame 4:


In [119]:
# import re
# import pandas as pd

# # Sample texts
# text1 = """
# : Fused aluminum oxide capacity data for Austria were revised based on company reports.
# Fused aluminum oxidee Silicon carbidee
# 2022 2023 2022 2023
# United States — — 40,000 40,000
# United States and Canada 60,000 60,000 — —
# Australia 50,000 50,000 — —
# Austria 90,000 90,000 — —
# Brazil 50,000 50,000 40,000 40,000
# China 800,000 800,000 450,000 450,000
# France 40,000 40,000 20,000 20,000
# Germany 80,000 80,000 35,000 35,000
# India 40,000 40,000 5,000 5,000
# Japan 15,000 15,000 60,000 60,000
# Mexico — — 45,000 45,000
# Norway — — 80,000 80,000
# Venezuela — — 30,000 30,000
# Other countries 80,000 80,000 200,000 200,000
# World total (rounded) 1,310,000 1,300,000 1,000,000 1,000,000
# """

# text2 = ''':
# Refinery productione Reserves9
# 2022 2023
# United States1 10212 220 Quantitative estimates of reserves
# Australia 10328 380 were not available. The cadmium
# Bulgaria 340 340 content of typical zinc ores
# Canada 1,800 1,800 averages about 0.03%. See the
# China 8,700 9,000 Zinc chapter for zinc reserves.
# Germany 320 —
# Japan 1,800 1,800
# Kazakhstan 1,000 1,000
# Korea, Republic of 4,000 4,000
# Mexico 101,170 1,100
# Netherlands 574 750
# Norway 420 380
# Peru 10460 790
# Poland 250 230
# Russia 1,000 1,000
# Uzbekistan 220 220
# World total (rounded) 22,600 23,000'''

# def process_text(text):
#     # Regular expression to match the table header and data
#     header_pattern = re.compile(r'(.+\n){2,3}\d{4} \d{4}[^\n]*\n')
#     data_pattern = re.compile(r'(\d{4} \d{4}[^\n]*\n)(.*?)(?=\n\n|\Z)', re.DOTALL)

#     # Find the header in the text
#     header_match = header_pattern.search(text)
#     if header_match:
#         header_lines = header_match.group(0).strip().split('\n')
#         # Extract the two header lines
#         headers_1 = header_lines[-2].split()
#         headers_2 = header_lines[-1].split()
#         # Manually combine the correct parts of the header lines
#         headers = ["Region"]
#         combined_headers = [f"{headers_1[i]} {headers_2[i]}" for i in range(len(headers_2))]
#         headers.extend(combined_headers)
#     else:
#         headers = []

#     # Find the table data in the text
#     data_match = data_pattern.search(text)

#     if data_match:
#         table_text = data_match.group(2)
#         # Split the rows
#         rows = table_text.strip().split('\n')

#         # Prepare the data for the DataFrame
#         data = []
#         for row in rows:
#             # Split the row by multiple spaces to separate the region and numeric values
#             row_data = re.split(r'\s{2,}', row.strip())
#             if len(row_data) == len(headers):
#                 data.append(row_data)
#             else:
#                 # Handle cases where splitting might not have worked as expected
#                 # Try to split by first part being region and rest being numbers
#                 parts = row.rsplit(' ', len(headers) - 1)
#                 if len(parts) == len(headers):
#                     data.append(parts)

#         # Create the DataFrame
#         df = pd.DataFrame(data, columns=headers)

#         # Output the DataFrame
#         return df
#     else:
#         return None

# # Process both texts
# df_output1 = process_text(text1)
# df_output2 = process_text(text2)

# # Display the DataFrames
# # print("DataFrame 1:")
# # print(df_output1)
# print("\nDataFrame 2:")
# df_output2


DataFrame 2:


Unnamed: 0,Region,Refinery 2022,productione 2023
0,United States1 10212 220 Quantitative estimates,of,reserves
1,Australia 10328 380 were not available.,The,cadmium
2,Bulgaria 340 340 content of typical,zinc,ores
3,"Canada 1,800 1,800 averages about 0.03%.",See,the
4,"China 8,700 9,000 Zinc chapter for",zinc,reserves.
5,Germany,320,—
6,Japan,1800,1800
7,Kazakhstan,1000,1000
8,"Korea, Republic of",4000,4000
9,Mexico,101170,1100
