In [1]:
import fitz  # PyMuPDF

# Path to the PDF file (relative path, resolved against the current working directory)
pdf_path = "mcs2024.pdf"

# Open the PDF file and extract the text of its first page (index 0)
with fitz.open(pdf_path) as pdf:
    page = pdf[0]  # 0-based index: 0 is the first page of the document
    text = page.get_text()

# One list entry per line of the extracted page text
text_list = text.split('\n')

In [2]:
# Display the lines of text extracted from the page
text_list

['U.S. Department of the Interior',
 'U.S. Geological Survey',
 'MINERAL COMMODITY ',
 'SUMMARIES 2024',
 'Silicon',
 'Silver',
 'Soda Ash',
 'Stone',
 'Strontium',
 'Sulfur',
 'Talc',
 'Tantalum',
 'Tellurium',
 'Thallium',
 'Thorium',
 'Tin',
 'Titanium',
 'Tungsten',
 'Vanadium',
 'Vermiculite',
 'Wollastonite',
 'Yttrium',
 'Zeolites',
 'Zinc',
 'Zirconium',
 'Mercury',
 'Mica',
 'Molybdenum',
 'Nickel',
 'Niobium',
 'Nitrogen',
 'Palladium',
 'Peat',
 'Perlite',
 'Phosphate Rock',
 'Platinum',
 'Potash',
 'Pumice',
 'Quartz',
 'Rare Earths',
 'Rhenium',
 'Rubidium',
 'Salt',
 'Sand and Gravel',
 'Scandium',
 'Selenium',
 'Fluorspar',
 'Gallium',
 'Garnet',
 'Gemstones',
 'Germanium',
 'Gold',
 'Graphite',
 'Gypsum',
 'Hafnium',
 'Helium',
 'Indium',
 'Iodine',
 'Iron and Steel',
 'Iron Ore',
 'Iron Oxide Pigments',
 'Kyanite',
 'Lead',
 'Lime',
 'Lithium',
 'Magnesium',
 'Manganese',
 'Abrasives',
 'Aluminum',
 'Antimony',
 'Arsenic',
 'Asbestos',
 'Barite',
 'Bauxite',
 'Berylliu

In [3]:
import pdfplumber

def extract_word_positions(pdf_path, i):
    """Return the words of one PDF page together with their bounding boxes.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.
        i: Index into ``pdf.pages`` of the page to read.

    Returns:
        A list with one dict per word, in the order produced by
        ``page.extract_words()``. Each dict holds the keys ``'text'``,
        ``'x0'``, ``'top'``, ``'x1'`` and ``'bottom'`` copied from the
        pdfplumber word, plus ``'page_number'`` set to the requested index ``i``.
    """
    copied_keys = ('text', 'x0', 'top', 'x1', 'bottom')

    with pdfplumber.open(pdf_path) as pdf:
        page_words = pdf.pages[i].extract_words()
        # Copy a fixed subset of each word's fields and tag it with the page index.
        return [
            {**{key: entry[key] for key in copied_keys}, 'page_number': i}
            for entry in page_words
        ]

def group_words_by_sequence(word_positions):
    """Group consecutive words that share the same box height.

    The height of a word is ``bottom - top`` rounded to 2 decimals. A new
    group starts every time the height differs from that of the previous word,
    so the same height can appear in several non-adjacent groups.

    Args:
        word_positions: Iterable of dicts with at least the keys ``'text'``,
            ``'top'`` and ``'bottom'``.

    Returns:
        A list of ``{'size': height, 'words': [text, ...]}`` dicts in reading
        order; an empty list when there are no words.
    """
    runs = []
    for entry in word_positions:
        height = round(entry['bottom'] - entry['top'], 2)
        if runs and runs[-1]['size'] == height:
            # Same height as the previous word: extend the current run.
            runs[-1]['words'].append(entry['text'])
        else:
            # First word, or the height changed: open a new run.
            runs.append({'size': height, 'words': [entry['text']]})
    return runs

In [4]:
# Read the words of the first page (index 0) and group consecutive words of equal height
pdf_path = 'mcs2024.pdf'
positions = extract_word_positions(pdf_path, 0)
sequences = group_words_by_sequence(positions)

# Print every group with its 1-based number, its height and its words
for group_number, seq in enumerate(sequences, start=1):
    size, words = seq['size'], seq['words']
    print(f"Group {group_number} - Words with size {size}: {words}")

Group 1 - Words with size 12.0: ['U.S.', 'Department', 'of', 'the', 'Interior', 'U.S.', 'Geological', 'Survey']
Group 2 - Words with size 36.0: ['MINERAL', 'COMMODITY', 'SUMMARIES', '2024']
Group 3 - Words with size 12.0: ['Abrasives', 'Fluorspar', 'Mercury', 'Silicon', 'Aluminum', 'Gallium', 'Mica', 'Silver', 'Antimony', 'Garnet', 'Molybdenum', 'Soda', 'Ash', 'Arsenic', 'Gemstones', 'Nickel', 'Stone', 'Asbestos', 'Germanium', 'Niobium', 'Strontium', 'Barite', 'Gold', 'Nitrogen', 'Sulfur', 'Bauxite', 'Graphite', 'Palladium', 'Talc', 'Beryllium', 'Gypsum', 'Peat', 'Tantalum', 'Bismuth', 'Hafnium', 'Perlite', 'Tellurium', 'Boron', 'Helium', 'Phosphate', 'Rock', 'Thallium', 'Bromine', 'Indium', 'Platinum', 'Thorium', 'Cadmium', 'Iodine', 'Potash', 'Tin', 'Cement', 'Iron', 'and', 'Steel', 'Pumice', 'Titanium', 'Cesium', 'Iron', 'Ore', 'Quartz', 'Tungsten', 'Chromium', 'Iron', 'Oxide', 'Pigments', 'Rare', 'Earths', 'Vanadium', 'Clays', 'Kyanite', 'Rhenium', 'Vermiculite', 'Cobalt', 'Lead', 

In [5]:
# Index 2 is "Group 3" in the printed output above: the run of 12.0-high words that
# follows the 36.0-high title. Multi-word names arrive as separate tokens
# (e.g. 'Soda', 'Ash'), because the grouping works word by word.
materials = sequences[2]['words']

In [6]:
# Number of word tokens in the group (multi-word names count once per word)
len(materials)

94

In [7]:
# Display the word tokens kept in `materials`
materials

['Abrasives',
 'Fluorspar',
 'Mercury',
 'Silicon',
 'Aluminum',
 'Gallium',
 'Mica',
 'Silver',
 'Antimony',
 'Garnet',
 'Molybdenum',
 'Soda',
 'Ash',
 'Arsenic',
 'Gemstones',
 'Nickel',
 'Stone',
 'Asbestos',
 'Germanium',
 'Niobium',
 'Strontium',
 'Barite',
 'Gold',
 'Nitrogen',
 'Sulfur',
 'Bauxite',
 'Graphite',
 'Palladium',
 'Talc',
 'Beryllium',
 'Gypsum',
 'Peat',
 'Tantalum',
 'Bismuth',
 'Hafnium',
 'Perlite',
 'Tellurium',
 'Boron',
 'Helium',
 'Phosphate',
 'Rock',
 'Thallium',
 'Bromine',
 'Indium',
 'Platinum',
 'Thorium',
 'Cadmium',
 'Iodine',
 'Potash',
 'Tin',
 'Cement',
 'Iron',
 'and',
 'Steel',
 'Pumice',
 'Titanium',
 'Cesium',
 'Iron',
 'Ore',
 'Quartz',
 'Tungsten',
 'Chromium',
 'Iron',
 'Oxide',
 'Pigments',
 'Rare',
 'Earths',
 'Vanadium',
 'Clays',
 'Kyanite',
 'Rhenium',
 'Vermiculite',
 'Cobalt',
 'Lead',
 'Rubidium',
 'Wollastonite',
 'Copper',
 'Lime',
 'Salt',
 'Yttrium',
 'Diamond',
 'Lithium',
 'Sand',
 'and',
 'Gravel',
 'Zeolites',
 'Diatomite',

In [8]:
## Split the document into sections (each section generally spans 2 pages)

# Step 1: spot the section title on each page


In [17]:
import pdfplumber
import re

# List of elements to search for.
# The hard-coded subset below is commented out; the token list held in
# `materials` is used instead (see the assignment at the end of this block).
# elements = ['Abrasives',
#  'Fluorspar',
#  'Mercury',
#  'Silicon',
#  'Aluminum',
#  'Gallium',
#  'Mica',
#  'Silver',
#  'Antimony',
#  'Garnet',
#  'Molybdenum',
#  'Soda',
#  'Ash',
#  'Arsenic',
#  'Gemstones',
#  'Nickel',
#  'Stone',
#  'Asbestos',
#  'Germanium',
#  'Niobium',
#  'Strontium',
#  'Barite',
#  'Gold',
#  'Nitrogen',
#  'Sulfur',] 

elements = materials

def extract_largest_text(page):
    """Return the text of the tallest words on a page, joined by single spaces.

    The height of a word's bounding box (``bottom - top``) is used as a proxy
    for its font size. Heights are rounded to 2 decimals before being compared,
    the same way ``group_words_by_sequence`` does, so that words whose heights
    differ only by floating-point noise still count as the same size.

    Args:
        page: Object exposing ``extract_words()`` that returns a list of dicts
            with at least the keys ``'text'``, ``'top'`` and ``'bottom'``.

    Returns:
        The tallest words in their original order, separated by spaces, or
        ``""`` when the page has no words.
    """
    words = page.extract_words()
    if not words:
        return ""

    # Compute every height once; rounding absorbs float jitter between lines.
    heights = [round(word['bottom'] - word['top'], 2) for word in words]
    largest_size = max(heights)

    # Keep the words whose rounded height equals the largest one, in page order.
    largest_words = [
        word['text'] for word, height in zip(words, heights) if height == largest_size
    ]
    return ' '.join(largest_words)

# def match_element_in_text(elements, text):
#     text = text.lower()  # Convertir le texte en minuscule pour correspondre de manière insensible à la casse
#     for element in elements:
#         # Créer une expression régulière pour correspondre à l'élément avec des préfixes et suffixes possibles
#         regex = r'(\b\w*\s*)?' + re.escape(element.lower()) + r'(\s*\w*\b)?'
#         if re.search(regex, text):
#             return element
#     return None

def match_element_in_text(elements, text):
    """Return the first element of ``elements`` that occurs in ``text``, ignoring case.

    Matching is a plain substring search (each element is regex-escaped), so
    an element also matches when it is surrounded by other characters.
    NOTE: because of that, a short element can match inside a longer word
    (e.g. 'Tin' inside 'Platinum'); the result depends on the order of
    ``elements``, since the first hit wins.

    Args:
        elements: Iterable of strings to look for, in priority order.
        text: Text to search in.

    Returns:
        The matching element exactly as it appears in ``elements``, or ``None``
        when no element is found in ``text``.
    """
    haystack = text.lower()
    hits = (
        candidate
        for candidate in elements
        if re.search(re.escape(candidate.lower()), haystack, re.IGNORECASE)
    )
    return next(hits, None)

In [27]:
# Open the PDF and record, for each element, the indices of the pages whose
# largest text contains it
with pdfplumber.open("mcs2024.pdf") as pdf:
    # One (initially empty) list of page indices per element
    pdf_sections = {element: [] for element in elements}

    # Only page indices 33 to 44 are scanned here, not the whole document
    for i in range(33, 45):
        page = pdf.pages[i]
        largest_text = extract_largest_text(page)

        # Skip the page when none of the elements appears in its largest text
        matched_element = match_element_in_text(elements, largest_text)
        if not matched_element:
            continue

        # `i` is the 0-based index into pdf.pages, not a printed page number
        pdf_sections[matched_element].append(i)

# Print only the elements for which at least one page was found
for element, pages in pdf_sections.items():
    if not pages:
        continue
    print(f"Element: {element}, Pages: {pages}")

Element: Abrasives, Pages: [33, 34]
Element: Aluminum, Pages: [35, 36]
Element: Antimony, Pages: [37, 38]
Element: Arsenic, Pages: [39, 40]
Element: Asbestos, Pages: [41, 42]
Element: Barite, Pages: [43, 44]


In [25]:
# Display the full element -> page-indices mapping, including elements with no page.
# NOTE(review): this cell's execution count (In [25]) is lower than the scan cell's
# (In [27]) and the saved output lists pages beyond range(33, 45), so the output
# presumably comes from an earlier run over a wider range. Re-run to refresh it.
pdf_sections

{'Abrasives': [33, 34],
 'Fluorspar': [],
 'Mercury': [],
 'Silicon': [],
 'Aluminum': [35, 36],
 'Gallium': [],
 'Mica': [],
 'Silver': [],
 'Antimony': [37, 38],
 'Garnet': [],
 'Molybdenum': [],
 'Soda': [],
 'Ash': [],
 'Arsenic': [39, 40],
 'Gemstones': [],
 'Nickel': [],
 'Stone': [],
 'Asbestos': [41, 42],
 'Germanium': [],
 'Niobium': [],
 'Strontium': [],
 'Barite': [43, 44],
 'Gold': [],
 'Nitrogen': [],
 'Sulfur': [],
 'Bauxite': [46],
 'Graphite': [],
 'Palladium': [],
 'Talc': [],
 'Beryllium': [47, 48],
 'Gypsum': [],
 'Peat': [],
 'Tantalum': [],
 'Bismuth': [49, 50],
 'Hafnium': [],
 'Perlite': [],
 'Tellurium': [],
 'Boron': [51, 52],
 'Helium': [],
 'Phosphate': [],
 'Rock': [],
 'Thallium': [],
 'Bromine': [53, 54],
 'Indium': [],
 'Platinum': [],
 'Thorium': [],
 'Cadmium': [55, 56],
 'Iodine': [],
 'Potash': [],
 'Tin': [],
 'Cement': [57, 58],
 'Iron': [],
 'and': [],
 'Steel': [],
 'Pumice': [],
 'Titanium': [],
 'Cesium': [59],
 'Ore': [],
 'Quartz': [],
 'Tungs

In [26]:
# nombre_de_listes_vides = sum(1 for pages in pdf_sections.values() if not pages)
# nombre_de_listes_vides

77

In [29]:
# Keep only the elements for which at least one page index was recorded
pdf_restric = {
    element: page_indices
    for element, page_indices in pdf_sections.items()
    if page_indices
}

pdf_restric

{'Abrasives': [33, 34],
 'Aluminum': [35, 36],
 'Antimony': [37, 38],
 'Arsenic': [39, 40],
 'Asbestos': [41, 42],
 'Barite': [43, 44]}

In [30]:
import json

# Output file name (relative path, resolved against the current working directory)
json_filename = 'pdf_restric.json'

# Serialize the filtered mapping with 4-space indentation and write it to disk
with open(json_filename, 'w') as json_file:
    json_file.write(json.dumps(pdf_restric, indent=4))

print(f"Data saved to {json_filename}")

Data saved to pdf_restric.json
