In [4]:
# Base URL of the GROBID service the PDFs are sent to; the processing cell
# appends "/api/processFulltextDocument" to this value.
# NOTE(review): the host name suggests a publicly hosted instance — confirm it
# is acceptable to upload these documents there before running on new data.
grobid_server = "https://kermitt2-grobid.hf.space"

In [5]:
import os

input_path = "../data/raw"  # Replace with the actual folder path
output_path = "../data/interim/"  # Replace with the desired output folder path

# Build the work list: one (path, size_in_bytes) tuple for every regular file
# that sits directly inside input_path (sub-directories are ignored).
# The generator is lazy, so each entry is joined, tested and measured in turn.
candidate_paths = (os.path.join(input_path, entry) for entry in os.listdir(input_path))
files = [
    (candidate, os.path.getsize(candidate))
    for candidate in candidate_paths
    if os.path.isfile(candidate)
]

In [6]:
import subprocess
import time
import random
import os
from tqdm import tqdm 

# Convert every PDF listed in `files` to TEI XML by posting it to the GROBID
# server with curl. The cell is resumable: a PDF whose output file already
# exists and is non-empty is skipped, so it can simply be re-run after a failure.

# Per-document time budget for the curl call, in seconds (10 minutes).
GROBID_TIMEOUT_SECONDS = 600

# open() below does not create missing directories.
os.makedirs(output_path, exist_ok=True)

# Visit the files in random order without mutating `files`, which belongs to
# the listing cell.
# NOTE(review): the previous code sorted by size and then shuffled in place, so
# the sort never had any effect; random order is what actually ran and is kept.
# Use sorted(files, key=lambda item: item[1]) if smallest-first is wanted.
processing_order = random.sample(files, k=len(files))

for file_path, file_size in tqdm(processing_order, desc="Processando arquivos", unit="file"):
    # Nothing to send for an empty input file.
    if file_size == 0:
        tqdm.write(f"Skipped {file_path} (file size is zero)")
        continue

    filename = os.path.basename(file_path)
    output_filename = f"{filename}.grobid.tei.xml"
    output_file_path = os.path.join(output_path, output_filename)

    # Resume support: a non-empty output file means this PDF is already done.
    if os.path.exists(output_file_path) and os.path.getsize(output_file_path) > 0:
        tqdm.write(f"Skipped {filename} (output file already exists and is non-empty)")
        continue

    tqdm.write(f"Trying to extract {file_path} from PDF to TEI...")

    # curl's --form syntax treats ',' and ';' in an unquoted file name as field
    # separators, so the path is double-quoted with '\' and '"' escaped.
    escaped_path = file_path.replace("\\", "\\\\").replace('"', '\\"')

    curl_command = [
        "curl", "-v",
        # Exit non-zero on HTTP errors instead of saving the error page as TEI.
        "--fail",
        "-H", "Accept: application/xml",
        "--form", "consolidateCitations=1",
        "--form", "includeRawCitations=1",
        "--form", "segmentSentences=1",
        "--form", "generateIDs=1",
        "--form", f'input=@"{escaped_path}"',
        f"{grobid_server}/api/processFulltextDocument",
    ]

    # Download into a scratch file and only move it into place once the request
    # is known to have succeeded. Otherwise a timed-out or failed request could
    # leave a non-empty, truncated file that the resume check would accept.
    partial_file_path = output_file_path + ".part"
    failure_reason = None
    try:
        try:
            with open(partial_file_path, "wb") as output_file:
                result = subprocess.run(
                    curl_command,
                    stdout=output_file,
                    stderr=subprocess.PIPE,
                    timeout=GROBID_TIMEOUT_SECONDS,
                )
        except subprocess.TimeoutExpired:
            failure_reason = (
                f"The process did not finish within {GROBID_TIMEOUT_SECONDS // 60} minutes "
                "and was terminated."
            )
        else:
            if result.returncode != 0:
                # With -v the last stderr line is normally curl's own error message.
                stderr_lines = result.stderr.decode(errors="replace").strip().splitlines()
                detail = stderr_lines[-1] if stderr_lines else "no error output"
                failure_reason = f"curl exited with code {result.returncode}: {detail}"
            elif os.path.getsize(partial_file_path) == 0:
                failure_reason = "The server returned an empty response."

        if failure_reason is None:
            # Same-directory rename; replaces any empty leftover from earlier runs.
            os.replace(partial_file_path, output_file_path)
    finally:
        # After a successful os.replace the scratch file is gone and this is a no-op.
        if os.path.exists(partial_file_path):
            os.remove(partial_file_path)

    if failure_reason is None:
        tqdm.write(f"Processed {filename} -> {output_file_path}")
    else:
        tqdm.write(failure_reason)
        tqdm.write(f"{output_file_path} NOT PROCESSED.")

Processando arquivos:   0%|          | 0/21 [00:00<?, ?file/s]

Trying to extract ../data/raw/Entrepreneurship-in-transitional-economy_2011_.pdf from PDF to TEI...


Processando arquivos:   5%|▍         | 1/21 [00:21<07:19, 21.98s/file]

Processed Entrepreneurship-in-transitional-economy_2011_.pdf -> ../data/interim/Entrepreneurship-in-transitional-economy_2011_.pdf.grobid.tei.xml
Trying to extract ../data/raw/Ideology-and-the-international-economy-The-decline-and-fall-of-bretton-woods_2003_Palgrave-Macmillan.pdf from PDF to TEI...


Processando arquivos:  10%|▉         | 2/21 [03:50<41:46, 131.94s/file]

Processed Ideology-and-the-international-economy-The-decline-and-fall-of-bretton-woods_2003_Palgrave-Macmillan.pdf -> ../data/interim/Ideology-and-the-international-economy-The-decline-and-fall-of-bretton-woods_2003_Palgrave-Macmillan.pdf.grobid.tei.xml
Trying to extract ../data/raw/The-HarvardMIT-complexity-approach-to-development-and-Austrian-economics-Similarities-and-policy-implicati.pdf from PDF to TEI...


Processando arquivos:  14%|█▍        | 3/21 [04:30<26:55, 89.75s/file] 

Processed The-HarvardMIT-complexity-approach-to-development-and-Austrian-economics-Similarities-and-policy-implicati.pdf -> ../data/interim/The-HarvardMIT-complexity-approach-to-development-and-Austrian-economics-Similarities-and-policy-implicati.pdf.grobid.tei.xml
Trying to extract ../data/raw/Constructivism-individual-action-and-housing-finance-An-individualist-critique-of-approaches-to-housing-fi.pdf from PDF to TEI...


Processando arquivos:  19%|█▉        | 4/21 [04:48<17:22, 61.32s/file]

Processed Constructivism-individual-action-and-housing-finance-An-individualist-critique-of-approaches-to-housing-fi.pdf -> ../data/interim/Constructivism-individual-action-and-housing-finance-An-individualist-critique-of-approaches-to-housing-fi.pdf.grobid.tei.xml
Trying to extract ../data/raw/The-Perils-of-Regulating-COVID19-Insights-from-Kirznerian-Entrepreneurship-and-Ostromian-Polycentricity_20.pdf from PDF to TEI...


Processando arquivos:  24%|██▍       | 5/21 [05:12<12:45, 47.82s/file]

Processed The-Perils-of-Regulating-COVID19-Insights-from-Kirznerian-Entrepreneurship-and-Ostromian-Polycentricity_20.pdf -> ../data/interim/The-Perils-of-Regulating-COVID19-Insights-from-Kirznerian-Entrepreneurship-and-Ostromian-Polycentricity_20.pdf.grobid.tei.xml
Trying to extract ../data/raw/Conserving-Forests-in-Privatized-Commons-Trends-and-Management-Options-in-an-Ifugao-Village-Philippines_20.pdf from PDF to TEI...


Processando arquivos:  29%|██▊       | 6/21 [05:44<10:39, 42.63s/file]

Processed Conserving-Forests-in-Privatized-Commons-Trends-and-Management-Options-in-an-Ifugao-Village-Philippines_20.pdf -> ../data/interim/Conserving-Forests-in-Privatized-Commons-Trends-and-Management-Options-in-an-Ifugao-Village-Philippines_20.pdf.grobid.tei.xml
Trying to extract ../data/raw/Austrian-Economics-and-Compatibilist-Freedom_2024_Springer-Science-and-Business-Media-BV.pdf from PDF to TEI...


Processando arquivos:  33%|███▎      | 7/21 [06:22<09:34, 41.02s/file]

Processed Austrian-Economics-and-Compatibilist-Freedom_2024_Springer-Science-and-Business-Media-BV.pdf -> ../data/interim/Austrian-Economics-and-Compatibilist-Freedom_2024_Springer-Science-and-Business-Media-BV.pdf.grobid.tei.xml
Trying to extract ../data/raw/Motivational-Objects-in-Natural-Scenes-MONS-A-database-of-gt-800-objects_2017_Frontiers-Media-SA-infofront.pdf from PDF to TEI...


Processando arquivos:  38%|███▊      | 8/21 [06:49<07:55, 36.61s/file]

Processed Motivational-Objects-in-Natural-Scenes-MONS-A-database-of-gt-800-objects_2017_Frontiers-Media-SA-infofront.pdf -> ../data/interim/Motivational-Objects-in-Natural-Scenes-MONS-A-database-of-gt-800-objects_2017_Frontiers-Media-SA-infofront.pdf.grobid.tei.xml
Trying to extract ../data/raw/The-Trait-of-Extraversion-as-an-EnergyBased-Determinant-of-Entrepreneurs-SuccessThe-Case-of-Poland_2022_MD.pdf from PDF to TEI...


Processando arquivos:  43%|████▎     | 9/21 [07:20<06:56, 34.73s/file]

Processed The-Trait-of-Extraversion-as-an-EnergyBased-Determinant-of-Entrepreneurs-SuccessThe-Case-of-Poland_2022_MD.pdf -> ../data/interim/The-Trait-of-Extraversion-as-an-EnergyBased-Determinant-of-Entrepreneurs-SuccessThe-Case-of-Poland_2022_MD.pdf.grobid.tei.xml
Trying to extract ../data/raw/Otto-Neuraths-Scientific-Utopianism-RevisitedA-Refined-Model-for-Utopias-in-Thought-Experiments_2023_Sprin.pdf from PDF to TEI...


Processando arquivos:  48%|████▊     | 10/21 [07:53<06:16, 34.19s/file]

Processed Otto-Neuraths-Scientific-Utopianism-RevisitedA-Refined-Model-for-Utopias-in-Thought-Experiments_2023_Sprin.pdf -> ../data/interim/Otto-Neuraths-Scientific-Utopianism-RevisitedA-Refined-Model-for-Utopias-in-Thought-Experiments_2023_Sprin.pdf.grobid.tei.xml
Trying to extract ../data/raw/THE-CENTRAL-BANKING-SYSTEM-PARADOX_2023_Centre-of-Sociological-Research.pdf from PDF to TEI...


Processando arquivos:  52%|█████▏    | 11/21 [08:15<05:06, 30.69s/file]

Processed THE-CENTRAL-BANKING-SYSTEM-PARADOX_2023_Centre-of-Sociological-Research.pdf -> ../data/interim/THE-CENTRAL-BANKING-SYSTEM-PARADOX_2023_Centre-of-Sociological-Research.pdf.grobid.tei.xml
Trying to extract ../data/raw/Innovative-entrepreneurship-as-a-collaborative-effort-An-institutional-framework_2021_Now-Publishers-Inc.pdf from PDF to TEI...


Processando arquivos:  57%|█████▋    | 12/21 [09:34<06:46, 45.17s/file]

Processed Innovative-entrepreneurship-as-a-collaborative-effort-An-institutional-framework_2021_Now-Publishers-Inc.pdf -> ../data/interim/Innovative-entrepreneurship-as-a-collaborative-effort-An-institutional-framework_2021_Now-Publishers-Inc.pdf.grobid.tei.xml
Trying to extract ../data/raw/Entrepreneurship-research-mapping-intellectual-structures-and-research-trends_2019_Springer-Verlag-service.pdf from PDF to TEI...


Processando arquivos:  62%|██████▏   | 13/21 [10:31<06:30, 48.83s/file]

Processed Entrepreneurship-research-mapping-intellectual-structures-and-research-trends_2019_Springer-Verlag-service.pdf -> ../data/interim/Entrepreneurship-research-mapping-intellectual-structures-and-research-trends_2019_Springer-Verlag-service.pdf.grobid.tei.xml
Trying to extract ../data/raw/Austrian-persistence-Capitalbased-business-cycle-theory-and-the-dynamics-of-investment-spending_2006_.pdf from PDF to TEI...


Processando arquivos:  67%|██████▋   | 14/21 [10:51<04:41, 40.27s/file]

Processed Austrian-persistence-Capitalbased-business-cycle-theory-and-the-dynamics-of-investment-spending_2006_.pdf -> ../data/interim/Austrian-persistence-Capitalbased-business-cycle-theory-and-the-dynamics-of-investment-spending_2006_.pdf.grobid.tei.xml
Trying to extract ../data/raw/What-Can-Industrial-Policy-Do-Evidence-from-Singapore_2024_Springer (1).pdf from PDF to TEI...


Processando arquivos:  71%|███████▏  | 15/21 [11:32<04:01, 40.33s/file]

Processed What-Can-Industrial-Policy-Do-Evidence-from-Singapore_2024_Springer (1).pdf -> ../data/interim/What-Can-Industrial-Policy-Do-Evidence-from-Singapore_2024_Springer (1).pdf.grobid.tei.xml
Trying to extract ../data/raw/Editorial-Private-is-profit-and-the-public-is-dead_2018_Routledge-infotandfcouk.pdf from PDF to TEI...


Processando arquivos:  76%|███████▌  | 16/21 [11:40<02:33, 30.74s/file]

Processed Editorial-Private-is-profit-and-the-public-is-dead_2018_Routledge-infotandfcouk.pdf -> ../data/interim/Editorial-Private-is-profit-and-the-public-is-dead_2018_Routledge-infotandfcouk.pdf.grobid.tei.xml
Trying to extract ../data/raw/Demonstrated-risk-preferences-and-COVID19-regulations-in-the-United-States_2024_Springer.pdf from PDF to TEI...


Processando arquivos:  81%|████████  | 17/21 [12:03<01:53, 28.42s/file]

Processed Demonstrated-risk-preferences-and-COVID19-regulations-in-the-United-States_2024_Springer.pdf -> ../data/interim/Demonstrated-risk-preferences-and-COVID19-regulations-in-the-United-States_2024_Springer.pdf.grobid.tei.xml
Trying to extract ../data/raw/A-Historical-Intervention-in-the-Opportunity-Wars-Forgotten-Scholarship-the-DiscoveryCreation-Disruption-a.pdf from PDF to TEI...


Processando arquivos:  86%|████████▌ | 18/21 [12:41<01:33, 31.10s/file]

Processed A-Historical-Intervention-in-the-Opportunity-Wars-Forgotten-Scholarship-the-DiscoveryCreation-Disruption-a.pdf -> ../data/interim/A-Historical-Intervention-in-the-Opportunity-Wars-Forgotten-Scholarship-the-DiscoveryCreation-Disruption-a.pdf.grobid.tei.xml
Trying to extract ../data/raw/The-philosophical-contributions-of-Ludwig-von-Mises_1994_Kluwer-Academic-Publishers.pdf from PDF to TEI...


Processando arquivos:  90%|█████████ | 19/21 [12:45<00:46, 23.14s/file]

Processed The-philosophical-contributions-of-Ludwig-von-Mises_1994_Kluwer-Academic-Publishers.pdf -> ../data/interim/The-philosophical-contributions-of-Ludwig-von-Mises_1994_Kluwer-Academic-Publishers.pdf.grobid.tei.xml
Trying to extract ../data/raw/Editorial-Private-is-profit-and-the-public-is-dead_2018_Routledge-infotandfcouk (1).pdf from PDF to TEI...


Processando arquivos:  95%|█████████▌| 20/21 [12:52<00:18, 18.25s/file]

Processed Editorial-Private-is-profit-and-the-public-is-dead_2018_Routledge-infotandfcouk (1).pdf -> ../data/interim/Editorial-Private-is-profit-and-the-public-is-dead_2018_Routledge-infotandfcouk (1).pdf.grobid.tei.xml
Trying to extract ../data/raw/On-predictive-entrepreneurial-action-in-uncertain-illstructured-conditions_2021_Springer-Science-and-Busin.pdf from PDF to TEI...


Processando arquivos: 100%|██████████| 21/21 [13:13<00:00, 37.80s/file]

Processed On-predictive-entrepreneurial-action-in-uncertain-illstructured-conditions_2021_Springer-Science-and-Busin.pdf -> ../data/interim/On-predictive-entrepreneurial-action-in-uncertain-illstructured-conditions_2021_Springer-Science-and-Busin.pdf.grobid.tei.xml



