In [12]:
# !pip install pdfquery
# !pip install pandas
# !pip install pdfminer.six

### Papers Sections

In [None]:
import os
import pdfminer
from pdfminer.high_level import extract_text
import pandas as pd

In [13]:
# Function to extract text from a PDF using pdfminer
def extract_text_from_pdf(pdf_path):
    text = extract_text(pdf_path)
    return text

# Function to extract sections based on keywords from extracted text
def extract_section(text, section_name):
    # Search for the section and extract the relevant part of the text
    start = text.lower().find(section_name.lower())
    if start != -1:
        end = text.find('\n', start + len(section_name))  # Find the end of the section
        return text[start:end].strip()  # Extract the section text
    return ''

# Function to process all PDFs in the directory
def extract_from_pdfs(pdf_dir):
    # List all PDFs in the directory
    pdf_files = [f for f in os.listdir(pdf_dir) if f.endswith('.pdf')]
    
    data = []
    
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        
        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_path)
        
        # Extract sections (abstract, introduction, conclusion)
        abstract = extract_section(text, "Abstract")
        introduction = extract_section(text, "Introduction")
        conclusion = extract_section(text, "Conclusion")
        
        # Add the extracted data to the list
        data.append({
            "PDF": pdf_file,
            "Abstract": abstract,
            "Introduction": introduction,
            "Conclusion": conclusion
        })
    
    return data

# Function to save extracted data to a CSV file
def save_to_csv(data, output_file):
    df = pd.DataFrame(data)
    df.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")

# Define the path to your PDFs
pdf_dir = '../Data/Attribute_Papers/'

# Extract data from the PDFs
extracted_data = extract_from_pdfs(pdf_dir)

# Save the extracted data to a CSV
save_to_csv(extracted_data, 'extracted_text.csv')


The PDF <_io.BufferedReader name='../Data/Attribute_Papers/The complexification of the United Nations system.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Data saved to extracted_text.csv


### Text Extraction

In [14]:
import fitz  # PyMuPDF
import re


In [18]:
# Step 1: Extract text from PDF using PyMuPDF
def extract_text_from_pdf(pdf_path):
    # Open the PDF file
    document = fitz.open(pdf_path)
    
    # Extract text from each page
    full_text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        full_text += page.get_text("text")  # Extract raw text from the page
    
    return full_text

# Step 2: Segment the extracted text into sections based on heuristics
def segment_text_into_sections(text):
    # Heuristic to detect headings - large font sizes or bold text
    section_pattern = re.compile(r'([A-Z][A-Z\s]+|[A-Za-z0-9\s]+[A-Za-z])(\n|\s+)', re.MULTILINE)
    
    # Find headings (titles or subheadings)
    headings = section_pattern.findall(text)
    
    # Split the text into sections based on headings
    sections = []
    start_index = 0
    for heading, _ in headings:
        # Find the position of the heading
        section_start = text.find(heading, start_index)
        
        # The end of the section is where the next heading begins or end of text
        next_heading = text.find(headings[headings.index((heading, _)) + 1][0], section_start) if (headings.index((heading, _)) + 1) < len(headings) else len(text)
        
        section_content = text[section_start:next_heading].strip()
        
        # Store the section with its heading
        sections.append({"heading": heading.strip(), "content": section_content})
        start_index = next_heading
    
    return sections

# Step 3: Example function to process a single PDF
def process_pdf(pdf_path):
    # Extract text from PDF
    extracted_text = extract_text_from_pdf(pdf_path)
    
    # Segment text into sections
    sections = segment_text_into_sections(extracted_text)
    
    # Display the extracted sections (for inspection)
    for section in sections:
        print(f"Section: {section['heading']}")
        print(f"Content: {section['content'][:1000]}...")  # Print first 300 chars of content for preview
        print("-" * 80)  # Separator for clarity

In [20]:
# Example usage with a single PDF file
pdf_path = '../Data/Attribute_Papers/Why states act through formal international organizations.pdf'
process_pdf(pdf_path)


Section: Why States Act through Formal International Organizations
Content: Why States Act through Formal International Organizations
Author(s):...
--------------------------------------------------------------------------------
Section: Kenneth
Content: Kenneth W....
--------------------------------------------------------------------------------
Section: Abbott and  Duncan Snidal
Content: Abbott and  Duncan Snidal
Source:...
--------------------------------------------------------------------------------
Section: The Journal of Conflict
Content: The Journal of Conflict Resolution, Vol. 42, No. 1 (Feb., 1998), pp. 3-...
--------------------------------------------------------------------------------
Section: 32
Published
Content: 32
Published by:...
--------------------------------------------------------------------------------
Section: Sage
Content: Sage Publications, Inc....
--------------------------------------------------------------------------------
Section: Stable
Content: St

In [21]:
!pip install PyMuPDF scikit-learn transformers




In [22]:
import fitz  # PyMuPDF

def extract_text_with_metadata(pdf_path):
    document = fitz.open(pdf_path)
    text_data = []

    for page_num in range(len(document)):
        page = document.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]  # Get text blocks with layout info
        
        for block in blocks:
            if block['type'] == 0:  # Text block
                for line in block['lines']:
                    line_text = line['spans'][0]['text']
                    font_size = line['spans'][0]['size']
                    font_bold = line['spans'][0].get('font', '').startswith('bold')
                    bbox = line['spans'][0]['bbox']  # Bounding box to track position

                    # Append the extracted text with metadata (font size, bold, bbox)
                    text_data.append({
                        "text": line_text,
                        "font_size": font_size,
                        "font_bold": font_bold,
                        "bbox": bbox
                    })
    
    return text_data


In [23]:
import re

def segment_text_into_sections(text_data):
    sections = []
    current_section = ""
    current_heading = "Introduction"  # Default heading
    prev_font_size = None

    for line in text_data:
        line_text = line["text"].strip()

        # Heuristic 1: Identify headings based on larger font sizes or bold text
        if line["font_size"] > 12 or line["font_bold"]:
            if current_section:  # If there's content from previous section, store it
                sections.append({"heading": current_heading, "content": current_section.strip()})
            
            # Set new section heading based on the text (e.g., if it's a title or heading)
            current_heading = line_text if re.match(r"[A-Za-z]+", line_text) else "Unknown Section"
            current_section = ""  # Reset content for new section

        else:
            current_section += " " + line_text  # Add line content to current section

    # Append the last section
    if current_section:
        sections.append({"heading": current_heading, "content": current_section.strip()})

    return sections


In [24]:
def clean_and_structure_sections(sections):
    cleaned_sections = []
    
    for section in sections:
        heading = section["heading"]
        content = section["content"]
        
        # Clean the content (e.g., remove extra spaces, newlines, etc.)
        content = re.sub(r'\s+', ' ', content).strip()
        
        cleaned_sections.append({
            "heading": heading,
            "content": content
        })

    return cleaned_sections


In [25]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load pre-trained SciBERT model (or any model fine-tuned for scientific text)
tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_cased")
model = BertForSequenceClassification.from_pretrained("allenai/scibert_scivocab_cased")

def extract_themes_with_scibert(sections):
    themes = []
    
    for section in sections:
        heading = section["heading"]
        content = section["content"]
        
        # Tokenize the content
        inputs = tokenizer(content, return_tensors="pt", truncation=True, padding=True, max_length=512)
        
        # Predict the theme or classification
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_class = torch.argmax(logits, dim=1).item()

        themes.append({
            "heading": heading,
            "predicted_theme": predicted_class
        })
    
    return themes


2024-12-12 09:54:37.434280: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733986477.664481   17908 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733986477.729909   17908 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-12 09:54:38.320043: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
def process_pdf_for_themes(pdf_path):
    # Step 1: Extract text and metadata from the PDF
    text_data = extract_text_with_metadata(pdf_path)
    
    # Step 2: Segment text into sections based on heuristics
    sections = segment_text_into_sections(text_data)
    
    # Step 3: Clean and structure the text
    cleaned_sections = clean_and_structure_sections(sections)
    
    # Step 4: Analyze sections for themes using SciBERT
    themes = extract_themes_with_scibert(cleaned_sections)
    
    # Display results
    for theme in themes:
        print(f"Section: {theme['heading']}")
        print(f"Predicted Theme: {theme['predicted_theme']}")
        print("-" * 80)


In [28]:
# Example usage
pdf_path = pdf_path
result = process_pdf_for_themes(pdf_path)


Section: Introduction
Predicted Theme: 0
--------------------------------------------------------------------------------
Section: International Organizations
Predicted Theme: 1
--------------------------------------------------------------------------------


In [29]:
result

In [30]:
import pandas as pd
import fitz  # PyMuPDF
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import re
import os

# Initialize SciBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_cased")
model = BertForSequenceClassification.from_pretrained("allenai/scibert_scivocab_cased")

def extract_text_with_metadata(pdf_path):
    document = fitz.open(pdf_path)
    text_data = []
    
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]  # Get text blocks with layout info
        
        for block in blocks:
            if block['type'] == 0:  # Text block
                for line in block['lines']:
                    line_text = line['spans'][0]['text']
                    font_size = line['spans'][0]['size']
                    font_bold = line['spans'][0].get('font', '').startswith('bold')
                    bbox = line['spans'][0]['bbox']  # Bounding box to track position
                    
                    # Append the extracted text with metadata
                    text_data.append({
                        "text": line_text,
                        "font_size": font_size,
                        "font_bold": font_bold,
                        "bbox": bbox
                    })
    
    return text_data

def segment_text_into_sections(text_data):
    sections = []
    current_section = ""
    current_heading = "Introduction"  # Default heading
    prev_font_size = None

    for line in text_data:
        line_text = line["text"].strip()
        
        # Heuristic 1: Identify headings based on larger font sizes or bold text
        if line["font_size"] > 12 or line["font_bold"]:
            if current_section:  # If there's content from the previous section, store it
                sections.append({"heading": current_heading, "content": current_section.strip()})
            
            # Set new section heading based on the text
            current_heading = line_text if re.match(r"[A-Za-z]+", line_text) else "Unknown Section"
            current_section = ""  # Reset content for new section

        else:
            current_section += " " + line_text  # Add line content to current section

    # Append the last section
    if current_section:
        sections.append({"heading": current_heading, "content": current_section.strip()})

    return sections

def clean_and_structure_sections(sections):
    cleaned_sections = []
    
    for section in sections:
        heading = section["heading"]
        content = section["content"]
        
        # Clean the content (e.g., remove extra spaces, newlines, etc.)
        content = re.sub(r'\s+', ' ', content).strip()
        
        cleaned_sections.append({
            "heading": heading,
            "content": content
        })

    return cleaned_sections

def extract_themes_with_scibert(sections):
    themes = []
    
    for section in sections:
        heading = section["heading"]
        content = section["content"]
        
        # Tokenize the content
        inputs = tokenizer(content, return_tensors="pt", truncation=True, padding=True, max_length=512)
        
        # Predict the theme or classification
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_class = torch.argmax(logits, dim=1).item()

        themes.append({
            "heading": heading,
            "predicted_theme": predicted_class
        })
    
    return themes

# Function to process the PDF and create a structured DataFrame
def process_pdf_for_themes(pdf_path):
    # Extract text and metadata from the PDF
    text_data = extract_text_with_metadata(pdf_path)
    
    # Segment the text into sections
    sections = segment_text_into_sections(text_data)
    
    # Clean and structure the sections
    cleaned_sections = clean_and_structure_sections(sections)
    
    # Extract themes using SciBERT
    themes = extract_themes_with_scibert(cleaned_sections)
    
    # Create a list of dictionaries for the DataFrame
    data = []
    
    for theme, section in zip(themes, cleaned_sections):
        journal_name = os.path.basename(pdf_path)  # Use the PDF filename as the journal name
        section_name = section["heading"]
        section_content = section["content"]
        predicted_theme = theme["predicted_theme"]
        
        data.append({
            "journal_name": journal_name,
            "section_name": section_name,
            "section_content": section_content,
            "predicted_theme": predicted_theme
        })
    
    # Convert the list of dictionaries into a DataFrame
    df = pd.DataFrame(data)
    
    return df



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Example usage
pdf_path = pdf_path # Replace with the actual path to your PDF
df = process_pdf_for_themes(pdf_path)

# Display the DataFrame
print(df.head())


                                        journal_name  \
0  Why states act through formal international or...   
1  Why states act through formal international or...   

                  section_name  \
0                 Introduction   
1  International Organizations   

                                     section_content  predicted_theme  
0  Why States Act through Formal International Or...                1  
1  KENNETH W. ABBOTT Graduate and International S...                1  


In [32]:
df.head()

Unnamed: 0,journal_name,section_name,section_content,predicted_theme
0,Why states act through formal international or...,Introduction,Why States Act through Formal International Or...,1
1,Why states act through formal international or...,International Organizations,KENNETH W. ABBOTT Graduate and International S...,1


In [33]:
import pandas as pd
import fitz  # PyMuPDF
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import re
import os

# Initialize SciBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_cased")
model = BertForSequenceClassification.from_pretrained("allenai/scibert_scivocab_cased")

# Define a theme mapping for the model's output
theme_mapping = {
    0: "Methodology",
    1: "Results",
    2: "Introduction",
    3: "Conclusion",
    4: "Discussion",
    5: "Related Work",
    6: "Future Work",
    7: "Abstract"
}

def extract_text_with_metadata(pdf_path):
    document = fitz.open(pdf_path)
    text_data = []
    
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]  # Get text blocks with layout info
        
        for block in blocks:
            if block['type'] == 0:  # Text block
                for line in block['lines']:
                    line_text = line['spans'][0]['text']
                    font_size = line['spans'][0]['size']
                    font_bold = line['spans'][0].get('font', '').startswith('bold')
                    bbox = line['spans'][0]['bbox']  # Bounding box to track position
                    
                    # Append the extracted text with metadata
                    text_data.append({
                        "text": line_text,
                        "font_size": font_size,
                        "font_bold": font_bold,
                        "bbox": bbox
                    })
    
    return text_data

def segment_text_into_sections(text_data):
    sections = []
    current_section = ""
    current_heading = "Introduction"  # Default heading
    prev_font_size = None

    for line in text_data:
        line_text = line["text"].strip()
        
        # Heuristic 1: Identify headings based on larger font sizes or bold text
        if line["font_size"] > 12 or line["font_bold"]:
            if current_section:  # If there's content from the previous section, store it
                sections.append({"heading": current_heading, "content": current_section.strip()})
            
            # Set new section heading based on the text
            current_heading = line_text if re.match(r"[A-Za-z]+", line_text) else "Unknown Section"
            current_section = ""  # Reset content for new section

        else:
            current_section += " " + line_text  # Add line content to current section

    # Append the last section
    if current_section:
        sections.append({"heading": current_heading, "content": current_section.strip()})

    return sections

def clean_and_structure_sections(sections):
    cleaned_sections = []
    
    for section in sections:
        heading = section["heading"]
        content = section["content"]
        
        # Clean the content (e.g., remove extra spaces, newlines, etc.)
        content = re.sub(r'\s+', ' ', content).strip()
        
        cleaned_sections.append({
            "heading": heading,
            "content": content
        })

    return cleaned_sections

def extract_themes_with_scibert(sections):
    themes = []
    
    for section in sections:
        heading = section["heading"]
        content = section["content"]
        
        # Tokenize the content
        inputs = tokenizer(content, return_tensors="pt", truncation=True, padding=True, max_length=512)
        
        # Predict the theme or classification
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_class = torch.argmax(logits, dim=1).item()

        # Map the predicted class index to a human-readable theme
        predicted_theme = theme_mapping.get(predicted_class, "Unknown Theme")
        
        themes.append({
            "heading": heading,
            "predicted_theme_index": predicted_class,
            "predicted_theme": predicted_theme
        })
    
    return themes

# Function to process the PDF and create a structured DataFrame
def process_pdf_for_themes(pdf_path):
    # Extract text and metadata from the PDF
    text_data = extract_text_with_metadata(pdf_path)
    
    # Segment the text into sections
    sections = segment_text_into_sections(text_data)
    
    # Clean and structure the sections
    cleaned_sections = clean_and_structure_sections(sections)
    
    # Extract themes using SciBERT
    themes = extract_themes_with_scibert(cleaned_sections)
    
    # Create a list of dictionaries for the DataFrame
    data = []
    
    for theme, section in zip(themes, cleaned_sections):
        journal_name = os.path.basename(pdf_path)  # Use the PDF filename as the journal name
        section_name = section["heading"]
        section_content = section["content"]
        predicted_theme_index = theme["predicted_theme_index"]
        predicted_theme = theme["predicted_theme"]
        
        data.append({
            "journal_name": journal_name,
            "section_name": section_name,
            "section_content": section_content,
            "predicted_theme_index": predicted_theme_index,
            "predicted_theme": predicted_theme
        })
    
    # Convert the list of dictionaries into a DataFrame
    df = pd.DataFrame(data)
    
    return df

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:

# Example usage
pdf_path = pdf_path  # Replace with the actual path to your PDF
df = process_pdf_for_themes(pdf_path)

# Display the DataFrame
df.head()


Unnamed: 0,journal_name,section_name,section_content,predicted_theme_index,predicted_theme
0,Why states act through formal international or...,Introduction,Why States Act through Formal International Or...,0,Methodology
1,Why states act through formal international or...,International Organizations,KENNETH W. ABBOTT Graduate and International S...,0,Methodology


In [35]:
import os
import pandas as pd
import fitz  # PyMuPDF
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import re

# Initialize SciBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_cased")
model = BertForSequenceClassification.from_pretrained("allenai/scibert_scivocab_cased")

# Define a theme mapping for the model's output
theme_mapping = {
    0: "Methodology",
    1: "Results",
    2: "Introduction",
    3: "Conclusion",
    4: "Discussion",
    5: "Related Work",
    6: "Future Work",
    7: "Abstract"
}

def extract_text_with_metadata(pdf_path):
    document = fitz.open(pdf_path)
    text_data = []
    
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]  # Get text blocks with layout info
        
        for block in blocks:
            if block['type'] == 0:  # Text block
                for line in block['lines']:
                    line_text = line['spans'][0]['text']
                    font_size = line['spans'][0]['size']
                    font_bold = line['spans'][0].get('font', '').startswith('bold')
                    bbox = line['spans'][0]['bbox']  # Bounding box to track position
                    
                    # Append the extracted text with metadata
                    text_data.append({
                        "text": line_text,
                        "font_size": font_size,
                        "font_bold": font_bold,
                        "bbox": bbox
                    })
    
    return text_data

def segment_text_into_sections(text_data):
    sections = []
    current_section = ""
    current_heading = "Introduction"  # Default heading
    prev_font_size = None

    for line in text_data:
        line_text = line["text"].strip()
        
        # Heuristic 1: Identify headings based on larger font sizes or bold text
        if line["font_size"] > 12 or line["font_bold"]:
            if current_section:  # If there's content from the previous section, store it
                sections.append({"heading": current_heading, "content": current_section.strip()})
            
            # Set new section heading based on the text
            current_heading = line_text if re.match(r"[A-Za-z]+", line_text) else "Unknown Section"
            current_section = ""  # Reset content for new section

        else:
            current_section += " " + line_text  # Add line content to current section

    # Append the last section
    if current_section:
        sections.append({"heading": current_heading, "content": current_section.strip()})

    return sections

def clean_and_structure_sections(sections):
    cleaned_sections = []
    
    for section in sections:
        heading = section["heading"]
        content = section["content"]
        
        # Clean the content (e.g., remove extra spaces, newlines, etc.)
        content = re.sub(r'\s+', ' ', content).strip()
        
        cleaned_sections.append({
            "heading": heading,
            "content": content
        })

    return cleaned_sections

def extract_themes_with_scibert(sections):
    themes = []
    
    for section in sections:
        heading = section["heading"]
        content = section["content"]
        
        # Tokenize the content
        inputs = tokenizer(content, return_tensors="pt", truncation=True, padding=True, max_length=512)
        
        # Predict the theme or classification
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_class = torch.argmax(logits, dim=1).item()

        # Map the predicted class index to a human-readable theme
        predicted_theme = theme_mapping.get(predicted_class, "Unknown Theme")
        
        themes.append({
            "heading": heading,
            "predicted_theme_index": predicted_class,
            "predicted_theme": predicted_theme
        })
    
    return themes

def process_pdf_for_themes(pdf_path):
    # Extract text and metadata from the PDF
    text_data = extract_text_with_metadata(pdf_path)
    
    # Segment the text into sections
    sections = segment_text_into_sections(text_data)
    
    # Clean and structure the sections
    cleaned_sections = clean_and_structure_sections(sections)
    
    # Extract themes using SciBERT
    themes = extract_themes_with_scibert(cleaned_sections)
    
    # Create a list of dictionaries for the DataFrame
    data = []
    
    for theme, section in zip(themes, cleaned_sections):
        journal_name = os.path.basename(pdf_path)  # Use the PDF filename as the journal name
        section_name = section["heading"]
        section_content = section["content"]
        predicted_theme_index = theme["predicted_theme_index"]
        predicted_theme = theme["predicted_theme"]
        
        data.append({
            "journal_name": journal_name,
            "section_name": section_name,
            "section_content": section_content,
            "predicted_theme_index": predicted_theme_index,
            "predicted_theme": predicted_theme
        })
    
    # Convert the list of dictionaries into a DataFrame
    df = pd.DataFrame(data)
    
    return df

# Function to process all PDFs in the folder and combine them into a single DataFrame
def process_pdfs_in_folder(pdf_folder_path):
    all_data = []

    # Loop through all PDF files in the folder
    for pdf_file in os.listdir(pdf_folder_path):
        if pdf_file.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder_path, pdf_file)
            df = process_pdf_for_themes(pdf_path)
            all_data.append(df)
    
    # Concatenate all individual DataFrames into one
    combined_df = pd.concat(all_data, ignore_index=True)
    return combined_df

# Example usage
pdf_folder_path = '../Data/Attribute_Papers/'  # Path to your folder containing PDFs
final_df = process_pdfs_in_folder(pdf_folder_path)

# # Display the final DataFrame
# print(final_df.head())

# # Optionally, save the result to a CSV or Excel file
# final_df.to_csv('extracted_sections_with_themes.csv', index=False)
# final_df.to_excel('extracted_sections_with_themes.xlsx', index=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 872 entries, 0 to 871
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   journal_name           872 non-null    object
 1   section_name           872 non-null    object
 2   section_content        872 non-null    object
 3   predicted_theme_index  872 non-null    int64 
 4   predicted_theme        872 non-null    object
dtypes: int64(1), object(4)
memory usage: 34.2+ KB


In [38]:
final_df.head()

Unnamed: 0,journal_name,section_name,section_content,predicted_theme_index,predicted_theme
0,International Organizations and Implementation...,Introduction,"Enforcers, managers, authorities? Internationa...",0,Methodology
1,International Organizations and Implementation...,"Jutta Joachim, Bob Reinalda and Bertjan Verbeek",1 Introduction This edited volume has explored...,0,Methodology
2,Critical Choices_The United Nations_Networks a...,ADVISORY,Tunku Former Commonwealth Secretariat and Tran...,0,Methodology
3,Critical Choices_The United Nations_Networks a...,CHOICES,"WOLFGANG AND with JAN THORSTEN BENNER, BETH AN...",0,Methodology
4,Critical Choices_The United Nations_Networks a...,CONTENTS,Foreword vii Preface ix Executive xi Origins x...,1,Results
