# Project Title: Research Paper Metadata Extractor

In [1]:
import fitz  # PyMuPDF
import pandas as pd
import os
import json
from langchain.document_loaders import PyMuPDFLoader
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatOpenAI
from dotenv import load_dotenv

In [2]:
# Load environment variables
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [4]:
# If API Key is not found, prompt the user to enter it
if not OPENAI_API_KEY:
    OPENAI_API_KEY = input("Enter your OpenAI API Key: ")

In [6]:
# Initialize OpenAI model using ChatOpenAI
llm = ChatOpenAI(model_name="gpt-4", openai_api_key=OPENAI_API_KEY)


  llm = ChatOpenAI(model_name="gpt-4", openai_api_key=OPENAI_API_KEY)


In [7]:
# Define metadata extraction prompt
template = PromptTemplate(
    input_variables=["text"],
    template="""
    Extract the following metadata from the given research paper text:
    - Title
    - Authors
    - Publication Year
    - Abstract
    - Keywords
    
    Given Text:
    {text}
    
    Provide the extracted details strictly in **valid JSON** format like this:
    ```json
    {{
        "Title": "Extracted Title",
        "Authors": "Extracted Authors",
        "Publication Year": "Extracted Year",
        "Abstract": "Extracted Abstract",
        "Keywords": "Extracted Keywords"
    }}
    ```
    Ensure the output follows **valid JSON syntax**, enclosed in `{}`.
    """
)

metadata_extraction_chain = LLMChain(llm=llm, prompt=template)

def extract_metadata_from_pdf(pdf_path):
    """Extract metadata from PDF using LangChain."""
    try:
        loader = PyMuPDFLoader(pdf_path)
        documents = loader.load()
        full_text = "\n".join([doc.page_content.strip() for doc in documents if doc.page_content.strip()])
        
        # Check if any text was extracted
        if not full_text.strip():
            print("Error: No text extracted from the PDF. Ensure the PDF contains text.")
            return {}
        
        # Log the input to the chain for debugging
        print(f"Input to chain: {full_text[:500]}...")  # Print the first 500 characters
        
        # Invoke the chain
        response = metadata_extraction_chain.invoke({"text": full_text})
        
        # Check if the response is empty or malformed
        if not response or "text" not in response:
            print("Error: Received empty or malformed response from OpenAI.")
            return {}
        
        # Parse the JSON output
        try:
            metadata = json.loads(response["text"]) if "text" in response else json.loads(response)
        except json.JSONDecodeError as e:
            print(f"Error: Received malformed JSON response from OpenAI: {e}")
            return {}
        
        return metadata
    except Exception as e:
        print(f"Error extracting metadata: {e}")
        return {}


  metadata_extraction_chain = LLMChain(llm=llm, prompt=template)


In [8]:
# Specify the PDF file path
pdf_path = r"AI-based approach for improving the detection of.pdf"


In [9]:
# Check if the file exists and extract metadata
if os.path.exists(pdf_path):
    metadata = extract_metadata_from_pdf(pdf_path)
    
    # Convert metadata to DataFrame
    if metadata:
        df = pd.DataFrame([metadata])
        print("Extracted Metadata:")
        print(df)
        
        # Save metadata to CSV
        df.to_csv("metadata.csv", index=False)
        print("Metadata saved to metadata.csv")
    else:
        print("No metadata extracted.")
else:
    print("PDF file not found. Please check the file path.")

Input to chain: AI-based approach for improving the detection of
blood doping in sports
Maxx Richard Rahman
Saarland University
Germany
Jacob Bejder
University of Copenhagen
Denmark
Thomas Christian Bonne
University of Copenhagen
Denmark
Andreas Breenfeldt Andersen
University of Copenhagen
Denmark
Jes´us Rodr´ıguez Huertas
University of Granada
Spain
Reid Aikin
World Anti-Doping Agency
Canada
Nikolai Baastrup Nordsborg
University of Copenhagen
Denmark
Wolfgang Maaß
Saarland University
Germany
Abstract—Sports of...
Error extracting metadata: Missing some input keys: {''}
No metadata extracted.
