In [None]:
# Step 1: Install necessary libraries
!pip install PyPDF2 pymarc rdflib

# Step 2: Upload the PDF file
from google.colab import files
import PyPDF2
from pymarc import Record, Field, MARCWriter, Subfield
from rdflib import Graph, URIRef, Literal, RDF, Namespace

# Upload the PDF file (opens the Colab upload widget)
uploaded = files.upload()

# The result is indexed by file name below; if nothing was uploaded (for
# example the picker was cancelled) fail with a clear message instead of an
# opaque IndexError.
if not uploaded:
    raise ValueError('No file was uploaded; re-run this cell and select a PDF.')

# Get the file name of the uploaded PDF (the first one if several were chosen)
pdf_file_path = next(iter(uploaded))

# Step 3: Extract Metadata from the PDF
# Every value below falls back to a placeholder string so the later steps
# always have something to work with.
with open(pdf_file_path, 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    # reader.metadata is None when the PDF has no document information
    # dictionary; fall back to an empty mapping so the .get() calls below
    # return their defaults instead of raising AttributeError.
    metadata = reader.metadata or {}

    # Extract common metadata fields from the PDF
    title = metadata.get('/Title', 'No Title')
    subtitle = metadata.get('/Subtitle', '')
    author = metadata.get('/Author', 'No Author')
    # NOTE(review): '/Producer' normally names the software that produced the
    # PDF rather than a publisher -- confirm this mapping is intended.
    publisher = metadata.get('/Producer', 'No Publisher')
    publication_date = metadata.get('/CreationDate', 'No Date')
    isbn = metadata.get('/ISBN', 'No ISBN')
    place_of_publication = 'Unknown'  # not read from the PDF
    edition = metadata.get('/Edition', 'No Edition')
    language = 'Unknown'  # not read from the PDF
    keywords = metadata.get('/Keywords', 'No Keywords')
    summary = metadata.get('/Summary', 'No Summary')
    subject = metadata.get('/Subject', 'No Subject')
    series_title = 'No Series'  # not read from the PDF
    physical_description = 'No Description'  # not read from the PDF

# Step 4: Create MARC21 Record Based on Extracted Metadata
record = Record()

# 245 $b (remainder of title) is optional: only emit it when a subtitle was
# actually found, otherwise the record would carry an empty subfield.
title_subfields = [Subfield(code='a', value=title)]
if subtitle:
    title_subfields.append(Subfield(code='b', value=subtitle))

# One (tag, indicators, subfields) entry per field, listed in the order the
# fields are appended to the record.
marc_fields = [
    # Title Statement
    ('245', ['1', '0'], title_subfields),
    # Main Entry - Personal Name
    ('100', ['1', ' '], [Subfield(code='a', value=author)]),
    # Publication, Distribution, etc.
    ('260', [' ', ' '], [
        Subfield(code='a', value=place_of_publication),
        Subfield(code='b', value=publisher),
        Subfield(code='c', value=publication_date),
    ]),
    # ISBN
    ('020', [' ', ' '], [Subfield(code='a', value=isbn)]),
    # Series Statement
    ('490', ['1', ' '], [Subfield(code='a', value=series_title)]),
    # Edition Statement
    ('250', [' ', ' '], [Subfield(code='a', value=edition)]),
    # Physical Description
    ('300', [' ', ' '], [Subfield(code='a', value=physical_description)]),
    # Language Code
    ('041', [' ', ' '], [Subfield(code='a', value=language)]),
    # Subject Added Entry - Topical Term (filled from the PDF keywords)
    ('650', [' ', '0'], [Subfield(code='a', value=keywords)]),
    # Summary Note
    ('520', [' ', ' '], [Subfield(code='a', value=summary)]),
    # Terms Governing Use and Reproduction (fixed rights statement)
    ('540', [' ', ' '], [
        Subfield(code='a', value="Public Domain or under license."),
    ]),
]

for tag, indicators, subfields in marc_fields:
    record.add_field(Field(tag=tag, indicators=indicators, subfields=subfields))

# Step 5: Save MARC21 Record to a .mrc File
marc_file_path = '/content/marc_record.mrc'
with open(marc_file_path, 'wb') as marc_out:
    writer = MARCWriter(marc_out)
    writer.write(record)
    # The `with` block owns the file handle, so ask the writer not to close
    # it as well (the original closed the same handle twice).
    writer.close(close_fh=False)

print(f'MARC21 record saved to {marc_file_path}')

# Step 6: Generate RDF Metadata and save as .txt
g = Graph()
ex = Namespace("http://example.org/")

# The single resource being described, typed as ex:Book
book = URIRef("http://example.org/book/1")
g.add((book, RDF.type, ex.Book))

# (predicate local name, extracted value, placeholder meaning "not found").
# A property is only added when the value differs from its placeholder.
rdf_properties = (
    ('title', title, 'No Title'),
    ('author', author, 'No Author'),
    ('publisher', publisher, 'No Publisher'),
    ('subject', subject, 'No Subject'),
    ('creationDate', publication_date, 'No Date'),
)
for predicate_name, value, placeholder in rdf_properties:
    if value != placeholder:
        g.add((book, ex[predicate_name], Literal(value)))

# Serialize RDF data to a text file (using Turtle format)
rdf_text_file_path = '/content/rdf_metadata.txt'
# Pin the encoding: titles and author names may contain non-ASCII characters,
# and the platform default encoding is not guaranteed to be UTF-8.
with open(rdf_text_file_path, 'w', encoding='utf-8') as rdf_file:
    # The result is written as text, with no .decode('utf-8') step
    rdf_file.write(g.serialize(format='turtle'))

print(f'RDF data saved to {rdf_text_file_path}')

# Step 7: Download the MARC21 and RDF Text Files
# Downloads run in this order: the MARC21 file first, then the RDF text file.
for output_path in (marc_file_path, rdf_text_file_path):
    files.download(output_path)


