In [None]:
# Install Required Libraries
!pip install transformers pytesseract pymarc datasets pandas scikit-learn --quiet

# Import Necessary Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from pymarc import Record, Field
from google.colab import files

# Step 1: Simulate Training Data for Fine-Tuning
# Every example pairs a free-text citation prompt ("input") with the
# pipe-delimited "key: value" string the model should emit ("output").
data = [
    dict(
        input="Extract metadata from: Artificial Intelligence: A Modern Approach by Stuart Russell, 2020",
        output="title: Artificial Intelligence: A Modern Approach | author: Stuart Russell | year: 2020",
    ),
    dict(
        input="Extract metadata from: Deep Learning by Ian Goodfellow, published by MIT Press in 2016",
        output="title: Deep Learning | author: Ian Goodfellow | publisher: MIT Press | year: 2016",
    ),
    dict(
        input="Extract metadata from: Python Crash Course by Eric Matthes, No Starch Press, 2019",
        output="title: Python Crash Course | author: Eric Matthes | publisher: No Starch Press | year: 2019",
    ),
    dict(
        input="Extract metadata from: Clean Code: A Handbook of Agile Software Craftsmanship by Robert C. Martin, 2008",
        output="title: Clean Code | author: Robert C. Martin | year: 2008",
    ),
]

# Tabular view of the examples (columns: "input", "output")
df = pd.DataFrame(data)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Wrap both splits as HuggingFace Datasets
train_dataset, test_dataset = (
    Dataset.from_pandas(split) for split in (train_data, test_data)
)

# Step 2: Load Pre-Trained T5 Model and Tokenizer
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Tokenization Function
def tokenize_data(examples):
    """
    Tokenize prompts and targets for seq2seq fine-tuning.

    Args:
        examples: Mapping with "input" (prompt text) and "output" (target
            text). With ``Dataset.map(..., batched=True)`` both values are
            lists of strings; a single pair of strings is also handled.

    Returns:
        The tokenizer output for the prompts, with an added "labels" entry
        holding the target token ids. Padding positions in the labels are
        set to -100 so they are excluded from the loss.
    """
    inputs = tokenizer(examples["input"], max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(examples["output"], max_length=512, truncation=True, padding="max_length")

    # Targets are padded out to 512 tokens; without masking, the loss would be
    # computed mostly on pad tokens. -100 is the index the loss ignores.
    pad_id = tokenizer.pad_token_id

    def mask_padding(ids):
        return [tok if tok != pad_id else -100 for tok in ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], list):
        # Batched call: one id list per example
        inputs["labels"] = [mask_padding(ids) for ids in label_ids]
    else:
        # Single example: a flat id list
        inputs["labels"] = mask_padding(label_ids)
    return inputs

# Apply Tokenization
tokenized_train = train_dataset.map(tokenize_data, batched=True)
tokenized_test = test_dataset.map(tokenize_data, batched=True)

# Step 3: Fine-Tune the Model
import inspect

# The notebook installs transformers unpinned. Newer releases renamed
# `evaluation_strategy` to `eval_strategy` (TrainingArguments) and
# `tokenizer` to `processing_class` (Trainer), so use whichever keyword the
# installed version actually accepts.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)
_tokenizer_arg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    **{_eval_strategy_arg: "epoch"},  # evaluate on the held-out split after every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    **{_tokenizer_arg: tokenizer},
)

print("Fine-tuning the model...")
trainer.train()

# Save Fine-Tuned Model (weights and tokenizer files go to the same directory)
model.save_pretrained("./fine_tuned_t5_model")
tokenizer.save_pretrained("./fine_tuned_t5_model")
print("Fine-tuned model saved to './fine_tuned_t5_model'")

# Step 4: Cataloging Pipeline with Fine-Tuned Model
def extract_metadata(text):
    """
    Use fine-tuned model to extract metadata from text.

    Args:
        text: Free-text citation. The instruction prefix used by the training
            prompts is added automatically when it is not already present.

    Returns:
        dict mapping each key the model emitted (e.g. "title", "author",
        "publisher", "year") to its value. Output segments that do not have
        the "key: value" shape are skipped, so the dict may be empty.
    """
    # Every training prompt starts with this instruction; use the same form
    # at inference time so the model sees the input it was tuned on.
    prompt_prefix = "Extract metadata from: "
    if not text.startswith(prompt_prefix):
        text = prompt_prefix + text

    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    # Trainer may have moved the model to an accelerator; the inputs must
    # live on the same device or generate() fails with a device mismatch.
    inputs = inputs.to(model.device)
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=128,
        num_beams=4,
        early_stopping=True,
    )
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Parse metadata into structured format. The model output is not
    # guaranteed to follow "key: value | key: value", so ignore any segment
    # without a colon or without a key instead of raising.
    metadata = {}
    for item in result.split('|'):
        key, separator, value = item.partition(':')
        key = key.strip()
        if not separator or not key:
            continue
        metadata[key] = value.strip()
    return metadata

def create_marc21(metadata):
    """
    Generate MARC21 record from extracted metadata.

    Args:
        metadata: dict that may contain "title", "author", "publisher" and
            "year". Missing or empty entries are left out of the record.

    Returns:
        A pymarc ``Record`` with a 245 field (title $a, author $c) and a 264
        field (publisher $b, year $c), each added only when it has at least
        one non-empty subfield; ``None`` if building the record failed.
    """
    try:
        # pymarc >= 5 only accepts Subfield objects; older releases expect a
        # flat [code, value, code, value, ...] list and have no Subfield.
        try:
            from pymarc import Subfield
        except ImportError:
            Subfield = None

        def build_subfields(pairs):
            # Drop empty values so the record carries no blank subfields.
            present = [(code, value) for code, value in pairs if value]
            if Subfield is not None:
                return [Subfield(code=code, value=value) for code, value in present]
            flat = []
            for code, value in present:
                flat.extend((code, value))
            return flat

        record = Record()

        title_subfields = build_subfields([
            ('a', metadata.get('title', '')),
            ('c', metadata.get('author', '')),
        ])
        if title_subfields:
            record.add_field(Field(tag='245', indicators=['1', '0'], subfields=title_subfields))

        publication_subfields = build_subfields([
            ('b', metadata.get('publisher', '')),
            ('c', metadata.get('year', '')),
        ])
        if publication_subfields:
            # MARC documentation writes a blank indicator as '#'; the value
            # actually stored in the record is a space.
            record.add_field(Field(tag='264', indicators=[' ', '1'], subfields=publication_subfields))

        return record
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller handle None.
        print(f"Error creating MARC21 record: {e}")
        return None

# Full Cataloging Pipeline
def catalog_pipeline():
    """
    End-to-end cataloging: text -> metadata -> MARC21.

    Prompts for a file upload in Colab, reads the first uploaded file as
    UTF-8 text, extracts metadata with ``extract_metadata``, builds a record
    with ``create_marc21``, writes it to "output.marcxml" and triggers a
    browser download. Returns nothing; progress is reported via ``print``.
    """
    # Record has no MARCXML method of its own; pymarc exposes the serializer
    # as a module-level function that returns bytes.
    from pymarc.marcxml import record_to_xml

    print("Upload a text file for cataloging:")
    uploaded = files.upload()
    if not uploaded:
        # Presumably an empty mapping means the upload dialog was cancelled;
        # without this guard next(iter(...)) would raise StopIteration.
        print("No file uploaded; nothing to catalog.")
        return

    file_name = next(iter(uploaded))
    with open(file_name, 'r', encoding='utf-8') as file:
        input_text = file.read()

    # Step 1: Extract Metadata
    metadata = extract_metadata(input_text)
    print("Extracted Metadata:", metadata)

    # Step 2: Create MARC21 Record
    marc_record = create_marc21(metadata)
    if marc_record is None:
        print("Failed to generate MARC21 record.")
        return

    print("Generated MARC21 Record:")
    print(marc_record)

    # Step 3: Save MARCXML
    output_file = "output.marcxml"
    with open(output_file, "wb") as f:
        f.write(record_to_xml(marc_record, namespace=True))
    files.download(output_file)
    print("Pipeline completed. MARCXML file ready for download.")

# Run the Cataloging Pipeline
# Executes as soon as this cell/script runs: prompts for a file upload, then
# extracts metadata, builds the MARC record and offers the MARCXML download.
catalog_pipeline()
