In [5]:
# 1. Import Necessary Modules
import torch
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset, Dataset
import os

# Ensure logging and output directories exist
os.makedirs("/content/logs", exist_ok=True)
os.makedirs("/content/gpt2-scientific-papers", exist_ok=True)

# Check if a GPU is available and use it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Load Dataset from Hugging Face
print("Loading scientific_papers dataset from Hugging Face...")
dataset = None
try:
    dataset = load_dataset("scientific_papers", "arxiv", split="train")
    print(f"Loaded dataset with {len(dataset)} examples.")
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Falling back to mock dataset...")
    mock_data = [
        {
            "article": "This is a sample scientific article about AI advancements in 2025.",
            "abstract": "This paper explores recent breakthroughs in artificial intelligence.",
            "text": "Sample text for training purposes."
        }
        for _ in range(100)
    ]
    dataset = Dataset.from_list(mock_data)
    print(f"Using mock dataset with {len(dataset)} examples.")

# Take a small subset for a quick demo on CPU
dataset = dataset.select(range(min(1000, len(dataset))))
print(f"Total examples loaded: {len(dataset)}")

# 3. Initialize Tokenizer and Model
model_name = "gpt2"
tokenizer = None
model = None
eos_token = "<|endoftext|>"
try:
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    model = GPT2LMHeadModel.from_pretrained(model_name)
    print("Loaded GPT-2 successfully.")
except Exception as e:
    print(f"Error loading GPT-2: {e}")
    print("Please ensure internet access or upload GPT-2 to /content/gpt2_local.")

# 4. Format the Data
print("Formatting prompts and completions...")
def format_example(example):
    article = example.get("article", "Sample scientific article about AI advancements.")[:1000].split(".")[0].strip()
    abstract = example.get("abstract", "This paper explores AI breakthroughs.")[:1000].replace("\n", " ")
    token = tokenizer.eos_token if tokenizer else eos_token
    full_text = f"Title: {article}.\nAbstract: {abstract}{token}"
    return {"text": full_text}

formatted_dataset = dataset.map(format_example, remove_columns=dataset.column_names)

# Split the dataset
train_test_split = formatted_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

# 5. Tokenize (only if tokenizer available)
if tokenizer is None or model is None:
    print("No tokenizer/model available. Inspecting formatted data:")
    print(formatted_dataset[0])
    print("Ensure GPT-2 is accessible to proceed with training.")
    exit(0)

print("Tokenizing data...")
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        max_length=256,
        padding="max_length",
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=eval_dataset.column_names)

# 6. Set Up Data Collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 7. Training Arguments
training_args = TrainingArguments(
    output_dir="/content/gpt2-scientific-papers",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    eval_steps=200,
    save_steps=400,
    logging_dir="/content/logs",
    logging_steps=50,
    eval_strategy="steps",  # Fixed: Changed from evaluation_strategy
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=50,
    load_best_model_at_end=True,
    use_cpu=(device.type == "cpu"),
)

# 8. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

# 9. Train
print("Starting training...")
try:
    trainer.train()
except Exception as e:
    print(f"Training failed: {e}")
    exit(1)

# 10. Save Model
print("Saving model...")
model.save_pretrained("/content/fine_tuned_gpt2")
tokenizer.save_pretrained("/content/fine_tuned_gpt2")
print("Done!")

Using device: cuda
Loading scientific_papers dataset from Hugging Face...
Error loading dataset: Dataset scripts are no longer supported, but found scientific_papers.py
Falling back to mock dataset...
Using mock dataset with 100 examples.
Total examples loaded: 100
Loaded GPT-2 successfully.
Formatting prompts and completions...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Train dataset size: 90
Eval dataset size: 10
Tokenizing data...


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Starting training...


Step,Training Loss,Validation Loss


Saving model...
Done!


In [None]:
# 1. Import Libraries for Inference
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline
import torch

# 2. Load the Fine-Tuned Model and Tokenizer
model_path = "/content/fine_tuned_gpt2"
try:
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token
    model = GPT2LMHeadModel.from_pretrained(model_path)
    print("Loaded fine-tuned GPT-2 successfully.")
except Exception as e:
    print(f"Error loading fine-tuned model: {e}")
    print("Loading pre-trained 'gpt2' as fallback...")
    try:
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        tokenizer.pad_token = tokenizer.eos_token
        model = GPT2LMHeadModel.from_pretrained("gpt2")
        print("Loaded pre-trained GPT-2.")
    except Exception as e2:
        print(f"Error loading pre-trained GPT-2: {e2}")
        exit(1)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

# 3. Create a Text Generation Pipeline
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# 4. Define a Function to Generate an Abstract
def generate_abstract(title, max_length=200):
    prompt = f"Title: {title}\nAbstract:"
    generated_sequences = text_generator(
        prompt,
        max_length=max_length,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2,
        truncation=True
    )
    generated_text = generated_sequences[0]["generated_text"]
    abstract_start = generated_text.find("Abstract:") + len("Abstract:")
    generated_abstract = generated_text[abstract_start:].strip()

    # Clean up - stop at double newlines
    stop_index = generated_abstract.find("\n\n")
    if stop_index != -1:
        generated_abstract = generated_abstract[:stop_index]

    return generated_abstract

# 5. Simple Interactive Interface
print("\n" + "=" * 50)
print("ABSTRACT GENERATOR")
print("=" * 50)
print("Enter research titles to generate abstracts.")
print("Type 'quit' to exit.")
print("-" * 50)

while True:
    user_input = input("\nEnter research title: ")

    if user_input.lower() in ['quit', 'exit', 'stop', 'q']:
        print("Goodbye!")
        break

    if not user_input.strip():
        print("Please enter a valid title.")
        continue

    try:
        print("Generating...")
        abstract = generate_abstract(user_input)
        print("\nTitle:", user_input)
        print("Abstract:", abstract)
        print("-" * 50)

    except Exception as e:
        print("Error:", e)
        print("Please try again.")

Loaded fine-tuned GPT-2 successfully.


Device set to use cuda:0


Using device: cuda

ABSTRACT GENERATOR
Enter research titles to generate abstracts.
Type 'quit' to exit.
--------------------------------------------------


Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generating...

Title: provide me topic for this paper
Abstract: This paper explores recent breakthroughs in artificial intelligence. The authors propose an approach to understanding AI advancements since 2008. They show that these advances are driven by humans rather than machines, and argue against technological progress towards a more intelligent future. Implications of such developments include the potential danger posed by automation technology; how we might improve our ability at work today; whether human-driven change could lead us toward ever higher levels from which robots may never reach them; what kinds about tomorrow's technologies will enable humanity? These results demonstrate predictions regarding next steps forward on Artificial Intelligence. Keywords:"AI", "advances"
--------------------------------------------------


Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generating...

Title: which type of que i ask you which is your feild  
Abstract: This paper explores recent breakthroughs in artificial intelligence. The abstract describes current research findings about AI advancements in 2025, along with predictions for tomorrow.
--------------------------------------------------


Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generating...

Title: 2+2
Abstract: This paper explores recent breakthroughs in artificial intelligence. The abstract is drawn from existing research on AI advancements in 2025. While this article focuses primarily upon previous work, it argues that advances today require human-level efforts to improve our understanding of natural phenomena like lifeforms and intelligent machines.[/emphasis]
--------------------------------------------------
