In [25]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
from transformers import AutoTokenizer, GPTNeoForCausalLM, Trainer, TrainingArguments
import os
import json
import re

In [26]:
df = pd.read_csv("metadata.csv")


  df = pd.read_csv("metadata.csv")


In [27]:
def read_json_files(paths):
    """Reads up to 100 valid JSON files from a list of paths, skipping non-existent or invalid ones."""
    json_data = []
    skipped_files = []

    for path in paths:
        path = str(path).strip()
        if not path or path.lower() == "nan" or not path.endswith(".json"):
            continue
        if os.path.exists(path):
            try:
                with open(path, "r", encoding="utf-8") as f:
                    json_data.append(json.load(f))
            except json.JSONDecodeError as e:
                print(f"⚠️ JSON decode error in file {path}: {e}")
        else:
            skipped_files.append(path)

        if len(json_data) >= 100:
            break  # stop after successfully reading 100 JSON files

    print(f"🔍 Skipped {len(skipped_files)} missing files.")
    return json_data


# Combine both JSON path columns and split all entries by ;
df["all_json_files"] = df[["pdf_json_files", "pmc_json_files"]].astype(str).agg(";".join, axis=1)

# Clean and extract individual paths
all_paths = (
    df["all_json_files"]
    .str.split(";")
    .explode()
    .dropna()
    .map(str.strip)
    .tolist()
)

# Limit to first 100 paths
limited_paths = all_paths[:200]  # Read a bit more to account for skipped ones

# Read valid files only (max 100 successful reads)
json_contents = read_json_files(limited_paths)

print(f"\n✅ Total JSON files successfully read: {len(json_contents)}")

🔍 Skipped 0 missing files.

✅ Total JSON files successfully read: 100


In [28]:
# ---------------------------
# Helper function: Extract text from JSON
# ---------------------------
def extract_text(json_obj):
    """
    Extracts and concatenates text from the 'abstract' and 'body_text' fields.
    """
    text_parts = []
    # Process abstract (it can be a list or a string)
    if "abstract" in json_obj:
        if isinstance(json_obj["abstract"], list):
            for item in json_obj["abstract"]:
                if isinstance(item, str):
                    text_parts.append(item)
                elif isinstance(item, dict):
                    text_parts.append(item.get("text", ""))
        elif isinstance(json_obj["abstract"], str):
            text_parts.append(json_obj["abstract"])
    
    # Process body_text (usually a list of sections)
    if "body_text" in json_obj:
        if isinstance(json_obj["body_text"], list):
            for item in json_obj["body_text"]:
                if isinstance(item, dict):
                    text_parts.append(item.get("text", ""))
                elif isinstance(item, str):
                    text_parts.append(item)
        elif isinstance(json_obj["body_text"], str):
            text_parts.append(json_obj["body_text"])
    
    return " ".join(text_parts).strip()

# ---------------------------
# Build DataFrame from JSON files
# ---------------------------
records = []
for json_obj in json_contents:
    paper_id = json_obj.get("paper_id", "Unknown")
    metadata = json_obj.get("metadata", {})
    title = metadata.get("title", "No Title")
    
    # Extract authors as a comma-separated string
    authors_list = metadata.get("authors", [])
    if isinstance(authors_list, list) and authors_list:
        authors = []
        for a in authors_list:
            if isinstance(a, dict):
                # Use "name" key if available, otherwise combine first and last names
                if "name" in a:
                    authors.append(a["name"])
                else:
                    fname = a.get("first", "")
                    lname = a.get("last", "")
                    authors.append((fname + " " + lname).strip())
            elif isinstance(a, str):
                authors.append(a)
        authors = ", ".join(authors)
    else:
        authors = "Unknown"
    
    # Extract combined text and compute statistics
    text = extract_text(json_obj)
    word_count = len(re.findall(r'\w+', text))
    char_count = len(text)
    
    # Extract sections from body_text (if available)
    sections = []
    if "body_text" in json_obj and isinstance(json_obj["body_text"], list):
        for item in json_obj["body_text"]:
            if isinstance(item, dict):
                sec = item.get("section", "").strip()
                if sec:
                    sections.append(sec)
    sections_str = ", ".join(set(sections)) if sections else "N/A"
    
    records.append({
        "paper_id": paper_id,
        "title": title,
        "authors": authors,
        "word_count": word_count,
        "char_count": char_count,
        "sections": sections_str,
        "text": text  # complete extracted text for further analysis
    })

eda_df = pd.DataFrame(records)


In [29]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [30]:
# Prepare dataset from your DataFrame
texts = eda_df["text"].tolist()
data = {"text": texts}
dataset = Dataset.from_dict(data)


In [31]:
# Initialize tokenizer and model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Add pad token


In [32]:
def tokenize_function(examples):
    encoding = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    encoding["labels"] = encoding["input_ids"].copy()  # This is the key fix
    return encoding
# Tokenize and format dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format(type="torch")

Map: 100%|██████████| 100/100 [00:03<00:00, 25.39 examples/s]


In [33]:
# Load model and move it to GPU
model = GPT2LMHeadModel.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.eos_token_id
model = model.to(device)

In [34]:
# Training arguments with GPU support
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=10,
    save_total_limit=2,
    logging_steps=5,
    prediction_loss_only=True,
    fp16=torch.cuda.is_available(),  # Mixed precision if GPU supports it
)

In [35]:
# Trainer initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

In [36]:
trainer.train()


 12%|█▏        | 6/50 [00:00<00:05,  8.20it/s]

{'loss': 3.5528, 'grad_norm': 12.996912002563477, 'learning_rate': 4.7e-05, 'epoch': 0.1}


 20%|██        | 10/50 [00:01<00:05,  7.52it/s]

{'loss': 3.6893, 'grad_norm': 14.428654670715332, 'learning_rate': 4.2e-05, 'epoch': 0.2}


 32%|███▏      | 16/50 [00:04<00:08,  3.98it/s]

{'loss': 3.5464, 'grad_norm': 13.377995491027832, 'learning_rate': 3.7e-05, 'epoch': 0.3}


 40%|████      | 20/50 [00:04<00:04,  6.05it/s]

{'loss': 3.5801, 'grad_norm': 13.512916564941406, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.4}


 52%|█████▏    | 26/50 [00:07<00:05,  4.11it/s]

{'loss': 3.804, 'grad_norm': 13.241741180419922, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.5}


 60%|██████    | 30/50 [00:08<00:03,  6.20it/s]

{'loss': 3.5694, 'grad_norm': 12.353569984436035, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.6}


 72%|███████▏  | 36/50 [00:11<00:03,  3.94it/s]

{'loss': 3.4782, 'grad_norm': 13.036826133728027, 'learning_rate': 1.8e-05, 'epoch': 0.7}


 80%|████████  | 40/50 [00:11<00:01,  6.08it/s]

{'loss': 3.4286, 'grad_norm': 14.68310546875, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.8}


 92%|█████████▏| 46/50 [00:14<00:00,  4.12it/s]

{'loss': 3.3845, 'grad_norm': 14.913464546203613, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.9}


100%|██████████| 50/50 [00:15<00:00,  6.05it/s]

{'loss': 3.4968, 'grad_norm': 14.447256088256836, 'learning_rate': 3e-06, 'epoch': 1.0}


100%|██████████| 50/50 [00:17<00:00,  2.90it/s]

{'train_runtime': 17.2184, 'train_samples_per_second': 5.808, 'train_steps_per_second': 2.904, 'train_loss': 3.553013801574707, 'epoch': 1.0}





TrainOutput(global_step=50, training_loss=3.553013801574707, metrics={'train_runtime': 17.2184, 'train_samples_per_second': 5.808, 'train_steps_per_second': 2.904, 'total_flos': 6532300800000.0, 'train_loss': 3.553013801574707, 'epoch': 1.0})

In [37]:
# Save model and tokenizer
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")


('./gpt2-finetuned\\tokenizer_config.json',
 './gpt2-finetuned\\special_tokens_map.json',
 './gpt2-finetuned\\vocab.json',
 './gpt2-finetuned\\merges.txt',
 './gpt2-finetuned\\added_tokens.json')

In [38]:
def answer_query(query, model, tokenizer, max_length=100):
    model.eval()
    input_ids = tokenizer.encode(query, return_tensors="pt").to(device)
    output_ids = model.generate(input_ids, max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [39]:
# Example usage
query = "What does the paper say about COVID-19 in Russia?"
print("Query Answer:", answer_query(query, model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Query Answer: What does the paper say about COVID-19 in Russia?

The paper says that the Russian government has been working on a plan to develop a new type of nuclear weapon, the "Chernobyl-type" nuclear weapon. The plan is to develop a new type of nuclear weapon, the "Chernobyl-type" nuclear weapon, which is a type of nuclear weapon that can be used to destroy the entire Soviet Union. The plan is to develop a new type of nuclear weapon


In [40]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned").to(device)
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")

In [41]:
def answer_query(query, model, tokenizer, max_length=100):
    model.eval()
    input_ids = tokenizer.encode(query, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model.generate(input_ids, max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [42]:
queries = [
    "What are the symptoms of COVID-19?",
    "What treatments were proposed for the coronavirus?",
    "How does the virus spread between individuals?",
    "What are the effects of lockdowns on mental health?",
    "Describe the impact of COVID-19 on the global economy.",
    "What role do vaccines play in controlling the pandemic?",
    "Explain the transmission mechanism of SARS-CoV-2.",
    "How did COVID-19 affect healthcare systems in India?",
    "Was there any research on mask effectiveness?",
    "What mutations have been identified in COVID-19 variants?"
]

for q in queries:
    print(f"\n🧠 Query: {q}")
    print("📄 Answer:", answer_query(q, model, tokenizer))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



🧠 Query: What are the symptoms of COVID-19?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


📄 Answer: What are the symptoms of COVID-19?

The symptoms of COVID-19 are similar to those of other respiratory infections, such as pneumonia, bronchiolitis, and bronchiolitis. The symptoms of COVID-19 are similar to those of other respiratory infections, such as pneumonia, bronchiolitis, and bronchiolitis. The symptoms of COVID-19 are similar to those of other respiratory infections, such as pneumonia, bronchiolitis, and

🧠 Query: What treatments were proposed for the coronavirus?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


📄 Answer: What treatments were proposed for the coronavirus?

The first was the coronavirus coronavirus (CVD). The coronavirus was first described in 1885 by the late Dr. William H. H. Haldane, Jr., who described it as "a disease of the heart, which is the most common cause of death in the United States." The coronavirus was first described in 1885 by Dr. William H. Haldane, Jr., who described

🧠 Query: How does the virus spread between individuals?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


📄 Answer: How does the virus spread between individuals?

The virus is a viral RNA virus that is found in the blood of infected individuals. It is a viral RNA virus that is found in the blood of infected individuals. It is a viral RNA virus that is found in the blood of infected individuals. It is a viral RNA virus that is found in the blood of infected individuals. It is a viral RNA virus that is found in the blood of infected individuals. It is a viral RNA virus that is found

🧠 Query: What are the effects of lockdowns on mental health?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


📄 Answer: What are the effects of lockdowns on mental health?

The most common cause of lockdowns is a lack of access to mental health care. In the United States, approximately one in four people with mental illness is hospitalized for a mental health condition. In addition, approximately one in four people with mental illness is hospitalized for a mental health condition. In addition, approximately one in four people with mental illness is hospitalized for a mental health condition. In addition, approximately one in four people with mental illness

🧠 Query: Describe the impact of COVID-19 on the global economy.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


📄 Answer: Describe the impact of COVID-19 on the global economy.

The impact of COVID-19 on the global economy is well known. It has been shown that the effects of COVID-19 on the global economy are well known. It has been shown that the effects of COVID-19 on the global economy is well known. It has been shown that the effects of COVID-19 on the global economy are well known. It has been shown that the effects of CO

🧠 Query: What role do vaccines play in controlling the pandemic?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


📄 Answer: What role do vaccines play in controlling the pandemic?

The role of vaccines in controlling the pandemic is well established. In the past, the role of vaccines has been largely ignored. In the present, however, the role of vaccines has been recognized. In the past, the role of vaccines has been largely ignored. In the present, however, the role of vaccines has been recognized. In the present, the role of vaccines has been recognized. In the present, the role of vaccines

🧠 Query: Explain the transmission mechanism of SARS-CoV-2.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


📄 Answer: Explain the transmission mechanism of SARS-CoV-2.

SARS-CoV-2 is a viral RNA virus that is transmitted through the respiratory tract to the respiratory tract through the respiratory tract. It is a viral RNA virus that is transmitted through the respiratory tract to the respiratory tract through the respiratory tract. It is a viral RNA virus that is transmitted through the respiratory tract to the respiratory tract through the respiratory tract. It is a viral RNA virus that is transmitted through the respiratory

🧠 Query: How did COVID-19 affect healthcare systems in India?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


📄 Answer: How did COVID-19 affect healthcare systems in India?

The first step in understanding the effects of COVID-19 on healthcare systems is to understand the mechanisms by which it affects healthcare systems. The first step is to understand the mechanisms by which it affects healthcare systems. The first step in understanding the mechanisms by which it affects healthcare systems is to understand the mechanisms by which it affects healthcare systems.

The first step in understanding the mechanisms by which it affects healthcare systems is to understand the

🧠 Query: Was there any research on mask effectiveness?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


📄 Answer: Was there any research on mask effectiveness?

No, there was no research on mask effectiveness. The only study that we have done is to compare the effectiveness of different masks with the effectiveness of different masks. We have done this in a number of studies. We have also done this in a number of studies. We have also done this in a number of studies. We have also done this in a number of studies. We have also done this in a number of studies. We have also done

🧠 Query: What mutations have been identified in COVID-19 variants?
📄 Answer: What mutations have been identified in COVID-19 variants? The most common mutations are:

1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.


In [43]:
# Load the tokenizer using AutoTokenizer and specify the DistilGPT-2 model name
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # DistilGPT-2 uses the eos token as its pad token

def tokenize_function(examples):
    # Tokenize inputs and set labels for language modeling
    encoding = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    encoding["labels"] = encoding["input_ids"].copy()  # Required for Trainer to compute loss
    return encoding

# Tokenize and format the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")

# Load the model and move it to GPU if available
model = GPT2LMHeadModel.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.eos_token_id
model = model.to(device)

# Define training arguments with GPU (and mixed precision if available)
training_args = TrainingArguments(
    output_dir="./distilgpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=1,                # Adjust as needed
    per_device_train_batch_size=2,
    save_steps=10,
    save_total_limit=2,
    logging_steps=5,
    prediction_loss_only=True,
    fp16=torch.cuda.is_available(),    # Enable FP16 mixed precision on GPUs
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./distilgpt2-finetuned")
tokenizer.save_pretrained("./distilgpt2-finetuned")

# Define a simple text generation function for querying the model
def answer_query(query, model, tokenizer, max_length=100):
    model.eval()
    input_ids = tokenizer.encode(query, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model.generate(input_ids, max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Example query usage:
query = "What does the paper say about COVID-19 in Russia?"
print("DistilGPT-2 Answer:", answer_query(query, model, tokenizer))

Map: 100%|██████████| 100/100 [00:00<00:00, 397.21 examples/s]
 12%|█▏        | 6/50 [00:00<00:04, 10.77it/s]

{'loss': 3.7993, 'grad_norm': 8.838666915893555, 'learning_rate': 4.7e-05, 'epoch': 0.1}


 20%|██        | 10/50 [00:00<00:03, 10.95it/s]

{'loss': 3.9894, 'grad_norm': 10.019515037536621, 'learning_rate': 4.2e-05, 'epoch': 0.2}


 32%|███▏      | 16/50 [00:02<00:06,  5.04it/s]

{'loss': 3.8827, 'grad_norm': 9.032402992248535, 'learning_rate': 3.7e-05, 'epoch': 0.3}


 40%|████      | 20/50 [00:03<00:04,  6.98it/s]

{'loss': 3.9407, 'grad_norm': 10.409589767456055, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.4}


 52%|█████▏    | 26/50 [00:05<00:05,  4.72it/s]

{'loss': 4.0982, 'grad_norm': 9.586071968078613, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.5}


 60%|██████    | 30/50 [00:05<00:02,  6.70it/s]

{'loss': 3.9465, 'grad_norm': 9.354372024536133, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.6}


 72%|███████▏  | 36/50 [00:07<00:03,  4.53it/s]

{'loss': 3.7724, 'grad_norm': 9.076519966125488, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.7}


 80%|████████  | 40/50 [00:07<00:01,  6.43it/s]

{'loss': 3.7067, 'grad_norm': 9.850354194641113, 'learning_rate': 1.2e-05, 'epoch': 0.8}


 92%|█████████▏| 46/50 [00:09<00:00,  4.51it/s]

{'loss': 3.7834, 'grad_norm': 10.898308753967285, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.9}


100%|██████████| 50/50 [00:10<00:00,  6.44it/s]

{'loss': 3.8578, 'grad_norm': 9.900525093078613, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.0}


100%|██████████| 50/50 [00:11<00:00,  4.25it/s]


{'train_runtime': 11.7616, 'train_samples_per_second': 8.502, 'train_steps_per_second': 4.251, 'train_loss': 3.8776962661743166, 'epoch': 1.0}


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


DistilGPT-2 Answer: What does the paper say about COVID-19 in Russia?

























































































In [44]:
# Initialize tokenizer and model for GPT-Neo using AutoTokenizer
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-Neo uses the eos token as pad

def tokenize_function(examples):
    encoding = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

# Tokenize and format dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")

# Load model and move to device
model = GPTNeoForCausalLM.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.eos_token_id
model = model.to(device)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./gptneo-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=10,
    save_total_limit=2,
    logging_steps=5,
    prediction_loss_only=True,
    fp16=torch.cuda.is_available(),  # Enable mixed precision if GPU available
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Fine-tune the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained("./gptneo-finetuned")
tokenizer.save_pretrained("./gptneo-finetuned")

# Define a simple text generation function
def answer_query(query, model, tokenizer, max_length=100):
    model.eval()
    input_ids = tokenizer.encode(query, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model.generate(input_ids, max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Example usage:
query = "Summarize the COVID-19 research findings from the paper."
print("GPT-Neo Answer:", answer_query(query, model, tokenizer))

Map: 100%|██████████| 100/100 [00:00<00:00, 365.30 examples/s]
 12%|█▏        | 6/50 [00:01<00:06,  6.35it/s]

{'loss': 2.9467, 'grad_norm': 7.538763046264648, 'learning_rate': 4.5e-05, 'epoch': 0.1}


 20%|██        | 10/50 [00:01<00:06,  6.62it/s]

{'loss': 3.0882, 'grad_norm': 9.881985664367676, 'learning_rate': 4e-05, 'epoch': 0.2}


 32%|███▏      | 16/50 [00:04<00:09,  3.75it/s]

{'loss': 2.9975, 'grad_norm': 9.502613067626953, 'learning_rate': 3.5e-05, 'epoch': 0.3}


 40%|████      | 20/50 [00:05<00:05,  5.61it/s]

{'loss': 2.9798, 'grad_norm': 7.901386260986328, 'learning_rate': 3e-05, 'epoch': 0.4}


 52%|█████▏    | 26/50 [00:08<00:06,  3.85it/s]

{'loss': 3.1388, 'grad_norm': 8.603156089782715, 'learning_rate': 2.5e-05, 'epoch': 0.5}


 60%|██████    | 30/50 [00:08<00:03,  5.59it/s]

{'loss': 2.9294, 'grad_norm': 7.441476821899414, 'learning_rate': 2e-05, 'epoch': 0.6}


 72%|███████▏  | 36/50 [00:11<00:03,  3.99it/s]

{'loss': 2.9809, 'grad_norm': 7.939748764038086, 'learning_rate': 1.5e-05, 'epoch': 0.7}


 80%|████████  | 40/50 [00:12<00:01,  5.79it/s]

{'loss': 2.9915, 'grad_norm': 8.829133033752441, 'learning_rate': 1e-05, 'epoch': 0.8}


 92%|█████████▏| 46/50 [00:15<00:01,  3.76it/s]

{'loss': 2.7363, 'grad_norm': 8.229199409484863, 'learning_rate': 5e-06, 'epoch': 0.9}


100%|██████████| 50/50 [00:16<00:00,  5.58it/s]

{'loss': 2.8677, 'grad_norm': 8.186396598815918, 'learning_rate': 0.0, 'epoch': 1.0}


100%|██████████| 50/50 [00:18<00:00,  2.75it/s]


{'train_runtime': 18.1667, 'train_samples_per_second': 5.505, 'train_steps_per_second': 2.752, 'train_loss': 2.9656890487670897, 'epoch': 1.0}


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


GPT-Neo Answer: Summarize the COVID-19 research findings from the paper. The authors have reviewed the paper and have made a number of comments. The authors have also reviewed the paper and have made comments on the manuscript. The authors have reviewed the paper and have made comments on the manuscript. The authors have reviewed the paper and have made comments on the manuscript. The authors have reviewed the paper and have made comments on the manuscript. The authors have reviewed the paper and have made comments on the manuscript. The
