In [None]:
pip install accelerate -U

In [4]:
# prompt: read /content/JEOPARDY_CSV.csv predict Answer train model

import pandas as pd
df = pd.read_csv('/content/JEOPARDY_CSV.csv')
df['input_text'] = df[' Question'] + " " + df[' Answer']
text=df['input_text'].astype(str)
text

0         For the last 8 years of his life, Galileo was ...
1         No. 2: 1912 Olympian; football star at Carlisl...
2         The city of Yuma in this state has a record av...
3         In 1963, live on "The Art Linkletter Show", th...
4         Signer of the Dec. of Indep., framer of the Co...
                                ...                        
216925    This Puccini opera turns on the solution to 3 ...
216926    In North America this term is properly applied...
216927    In Penny Lane, where this "Hellraiser" grew up...
216928    From Ft. Sill, Okla. he made the plea, Arizona...
216929    A silent movie title includes the last name of...
Name: input_text, Length: 216930, dtype: object

In [8]:
text[0]

"For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory Copernicus"

In [7]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set the padding token
tokenizer.pad_token = tokenizer.eos_token
# Tokenize the input_text column
tokenized_inputs = tokenizer(
    text.tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt"
)



In [8]:
import torch
from torch.utils.data import Dataset

# Define a custom dataset
class JeopardyDataset(Dataset):
    def __init__(self, tokenized_inputs):
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_mask = tokenized_inputs['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx]
        }

# Create an instance of the custom dataset
jeopardy_dataset = JeopardyDataset(tokenized_inputs)


In [9]:
from transformers import DataCollatorForLanguageModeling

# Assuming tokenizer is already defined
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Set to True if you are using a masked language model (e.g., BERT)
)

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

# Create a DataLoader
train_dataloader = DataLoader(jeopardy_dataset, batch_size=4, shuffle=True)


# Fine-tune the model
training_args = TrainingArguments(
    output_dir="./gpt2-jeopardy-fine-tuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,  # Use the data collator from the transformers library
    train_dataset=jeopardy_dataset,
)

trainer.train()

Step,Training Loss
500,4.2167
1000,4.0793
1500,4.0375
2000,4.0254
2500,3.9798
3000,3.9707
3500,3.9241
4000,3.868
4500,3.8682
5000,3.8937


KeyboardInterrupt: ignored

In [9]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the trained model from the saved checkpoint
model = GPT2LMHeadModel.from_pretrained("/content/gpt2-jeopardy-fine-tuned/checkpoint-20000")

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Generate responses
prompt = "For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory"
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Generate text using the model
output = model.generate(input_ids, max_length=50, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.7)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory of the moon Galileo Galilei (Galileo accepted) Copernicus (Cocle) de Filippo (De Medici accepted


In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Unzip the fine-tuned model if it's in a ZIP file
import zipfile
with zipfile.ZipFile("/content/drive/MyDrive/gpt2-jeopardy-fine-tuned.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/gpt2-jeopardy-fine-tuned")

# Load the trained model from the saved checkpoint
model = GPT2LMHeadModel.from_pretrained("/content/gpt2-jeopardy-fine-tuned/checkpoint-20000")

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Generate responses
prompt = "In 1492, Columbus sailed the ocean blue."
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Generate text using the model
output = model.generate(input_ids, max_length=50, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.7)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In 1492, Columbus sailed the ocean blue.  It was the first time he had sailed this ocean the Atlantic Ocean (or the Pacific Ocean) the Indian Ocean/Indochina/South America/North Africa/New Guinea/Netherlands


In [11]:
import shutil


# Zip the folder
folder_path = "/content/gpt2-jeopardy-fine-tuned"
shutil.make_archive("/content/gpt2-jeopardy-fine-tuned", 'zip', folder_path)



'/content/gpt2-jeopardy-fine-tuned.zip'

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [13]:
# Move to Google Drive
drive_folder_path = "/content/drive/MyDrive/Colab Notebooks"  # Change this path based on your Google Drive folder structure
shutil.move('/content/gpt2-jeopardy-fine-tuned.zip', drive_folder_path)

FileNotFoundError: ignored