In [14]:

!pip install datasets





In [1]:
!pip install --upgrade pyarrow
!pip uninstall pyarrow
!pip install pyarrow

!pip install --upgrade datasets
!pip install datasets
!pip install transformers
!pip install torch

from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import torch

# Base checkpoint that gets fine-tuned
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# When the tokenizer has no pad token, register the EOS token as the pad token and
# size the model's embedding matrix to the tokenizer's vocabulary.
if tokenizer.pad_token is None:
    pad_spec = {'pad_token': tokenizer.eos_token}
    tokenizer.add_special_tokens(pad_spec)
    vocab_size = len(tokenizer)
    model.resize_token_embeddings(vocab_size)

# Fetch Synthetic-Persona-Chat and keep only the first 50 rows of its train split
dataset = load_dataset("google/Synthetic-Persona-Chat")
first_rows = range(50)
train_data = dataset['train'].select(first_rows)

# Convert the dataset to the format required for GPT-2
def convert_to_text_examples(example):
    """Wrap one row's conversation in the text template used for training.

    Args:
        example: Mapping with a 'Best Generated Conversation' entry.

    Returns:
        A dict with a single "text" key holding
        "Context: <conversation> Response:". Nothing follows the trailing
        "Response:" marker.
    """
    conversation = example['Best Generated Conversation']
    formatted = f"Context: {conversation} Response:"
    return {"text": formatted}

# Run convert_to_text_examples over each selected row, adding its "text" column
train_data = train_data.map(function=convert_to_text_examples)

# Tokenize the dataset
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of rows for causal language-model fine-tuning.

    Args:
        examples: Batch mapping whose 'text' entry holds the strings to tokenize
            (the function is applied with ``batched=True``).
        max_length: Length every sequence is truncated or padded to.

    Returns:
        The tokenizer's output with an added 'labels' entry that copies
        'input_ids'.
    """
    # Plain Python lists are returned on purpose: the mapped dataset stores its columns
    # itself, so building torch tensors here only added a conversion round trip.
    encodings = tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )
    # A causal LM is trained to predict its own input, so labels mirror input_ids.
    # Each row is copied so labels and input_ids never alias the same list.
    encodings['labels'] = [list(ids) for ids in encodings['input_ids']]
    return encodings

# Tokenize the rows; batched=True hands tokenize_function a batch of rows per call
train_data = train_data.map(tokenize_function, batched=True)

# Collator configured for causal (non-masked) language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration: where to write results/logs, then schedule and checkpoint limits
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
)

# Wire model, arguments, data and collator together, then fine-tune
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    data_collator=data_collator,
)
trainer.train()

# Write the fine-tuned model first, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained('./fine-tuned-gpt2')

print("Model fine-tuning complete and saved to './fine-tuned-gpt2'")


Found existing installation: pyarrow 17.0.0
Uninstalling pyarrow-17.0.0:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/benchmarks/*
    /usr/local/lib/python3.10/dist-packages/cmake_modules/AWSSDKVariables.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/BuildUtils.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/DefineOptions.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindAWSSDKAlt.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindAzure.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindBrotliAlt.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindClangTools.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindGTestAlt.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindInferTools.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindLLVMAlt.cmake
    /usr/local/lib/python3.10/dist-packages/cmake_modules/FindOpenSSLAlt.cmake
 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.11k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8938 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/968 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Step,Training Loss
10,1.7696
20,1.4229
30,1.3341
40,1.0774
50,1.0682
60,1.0393
70,0.9207


Model fine-tuning complete and saved to './fine-tuned-gpt2'


In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Reload the tokenizer and model from the directory the training cell saved them to
model_name = "./fine-tuned-gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# eval() returns the module itself, so loading and switching to evaluation mode
# can be written as one expression
model = GPT2LMHeadModel.from_pretrained(model_name).eval()

def generate_response(prompt, max_length=50, num_return_sequences=1):
    """Sample a continuation of ``prompt`` from the loaded model.

    Args:
        prompt: Text the model should continue.
        max_length: Forwarded to ``generate`` as ``max_length``; this bound
            counts the prompt tokens as well as the generated ones.
        num_return_sequences: Number of sequences to sample. Only the first
            sequence is decoded and returned.

    Returns:
        The first generated sequence decoded to a string with special tokens
        removed. The text starts with the prompt itself.
    """
    # Calling the tokenizer (instead of .encode) also yields the attention mask.
    encoded = tokenizer(prompt, return_tensors="pt")

    # Sampling only; no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(
            encoded["input_ids"],
            # The pad id below is the EOS id, so the mask is passed explicitly
            # rather than left for generate() to guess from the token ids.
            attention_mask=encoded["attention_mask"],
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,   # Use sampling for diverse responses
            top_k=7,          # Top-k sampling
            top_p=0.95,       # Top-p (nucleus) sampling
            temperature=0.7,  # Values below 1 sharpen the sampling distribution
        )

    # Decode and return the first sequence
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

def chat_with_model():
    """Run an interactive chat loop on top of ``generate_response``.

    Reads lines with ``input`` until the user types 'quit' (case-insensitive,
    surrounding whitespace ignored) or input ends (EOF or interrupt). Blank
    lines are skipped instead of being sent to the model.
    """
    print("You can start chatting with the model (type 'quit' to exit).")
    while True:
        try:
            user_input = input("You: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the chat quietly
            # instead of surfacing a traceback.
            break

        command = user_input.strip()
        if command.lower() == "quit":
            break
        if not command:
            # An empty prompt gives the model nothing to continue; ask again.
            continue

        response = generate_response(user_input)
        print(f"Model: {response}")

# Start the chat interface. The guard holds when this code runs as the main program
# (it did in the recorded notebook run below) and is skipped if the code is imported as a module.
if __name__ == "__main__":
    chat_with_model()


You can start chatting with the model (type 'quit' to exit).
You: how are you
Model: how are you doing for fun?
I'm doing a lot of different things for fun, and I've been doing a lot of different things for fun too. I've been doing a lot of different things for fun too. I've been doing
You: quit
