In [1]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")

model.safetensors:  19%|#9        | 1.03G/5.31G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [18]:
import json
import random

# Load the original dataset
with open("intents.json", "r", encoding="utf-8") as f:
    data = json.load(f)

preprocessed_data = []

# Convert intent-based dataset to input-output pairs
for intent in data["intents"]:
    for text in intent["text"]:  # Loop through each user question
        if intent["responses"]:  # Ensure there are responses
            response = random.choice(intent["responses"])  # Pick one relevant response
            preprocessed_data.append({"input": text, "output": response})

# Save as JSON Lines (JSONL)
with open("dataset.jsonl", "w", encoding="utf-8") as f:
    for entry in preprocessed_data:
        f.write(json.dumps(entry) + "\n")

print("Preprocessing complete! Saved as dataset.jsonl.")


Preprocessing complete! Saved as dataset.jsonl.


In [19]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("json", data_files="dataset.jsonl")

# Set padding token
tokenizer.pad_token = tokenizer.eos_token  # Fix padding issue

# Tokenize the dataset
def preprocess_function(examples):
    inputs = [q for q in examples["input"]]
    targets = [r for r in examples["output"]]
    
    # Tokenize inputs and outputs
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/412 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine-tuned-gpt-neo")
tokenizer.save_pretrained("./fine-tuned-gpt-neo")

Step,Training Loss


In [16]:
def generate_response(user_input):
    inputs = tokenizer(user_input, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=128,
        do_sample=True,  # Enable sampling
        temperature=0.9,  # Encourage creativity
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.2,  # Reduce repetition
        pad_token_id=tokenizer.eos_token_id
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


In [17]:
# Test the chatbot
user_input = "What are the admission requirements?"
response = generate_response(user_input)
print(f"User: {user_input}")
print(f"Chatbot: {response}")

User: What are the admission requirements?
Chatbot: What are the admission requirements?


In [14]:
# Test with different inputs
questions = [
    "What are the admission requirements?",
    "Do I need to submit SAT scores for admission?",
    "What is the minimum GPA required for admission?",
    "How much is the tuition fee?",
    "What courses are offered in the Computer Science program?"
]

for question in questions:
    response = generate_response(question)
    print(f"User: {question}")
    print(f"Chatbot: {response}")
    print()

User: What are the admission requirements?
Chatbot: What are the admission requirements?

User: Do I need to submit SAT scores for admission?
Chatbot: Do I need to submit SAT scores for admission?

User: What is the minimum GPA required for admission?
Chatbot: What is the minimum GPA required for admission?

User: How much is the tuition fee?
Chatbot: How much is the tuition fee?

User: What courses are offered in the Computer Science program?
Chatbot: What courses are offered in the Computer Science program?



In [15]:
print(dataset["train"][0])  # Check if input-output pairs are correct


{'input': 'Hi', 'output': 'Hello!'}
