In [5]:
# Install Required Libraries
from datasets import Dataset
import evaluate

import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
from lime.lime_text import LimeTextExplainer
from sklearn.metrics import roc_auc_score
import torch


In [6]:

# Load Dataset
df = pd.read_csv("chat_data.csv")  # Ensure your dataset is in the same directory or provide the correct path

# Columns: id, human, gpt, cleaned_human, cleaned_gpt
# df = df[["cleaned_human", "cleaned_gpt"]].dropna()

df = df.sample(n=5000, random_state=42)

In [7]:
# Split Dataset for Training and Evaluation
df_train = df.sample(frac=0.8, random_state=42)  # 80% for training
df_eval = df.drop(df_train.index)  # 20% for evaluation

In [8]:
# Convert to HuggingFace Dataset
train_dataset = Dataset.from_pandas(df_train)
eval_dataset = Dataset.from_pandas(df_eval)


In [9]:

# Tokenizer and Model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to EOS token
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [10]:
# Tokenize the Dataset
def tokenize_function(examples):
    inputs = tokenizer(examples["human"], max_length=512, truncation=True, padding="max_length")
    outputs = tokenizer(examples["gpt"], max_length=512, truncation=True, padding="max_length")
    inputs["labels"] = outputs["input_ids"]
    return inputs

# Apply the tokenize function to the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["human", "gpt"])
eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=["human", "gpt"])


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
# First Fine-Tuning Round
training_args_round1 = TrainingArguments(
    output_dir="./results_round1",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="epoch",
    logging_dir="./logs_round1"
)

trainer_round1 = Trainer(
    model=model,
    args=training_args_round1,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

print("Starting Round 1 Fine-Tuning...")
trainer_round1.train()



Starting Round 1 Fine-Tuning...


  0%|          | 0/3000 [00:00<?, ?it/s]

{'loss': 0.6804, 'grad_norm': 1.8200733661651611, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.5}
{'loss': 0.5987, 'grad_norm': 3.172576427459717, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5896318554878235, 'eval_runtime': 130.8912, 'eval_samples_per_second': 7.64, 'eval_steps_per_second': 1.91, 'epoch': 1.0}
{'loss': 0.5786, 'grad_norm': 3.4807076454162598, 'learning_rate': 2.5e-05, 'epoch': 1.5}
{'loss': 0.574, 'grad_norm': 1.1021109819412231, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5779553651809692, 'eval_runtime': 133.5398, 'eval_samples_per_second': 7.488, 'eval_steps_per_second': 1.872, 'epoch': 2.0}
{'loss': 0.5578, 'grad_norm': 1.1667473316192627, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}
{'loss': 0.561, 'grad_norm': 2.0330874919891357, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5777411460876465, 'eval_runtime': 138.5322, 'eval_samples_per_second': 7.219, 'eval_steps_per_second': 1.805, 'epoch': 3.0}
{'train_runtime': 9628.3197, 'train_samples_per_second': 1.246, 'train_steps_per_second': 0.312, 'train_loss': 0.5917533467610677, 'epoch': 3.0}


TrainOutput(global_step=3000, training_loss=0.5917533467610677, metrics={'train_runtime': 9628.3197, 'train_samples_per_second': 1.246, 'train_steps_per_second': 0.312, 'total_flos': 3135504384000000.0, 'train_loss': 0.5917533467610677, 'epoch': 3.0})

In [12]:
# Save the model after the first round of fine-tuning
model.save_pretrained("fine_tuned_gpt2_round1")
tokenizer.save_pretrained("fine_tuned_gpt2_round1")


('fine_tuned_gpt2_round1\\tokenizer_config.json',
 'fine_tuned_gpt2_round1\\special_tokens_map.json',
 'fine_tuned_gpt2_round1\\vocab.json',
 'fine_tuned_gpt2_round1\\merges.txt',
 'fine_tuned_gpt2_round1\\added_tokens.json')

In [None]:

# Load the model for the second round
model_round2 = GPT2LMHeadModel.from_pretrained("fine_tuned_gpt2_round1")

# Second Fine-Tuning Round
training_args_round2 = TrainingArguments(
    output_dir="./results_round2",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="epoch",
    logging_dir="./logs_round2"
)

trainer_round2 = Trainer(
    model=model_round2,
    args=training_args_round2,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

print("Starting Round 2 Fine-Tuning...")
trainer_round2.train()

In [None]:
# Save the model after the second round of fine-tuning
model_round2.save_pretrained("fine_tuned_gpt2_round2")
tokenizer.save_pretrained("fine_tuned_gpt2_round2")

print("Two Rounds of Fine-Tuning Completed.")


In [15]:
import evaluate

# Load Metrics
rouge = evaluate.load("rouge")  # Use `evaluate.load` instead of `load_metric`
bert_scorer = evaluate.load("bertscore")  # Use `evaluate.load` instead of `load_metric`

# Example function to calculate ROUGE-L and BERTScore
def evaluate_responses(eval_dataset, model, tokenizer):
    inputs = eval_dataset["cleaned_human"]
    references = eval_dataset["cleaned_gpt"]
    predictions = []

    for inp in inputs:
        input_ids = tokenizer(inp, return_tensors="pt").input_ids.to(model.device)
        output_ids = model.generate(input_ids, max_length=50, num_return_sequences=1, pad_token_id=50256)
        pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        predictions.append(pred)

    # ROUGE-L
    rouge_results = rouge.compute(predictions=predictions, references=references)
    print("ROUGE-L Score:", rouge_results["rougeL"])

    # BERTScore
    bert_results = bert_scorer.compute(predictions=predictions, references=references, lang="en")
    print("BERTScore F1:", bert_results["f1"])


In [16]:
import evaluate

rouge = evaluate.load("rouge")
bert_scorer = evaluate.load("bertscore")

# Function to Evaluate Responses
def evaluate_responses(eval_dataset, model, tokenizer):
    """
    Evaluate the responses generated by the model using ROUGE-L and BERTScore.

    Args:
        eval_dataset: The dataset containing "human" and "gpt" columns.
        model: The pre-trained language model to evaluate.
        tokenizer: The tokenizer corresponding to the model.

    Returns:
        None
    """
    # Ensure eval_dataset is properly formatted
    if not all(col in eval_dataset.column_names for col in ["human", "gpt"]):
        raise ValueError("The dataset must have 'human' and 'gpt' columns.")

    # Prepare inputs and references
    inputs = eval_dataset["human"]  # List of human inputs
    references = eval_dataset["gpt"]  # List of reference GPT outputs
    predictions = []

    # Generate predictions
    print("Generating predictions...")
    for inp in inputs:
        input_ids = tokenizer(inp, return_tensors="pt", truncation=True, padding=True).input_ids.to(model.device)
        output_ids = model.generate(
            input_ids,
            max_length=50,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id
        )
        pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        predictions.append(pred)

    # Evaluate ROUGE-L
    print("Computing ROUGE-L...")
    rouge_results = rouge.compute(predictions=predictions, references=references)
    print("ROUGE-L Score:", rouge_results["rougeL"])

    # Evaluate BERTScore
    print("Computing BERTScore...")
    bert_results = bert_scorer.compute(predictions=predictions, references=references, lang="en")
    print("BERTScore Precision:", bert_results["precision"])
    print("BERTScore Recall:", bert_results["recall"])
    print("BERTScore F1:", bert_results["f1"])

# Example usage:
# evaluate_responses(eval_dataset, model, tokenizer)


In [17]:

# import evaluate
# # Metrics Calculation (ROUGE-L and BERTScore)
# rouge = load_metric("rouge")
# bert_scorer = load_metric("bertscore")

# # Evaluate Responses
# def evaluate_responses(eval_dataset, model, tokenizer):
#     inputs = eval_dataset["cleaned_human"]
#     references = eval_dataset["cleaned_gpt"]
#     predictions = []
    
#     for inp in inputs:
#         input_ids = tokenizer(inp, return_tensors="pt").input_ids.to(model.device)
#         output_ids = model.generate(input_ids, max_length=50, num_return_sequences=1, pad_token_id=50256)
#         pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#         predictions.append(pred)
    
#     # ROUGE-L
#     rouge_results = rouge.compute(predictions=predictions, references=references)
#     print("ROUGE-L Score:", rouge_results["rougeL"])
    
#     # BERTScore
#     bert_results = bert_scorer.compute(predictions=predictions, references=references, lang="en")
#     print("BERTScore F1:", bert_results["f1"])

# print("Evaluating Chatbot...")
# evaluate_responses(df_eval, model_round2, tokenizer)


In [18]:
# from lime.lime_text import LimeTextExplainer
# import torch

# # Initialize the explainer
# explainer = LimeTextExplainer(class_names=["Response Quality"])

# def explain_prediction(text, model, tokenizer):
#     # Explicitly move the model to CPU
#     model = model.to("cpu")
    
#     def predict_proba(inputs):
#         # Tokenize inputs with attention mask and left padding
#         encoding = tokenizer(inputs, return_tensors="pt", padding='longest', truncation=True, return_attention_mask=True, padding_side='left')
        
#         input_ids = encoding.input_ids.to("cpu")  # Move to CPU
#         attention_mask = encoding.attention_mask.to("cpu")  # Move attention mask to CPU
        
#         # Generate output tokens
#         with torch.no_grad():  # Avoid gradient tracking
#             generated_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=50)
        
#         # Convert generated_ids to probabilities (or you can directly return them)
#         return generated_ids.detach().cpu().numpy()  # Return token IDs as output for LIME

#     # Explain the instance
#     explanation = explainer.explain_instance(text, predict_proba, num_features=10)
#     explanation.show_in_notebook()

# # Example usage
# print("Explaining a response with LIME...")
# explain_prediction("Why is the sky blue?", model_round2, tokenizer)


In [19]:
# from sklearn.metrics import average_precision_score, roc_auc_score
# import torch

# def calculate_auc(model, eval_dataset, tokenizer):
#     model.eval()  # Set the model to evaluation mode
#     true_labels = []
#     probabilities = []

#     # Example: Define token IDs for class 1 (positive class) and class 0 (negative class)
#     class_1_token_ids = [100, 200, 300]  # Replace with actual token IDs for class 1
#     class_0_token_ids = [400, 500, 600]  # Replace with actual token IDs for class 0

#     for batch in eval_dataset:
#         input_ids = torch.tensor(batch["input_ids"]).to(model.device)  # Move tensor to device
#         attention_mask = torch.tensor(batch["attention_mask"]).to(model.device)  # Move tensor to device
#         labels = batch["labels"]  # Assuming labels are token IDs
        
#         # Inspecting the labels to ensure correct mapping
#         print("Sample of labels in batch:", labels[:20])  # Print first 20 labels to inspect

#         # Map token IDs to binary labels: 1 for class_1_token_ids, 0 for others
#         binary_labels = [1 if token_id in class_1_token_ids else 0 for token_id in labels]
        
#         # Check label distribution in the batch
#         print("Label distribution in batch:", torch.bincount(torch.tensor(binary_labels)))
        
#         with torch.no_grad():  # Disable gradient computation
#             outputs = model(input_ids=input_ids, attention_mask=attention_mask)

#         logits = outputs.logits.detach().cpu().numpy()  # Extract logits on CPU
#         prob = torch.sigmoid(torch.tensor(logits)).numpy()[:, 1]  # Probability for class 1
#         probabilities.extend(prob)
        
#         true_labels.extend(binary_labels)  # Use the binary labels

#     # Ensure the length matches
#     assert len(true_labels) == len(probabilities), "Mismatch in lengths of true labels and probabilities"

#     # Flatten labels and probabilities to 1D arrays
#     true_labels = torch.tensor(true_labels).flatten()
#     probabilities = torch.tensor(probabilities).flatten()

#     # Check the distribution of labels
#     print("Distribution of true labels:", torch.bincount(true_labels))

#     # If only one class is present, use average precision score instead
#     if len(torch.unique(true_labels)) == 1:
#         print("Only one class is present in true labels. Using Precision-Recall AUC instead.")
        
#         # Inspect the true labels and predicted probabilities for debugging
#         print("True labels:", true_labels[:20])  # Print first 20 true labels
#         print("Predicted probabilities:", probabilities[:20])  # Print first 20 probabilities
        
#         auc_score = average_precision_score(true_labels, probabilities)
#         print("AUC-PR Score:", auc_score)
#     else:
#         # Calculate the AUC score
#         auc_score = roc_auc_score(true_labels, probabilities)
#         print("AUC Score:", auc_score)

# # Call the function to calculate AUC
# calculate_auc(model_round2, eval_dataset, tokenizer)


In [20]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("D:/Downloads/chatts/fine_tuned_gpt2_round1")
tokenizer = AutoTokenizer.from_pretrained("D:/Downloads/chatts/fine_tuned_gpt2_round1")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at D:/Downloads/chatts/fine_tuned_gpt2_round1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from evaluate import load

rouge = load("rouge")
rouge_result = rouge.compute(predictions=["generated text"], references=["expected text"])
print("ROUGE:", rouge_result)

# For BERTScore
import bert_score

P, R, F1 = bert_score.score(["generated text"], ["expected text"], lang="en")
print("BERTScore F1:", F1)


ROUGE: {'rouge1': 0.5, 'rouge2': 0.0, 'rougeL': 0.5, 'rougeLsum': 0.5}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: tensor([0.9965])


In [22]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load GPT2 language model (for text generation)
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Get user input
user_input = input("Please enter your query: ")

# Tokenizing the input
inputs = tokenizer(user_input, return_tensors="pt")

# Generating a response (using model.generate for text generation)
outputs = model.generate(
    inputs['input_ids'],  # Input tokens
    max_length=50,  # Limit response length (can adjust as needed)
    num_return_sequences=1,  # You can adjust how many responses to generate
    no_repeat_ngram_size=2,  # Prevent repeating n-grams (optional)
    top_p=0.95,  # Nucleus sampling (optional)
    temperature=0.7,  # Controls randomness (optional)
)

# Decoding the generated tokens into text
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Output the generated response
print("Chatbot Response:", response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Chatbot Response: hi , how are you doing?

Karen: I'm doing fine. I've been working on my game for a while now.
 (laughs)
. . .
, how do you feel about the new game? Do you


In [23]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set the pad token to the eos token
tokenizer.pad_token = tokenizer.eos_token

# Get user input
user_input = input("Please enter your query: ")

# Add a prompt for better context
prompt = f"You are a helpful and polite assistant. Answer the following question: {user_input}"

# Tokenizing the input
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

# Generate response with improved settings
outputs = model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],  # Fix attention mask warning
    max_length=150,
    do_sample=True,  # Enable sampling for diverse output
    top_p=0.95,      # Nucleus sampling
    temperature=0.7, # Adds randomness
    pad_token_id=tokenizer.pad_token_id,  # Use the defined pad token
)

# Decoding the generated tokens into text
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Output the generated response
print("Chatbot Response:", response)


Chatbot Response: You are a helpful and polite assistant. Answer the following question: i am not feeling good. I am feeling very good and feeling very strong. I am feeling strong and feeling very excited. I am feeling very strong and feeling very happy. I am feeling very strong and feeling very happy. I am feeling very strong and feeling very happy. I am feeling very strong and feeling very happy. I am feeling very strong and feeling very happy. I am feeling very strong and feeling very happy. I am feeling very strong and feeling very happy. I am feeling very strong and feeling very happy. I am feeling very strong and feeling very happy. I am feeling very strong and feeling very happy. I am feeling very strong and feeling very happy. I am


In [25]:
# Import necessary libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained conversational model (DialoGPT)
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Initialize conversation history
conversation_history = []

# Chatbot function
def chatbot_response(user_input):
    global conversation_history
    
    # Add user input to conversation history
    conversation_history.append(f"User: {user_input}")
    
    # Prepare the model input
    input_text = "\n".join(conversation_history[-5:])  # Keep last 5 exchanges
    input_ids = tokenizer.encode(input_text + "\nBot:", return_tensors="pt")
    
    # Generate response
    output_ids = model.generate(
        input_ids,
        max_length=250,
        pad_token_id=tokenizer.eos_token_id,
        temperature=0.7,           # Adjust randomness
        top_p=0.9,                 # Nucleus sampling for variety
        do_sample=True,            # Enable sampling
        repetition_penalty=1.2     # Penalize repetitive responses
    )
    
    # Decode the output and add to history
    bot_response = tokenizer.decode(output_ids[0], skip_special_tokens=True).split("Bot:")[-1].strip()
    conversation_history.append(f"Bot: {bot_response}")
    
    # Limit conversation history
    if len(conversation_history) > 20:
        conversation_history = conversation_history[-20:]
    
    return bot_response

# Main interaction loop
if __name__ == "__main__":
    print("Chatbot is ready! Type 'exit' to end the conversation.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == "exit":
            print("Chatbot: Goodbye!")
            break
        
        response = chatbot_response(user_input)
        
        print(f"Chatbot: {response}")


Chatbot is ready! Type 'exit' to end the conversation.
Chatbot: I know what you mean.

 ______________________________________________________________________________ [12/6] - Last edited by Dazzle on Sep 18, 2016 at 12 :45 PM; Edited 2 times in total •
Chatbot: You're right! My life is full of amazing things to come from this game and it's wonderful that my character can't be found because she was born here...I don´t have a clue where her parents came From or how long ago they were taken away after being kidnapped but when we finally got there our story will tell the whole world about us :) "And now hetery-dandy" This message has been deleted .
Chatbot: It really doesn� t matter if its hot (unless your car doesnt run) ...it always does anyway with me !oftheweather!!!
Chatbot: And you know what I mean? A lot of people in New York City who dont want their cars parked on fire all day. The city gets so crowded, everyone drives into traffic at night like crazy everytime someone comes out f

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
import numpy as np
import matplotlib.pyplot as plt
import torch

# Evaluate the model
eval_results = trainer_round2.predict(eval_dataset)
predictions = eval_results.predictions
labels = eval_results.label_ids

# If logits need to be converted to probabilities
if len(predictions.shape) > 1:  # Multi-class classification
    probabilities = torch.nn.functional.softmax(torch.tensor(predictions), dim=1).numpy()
else:  # Binary classification
    probabilities = torch.sigmoid(torch.tensor(predictions)).numpy()

# Compute AUC (multi-class example)
auc = roc_auc_score(labels, probabilities, multi_class="ovr")

# Compute ROC curve for a specific class (optional, for binary class or one-vs-rest setup)
fpr, tpr, _ = roc_curve(labels.ravel(), probabilities.ravel())

# Plot the ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label=f"ROC curve (AUC = {auc:.2f})")
plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.grid()
plt.show()


NameError: name 'trainer_round2' is not defined