In [1]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from textblob import TextBlob
import replicate
import os
import pandas as pd

In [2]:
chatbot_dataset = pd.read_csv("test_dataset.csv")

chatbot_dataset.head(5)

Unnamed: 0,Problem Statement,User Input,Expected Sentiment,Emotion Context,Expected Response,Step in Conversation
0,7/4 * 8/5,Can you help me start this problem?,Neutral,Unsure,Let's go through it step-by-step.,2
1,7/4 * 8/5,Is my answer correct:,Neutral,Checking,Let's go through it step-by-step.,3
2,7/4 * 8/5,This makes no sense to me at all!,Negative,Frustrated,"Don't worry, let's figure it out together.",3
3,7/4 * 8/5,I'm not sure why we need to find a common deno...,Neutral,Curious,Let's go through it step-by-step.,3
4,7/4 * 8/5,Here's what I tried:,Neutral,Explaining,Let's go through it step-by-step.,3


In [3]:
os.environ["REPLICATE_API_TOKEN"] = "r8_VTIlJKo4ybjVLEMSa2BCg7HiYNOK9UA1BtOsc"

In [4]:
# Ensure NLTK components are downloaded
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chinu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# # Helper function to simulate chatbot response using the deployed chatbot
# def simulate_chatbot_response(user_input):
#     # Use Replicate API to get the chatbot's response
#     model = replicate.models.get("meta/llama-2-7b-chat")
#     version = model.versions.get("latest")
#     response = version.predict(prompt=user_input)
#     return response["generated_text"] if "generated_text" in response else "Error in response"


In [6]:
# def simulate_chatbot_response(user_input):
#     try:
#         # Fetch the model and version
#         model = replicate.models.get("meta/llama-2-7b-chat")
#         version = model.versions.get("latest")
        
#         # Generate response
#         response = version.predict(prompt=user_input)
#         return response.get("generated_text", "Error: No generated_text found in response")
#     except replicate.exceptions.ReplicateError as e:
#         return f"ReplicateError occurred: {e}"
#     except Exception as e:
#         return f"An unexpected error occurred: {e}"


def simulate_chatbot_response(user_input):
    """
    Simulate the chatbot's response using the LLAMA 2 model.
    """
    try:
        response_text = ""
        for event in replicate.stream(
            "meta/llama-2-7b-chat",
            input={"prompt": user_input},  # Adjust input key if required
        ):
            response_text += str(event)
        return response_text.strip()
    except replicate.exceptions.ReplicateError as e:
        return f"ReplicateError occurred: {e}"
    except Exception as e:
        return f"An unexpected error occurred: {e}"

In [7]:
# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

In [8]:
# Metrics storage
bleu_scores = []
rouge_scores = {"rouge1": [], "rouge2": [], "rougeL": []}
sentiment_matches = 0

In [14]:
def evaluate_chatbot(dataset):
    global sentiment_matches
    for index, row in dataset.iterrows():
        user_input = row['User Input']
        expected_response = row['Expected Response']
        expected_sentiment = row['Expected Sentiment']
#         print(f"User Input: {user_input}")
        
#         print(f"Expected Response: {expected_response}")


        # Simulate chatbot response
        chatbot_response = simulate_chatbot_response(user_input)
#         print(f"Chatbot Response: {chatbot_response}")

        # Calculate BLEU score
        reference = nltk.word_tokenize(expected_response.lower())
        candidate = nltk.word_tokenize(chatbot_response.lower())
        bleu = sentence_bleu([reference], candidate)
        bleu_scores.append(bleu)

        # Calculate ROUGE scores
        rouge = scorer.score(expected_response, chatbot_response)
        for key in rouge_scores:
            rouge_scores[key].append(rouge[key].fmeasure)

        # Sentiment analysis
        detected_sentiment = "Positive" if TextBlob(chatbot_response).sentiment.polarity > 0 else "Negative"
        if detected_sentiment == expected_sentiment:
            sentiment_matches += 1


In [17]:
sample_data = chatbot_dataset.head(10)

In [18]:
# Run evaluation
evaluate_chatbot(sample_data)

In [21]:
# Summarize results
average_bleu = sum(bleu_scores) / len(bleu_scores)
average_rouge = {key: sum(values) / len(values) for key, values in rouge_scores.items()}
sentiment_accuracy = (sentiment_matches / len(chatbot_dataset)) * 100

# Print results
print("Evaluation Results:")
print(f"Average BLEU Score: {average_bleu}")
print("Average ROUGE Scores:")
for key, value in average_rouge.items():
    print(f"  {key}: {value}")
print(f"Sentiment Alignment Accuracy: {sentiment_accuracy}%")

Evaluation Results:
Average BLEU Score: 5.660657563081191e-81
Average ROUGE Scores:
  rouge1: 0.027206853231349897
  rouge2: 0.0021046246717091525
  rougeL: 0.022797902749095503
Sentiment Alignment Accuracy: 6.583333333333333%


In [22]:
#Experiment with different ways of phrasing the input prompts to improve the chatbot's responses. 
#This approach leverages natural language's flexibility to guide the model's outputs.

In [23]:
# def evaluate_prompts(dataset, prompt_variations):
#     """
#     Evaluate different prompt variations against the test dataset.
#     """
#     results = {}
    
#     for prompt_template in prompt_variations:
#         bleu_scores = []
#         rouge_scores = {"rouge1": [], "rouge2": [], "rougeL": []}
#         sentiment_matches = 0
        
#         for index, row in dataset.iterrows():
#             user_input = row["User Input"]
#             expected_response = row["Expected Response"]
#             expected_sentiment = row["Expected Sentiment"]
            
#             # Apply prompt template
#             prompt = prompt_template.format(user_input=user_input)
            
#             # Simulate chatbot response
#             chatbot_response = simulate_chatbot_response(prompt)
            
#             # BLEU score
#             reference = nltk.word_tokenize(expected_response.lower())
#             candidate = nltk.word_tokenize(chatbot_response.lower())
#             bleu = sentence_bleu([reference], candidate)
#             bleu_scores.append(bleu)
            
#             # ROUGE scores
#             rouge = scorer.score(expected_response, chatbot_response)
#             for key in rouge_scores:
#                 rouge_scores[key].append(rouge[key].fmeasure)
            
#             # Sentiment alignment
#             detected_sentiment = (
#                 "Positive" if TextBlob(chatbot_response).sentiment.polarity > 0 else "Negative"
#             )
#             if detected_sentiment == expected_sentiment:
#                 sentiment_matches += 1
        
#         # Store results for this prompt
#         results[prompt_template] = {
#             "Average BLEU": sum(bleu_scores) / len(bleu_scores),
#             "Average ROUGE": {key: sum(scores) / len(scores) for key, scores in rouge_scores.items()},
#             "Sentiment Accuracy": sentiment_matches / len(dataset) * 100,
#         }
    
#     return results

# # Define prompt variations
# prompt_variations = [
#     "{user_input}",
#     "Explain step-by-step: {user_input}",
#     "I struggle with this topic: {user_input}. Can you help?",
#     "Help me reduce my anxiety and understand: {user_input}",
# ]

# # Example usage
# results = evaluate_prompts(chatbot_dataset, prompt_variations)
# print("Prompt Engineering Results:", results)


In [24]:
def evaluate_prompts(dataset, prompt_variations):
    """
    Evaluate different prompt templates and compare their performance.
    """
    results = {}
    
    for prompt_template in prompt_variations:
        bleu_scores = []
        rouge_scores = {"rouge1": [], "rouge2": [], "rougeL": []}
        sentiment_matches = 0
        
        for _, row in dataset.iterrows():
            user_input = row["User Input"]
            expected_response = row["Expected Response"]
            expected_sentiment = row["Expected Sentiment"]
            
            # Apply prompt template
            prompt = prompt_template.format(user_input=user_input)
            
            # Get chatbot response
            chatbot_response = simulate_chatbot_response(prompt)
#             print(f"Prompt: {prompt} => Response: {chatbot_response}")
            
            # BLEU score
            reference = nltk.word_tokenize(expected_response.lower())
            candidate = nltk.word_tokenize(chatbot_response.lower())
            bleu = sentence_bleu([reference], candidate)
            bleu_scores.append(bleu)
            
            # ROUGE scores
            rouge = scorer.score(expected_response, chatbot_response)
            for key in rouge_scores:
                rouge_scores[key].append(rouge[key].fmeasure)
            
            # Sentiment alignment
            detected_sentiment = (
                "Positive" if TextBlob(chatbot_response).sentiment.polarity > 0 else "Negative"
            )
            if detected_sentiment == expected_sentiment:
                sentiment_matches += 1

        # Store average metrics for the prompt template
        results[prompt_template] = {
            "Average BLEU": sum(bleu_scores) / len(bleu_scores),
            "Average ROUGE-1": sum(rouge_scores["rouge1"]) / len(rouge_scores["rouge1"]),
            "Average ROUGE-2": sum(rouge_scores["rouge2"]) / len(rouge_scores["rouge2"]),
            "Average ROUGE-L": sum(rouge_scores["rougeL"]) / len(rouge_scores["rougeL"]),
            "Sentiment Accuracy": (sentiment_matches / len(dataset)) * 100,
        }
    return results


prompt_variations = [
    "Explain step-by-step how to handle this: {user_input}",
    "I feel nervous about {user_input}. Can you help?",
    "Please provide a simple explanation for: {user_input}",
    "How do I solve {user_input}? Give an easy method.",
]

# Evaluate prompts
prompt_results = evaluate_prompts(sample_data, prompt_variations)
for template, metrics in prompt_results.items():
    print(f"Template: {template}")
    for metric, value in metrics.items():
        print(f"  {metric}: {value}")


Template: Explain step-by-step how to handle this: {user_input}
  Average BLEU: 2.8521174323139885e-156
  Average ROUGE-1: 0.03496420008801025
  Average ROUGE-2: 0.00523020237789709
  Average ROUGE-L: 0.02604780761728564
  Sentiment Accuracy: 10.0
Template: I feel nervous about {user_input}. Can you help?
  Average BLEU: 3.864939236718779e-156
  Average ROUGE-1: 0.03846004314325603
  Average ROUGE-2: 0.0031747799933162526
  Average ROUGE-L: 0.029041189765154247
  Sentiment Accuracy: 10.0
Template: Please provide a simple explanation for: {user_input}
  Average BLEU: 1.6916991823740183e-156
  Average ROUGE-1: 0.0439910720800359
  Average ROUGE-2: 0.001351351351351351
  Average ROUGE-L: 0.029925088166378584
  Sentiment Accuracy: 10.0
Template: How do I solve {user_input}? Give an easy method.
  Average BLEU: 2.809547325219266e-156
  Average ROUGE-1: 0.03122744171629354
  Average ROUGE-2: 0.003058883604658253
  Average ROUGE-L: 0.021046222594307797
  Sentiment Accuracy: 10.0


In [25]:
#Sentiment Analysis Variations

In [26]:
def evaluate_with_without_sentiment(dataset, use_sentiment=True):
    """
    Evaluate the chatbot with and without sentiment analysis.
    """
    bleu_scores = []
    rouge_scores = {"rouge1": [], "rouge2": [], "rougeL": []}
    sentiment_matches = 0
    
    for index, row in dataset.iterrows():
        user_input = row["User Input"]
        expected_response = row["Expected Response"]
        expected_sentiment = row["Expected Sentiment"]
        
        # Simulate chatbot response
        chatbot_response = simulate_chatbot_response(user_input)
        
        # BLEU score
        reference = nltk.word_tokenize(expected_response.lower())
        candidate = nltk.word_tokenize(chatbot_response.lower())
        bleu = sentence_bleu([reference], candidate)
        bleu_scores.append(bleu)
        
        # ROUGE scores
        rouge = scorer.score(expected_response, chatbot_response)
        for key in rouge_scores:
            rouge_scores[key].append(rouge[key].fmeasure)
        
        if use_sentiment:
            # Sentiment alignment
            detected_sentiment = (
                "Positive" if TextBlob(chatbot_response).sentiment.polarity > 0 else "Negative"
            )
            if detected_sentiment == expected_sentiment:
                sentiment_matches += 1
    
    return {
        "Average BLEU": sum(bleu_scores) / len(bleu_scores),
        "Average ROUGE": {key: sum(scores) / len(scores) for key, scores in rouge_scores.items()},
        "Sentiment Accuracy": sentiment_matches / len(dataset) * 100 if use_sentiment else "N/A",
    }

# Example usage
results_with_sentiment = evaluate_with_without_sentiment(sample_data, use_sentiment=True)
results_without_sentiment = evaluate_with_without_sentiment(sample_data, use_sentiment=False)

print("Results with Sentiment Analysis:", results_with_sentiment)
print("Results without Sentiment Analysis:", results_without_sentiment)


Results with Sentiment Analysis: {'Average BLEU': 6.867738152376117e-232, 'Average ROUGE': {'rouge1': 0.022627504055925278, 'rouge2': 0.0, 'rougeL': 0.01675928168118266}, 'Sentiment Accuracy': 10.0}
Results without Sentiment Analysis: {'Average BLEU': 1.7552116553749679e-156, 'Average ROUGE': {'rouge1': 0.02246129729009667, 'rouge2': 0.0013422818791946306, 'rougeL': 0.01725748363401081}, 'Sentiment Accuracy': 'N/A'}
