In [25]:
!pip3 install numpy scikit-learn nltk gensim python-Levenshtein levenshtein openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [33]:
# Setup, sample generation, and output-similarity metrics (cosine, BLEU, Jaccard, sequence-matcher distance)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from difflib import SequenceMatcher
from dotenv import load_dotenv
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
import numpy as np
import os

# Read key/value pairs from the local ".env" file into the process environment.
load_dotenv(".env")

# One shared client for every request made in this notebook; the key is taken
# from the environment rather than being written into the notebook.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def chat_completion(
    model, messages, max_tokens=500, temperature=0.7, stream=False, on_stream=None
):
    """Send a chat request through the module-level ``openai_client``.

    Args:
        model: Model name forwarded to the API.
        messages: List of chat message dicts forwarded to the API.
        max_tokens: Forwarded to the API as ``max_tokens``.
        temperature: Forwarded to the API as ``temperature``.
        stream: When true, the response is consumed chunk by chunk.
        on_stream: Optional callable invoked with each non-empty text fragment
            while streaming. It may be left as ``None``.

    Returns:
        When ``stream`` is true: the concatenated response text (``str``).
        Otherwise: a ``(text, completion)`` tuple, where ``text`` is the
        stripped message content (``""`` if the content is ``None``) and
        ``completion`` is the raw response object.

    Note:
        The two modes intentionally keep their different return shapes so that
        existing callers continue to work.
    """
    completion = openai_client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        stream=stream,
    )

    if stream:
        pieces = []
        for chunk in completion:
            # Some chunks carry no choices at all; skip them instead of indexing into [].
            if not chunk.choices:
                continue
            piece = chunk.choices[0].delta.content
            if not piece:
                continue
            # The callback is optional: streaming without one must not crash.
            if on_stream is not None:
                on_stream(piece)
            pieces.append(piece)

        return "".join(pieces)

    # The message content can be None; treat that as empty text rather than
    # failing on None.strip().
    content = completion.choices[0].message.content
    return (content or "").strip(), completion

# GENERATE OUTPUTS

# Sampling setup: how many completions to request, which model, and the shared prompt.
n = 5
model = "gpt-3.5-turbo"
prompts = [{"role": "system", "content": "write a poem about france"}]


def fetch_completion(index):
    """Request one completion for the shared prompt.

    `index` is accepted so the function can be submitted once per request,
    but it is not used in the body.
    """
    text, _raw = chat_completion(model, prompts, temperature=1)
    return text


# Submit all n requests at once and collect results in the order they finish,
# which is not necessarily the order they were submitted.
outputs = []
with ThreadPoolExecutor(max_workers=n) as executor:
    future_to_completion = {}
    for request_index in range(n):
        future_to_completion[executor.submit(fetch_completion, request_index)] = request_index
    for finished in concurrent.futures.as_completed(future_to_completion):
        outputs.append(finished.result())

print("[+] Generated outputs:", outputs)

# Range: 0 - 1
def average_cosine_similarity(outputs):
    """Return the mean pairwise TF-IDF cosine similarity between `outputs`.

    Each text is vectorised with a freshly fitted `TfidfVectorizer`, the full
    similarity matrix is computed, and the diagonal (each text compared with
    itself) is left out of the average.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform(outputs)
    similarity = cosine_similarity(tfidf_matrix)

    # Zero the diagonal so self-similarity contributes nothing to the total.
    np.fill_diagonal(similarity, 0)

    # Average over the off-diagonal cells only.
    off_diagonal_cells = similarity.size - len(outputs)
    return np.sum(similarity) / off_diagonal_cells

# Report the average off-diagonal TF-IDF cosine similarity of the generated outputs.
print(average_cosine_similarity(outputs))

# BLEU Scores

# Range: 0 - 1
def average_bleu_scores(outputs):
    """Return the mean leave-one-out BLEU score across `outputs`.

    Each output is scored as the candidate against all the other outputs as
    references (smoothing method 4), and the per-candidate scores are averaged.

    Args:
        outputs: List of generated texts.

    Returns:
        The average sentence-level BLEU score, or NaN when fewer than two
        outputs are given (there is nothing to use as a reference).
    """
    if len(outputs) < 2:
        return float("nan")

    smoothie = SmoothingFunction().method4  # Using smoothing method 4 as an example

    # Tokenize every text exactly once; each tokenization is reused both as a
    # candidate and as a reference for the other candidates.
    tokenized = [word_tokenize(text) for text in outputs]

    scores = []
    for i, tokenized_candidate in enumerate(tokenized):
        # References are all outputs except the candidate itself.
        tokenized_references = tokenized[:i] + tokenized[i + 1 :]
        scores.append(
            sentence_bleu(
                tokenized_references, tokenized_candidate, smoothing_function=smoothie
            )
        )

    return sum(scores) / len(scores)


# Report the average leave-one-out BLEU score of the generated outputs.
print(average_bleu_scores(outputs))

# Jaccard Similarity

def average_jaccard_similarity(outputs):
    """Return the mean pairwise Jaccard similarity between `outputs`.

    Each text is reduced to the set of its whitespace-separated tokens; every
    unordered pair is scored as |intersection| / |union| and the scores are
    averaged. Range: 0 - 1.

    Args:
        outputs: List of generated texts.

    Returns:
        The average Jaccard similarity over all pairs, or NaN when there are
        fewer than two outputs (no pairs to compare).
    """
    # Build each token set once instead of re-splitting inside the pair loop.
    token_sets = [set(text.split()) for text in outputs]

    scores = []
    for i in range(len(token_sets)):
        for j in range(i + 1, len(token_sets)):
            union = token_sets[i] | token_sets[j]
            if not union:
                # Both texts are empty/whitespace-only: they are identical, and
                # dividing by len(union) == 0 would raise ZeroDivisionError.
                scores.append(1.0)
                continue
            scores.append(len(token_sets[i] & token_sets[j]) / len(union))

    if not scores:
        return float("nan")
    return sum(scores) / len(scores)

# Report the average pairwise Jaccard similarity of the generated outputs.
print(average_jaccard_similarity(outputs))

# Levenshtein Distance (Edit Distance)

# def average_levenshtein_distance(outputs):
#     scores = []
#     for i in range(len(outputs)):
#         for j in range(i+1, len(outputs)):
#             dist = levenshtein_distance(outputs[i], outputs[j])
#             max_len = max(len(outputs[i]), len(outputs[j]))
#             normalized_dist = dist / max_len
#             scores.append(normalized_dist)

#     return np.mean(scores)

# print(average_levenshtein_distance(outputs))

def average_sequence_matcher_distance(outputs, autojunk=False):
    """Return the mean pairwise `SequenceMatcher` distance between `outputs`.

    Each unordered pair is compared character by character and scored as
    ``1 - ratio()``, so 0 means identical and 1 means nothing matched.

    Args:
        outputs: List of generated texts.
        autojunk: Forwarded to `SequenceMatcher`. It defaults to False here
            because the library's automatic junk heuristic (applied when the
            second sequence has 200 or more items) treats frequently occurring
            characters as junk, which makes long, similar texts look far more
            different than they are. Pass True to get the heuristic back.

    Returns:
        The average distance over all pairs, or NaN when there are fewer than
        two outputs (no pairs to compare).
    """
    scores = [
        # Subtract from 1 to represent distance
        1 - SequenceMatcher(None, outputs[i], outputs[j], autojunk=autojunk).ratio()
        for i in range(len(outputs))
        for j in range(i + 1, len(outputs))
    ]

    if not scores:
        # Avoid np.mean([]), which warns and yields NaN anyway.
        return float("nan")
    return np.mean(scores)


# Report the average pairwise SequenceMatcher distance (1 - ratio) of the generated outputs.
print(average_sequence_matcher_distance(outputs))

[+] Generated outputs: ["Oh France, land of beauty and romance,\nWhere vineyards stretch and sunflowers dance.\nWith Paris as your dazzling crown,\nYour history and culture astound.\n\nThe Eiffel Tower soars up high,\nA symbol of your strength and pride.\nFrom Mont Saint-Michel to the Riviera's coast,\nYour landscapes captivate and boast.\n\nIn Provence, the lavender blooms,\nPerfuming the air with sweet perfumes.\nThe Louvre holds treasures untold,\nStories of the past carefully unfold.\n\nYour language, a melody so sweet,\nSpoken by lovers as they meet.\nCuisine that delights the senses,\nFrom croissants to coq au vin, the choices are endless.\n\nRevolution, art, and fashion trends,\nYour influence on the world never ends.\nFrom Monet to Chanel, you inspire,\nCreativity and beauty you never tire.\n\nOh France, with your charm and grace,\nIn every corner, there's a new embrace.\nA land of passion, a land of art,\nForever etched in every heart.", "Oh France, land of beauty and grace\nA