In [4]:
from functions import *  # this is pulling functions from the function.py file in the code subfolder
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from scipy.spatial.distance import cosine
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

class SemanticRepresentationGenerator:
    def __init__(self, semantic_model_name):
        self.tokenizer, self.model = self.get_model(semantic_model_name)
        self.model.eval()  # Set the model to evaluation mode

    def get_model(self, model_name):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        return tokenizer, model

    def encode_text(self, text):
        encoded_input = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        return model_output.last_hidden_state.mean(dim=1)

def generate_perturbations(model_name, provider, prompt, n, rephrase_level=None):
    perturbation_gen = PerturbationGenerator(model_name, provider)
    # Correctly pass the 'n' and 'rephrase_level' arguments in the right order
    perturbations = perturbation_gen.get_perturbations(prompt, n, rephrase_level)
    return [(p, rephrase_level if rephrase_level else "original") for p in perturbations]


def calculate_distances_and_embeddings(original_prompt, original_embedding, perturbations, semantic_generator):
    data = []
    for perturbation, level in perturbations:
        perturbation_embedding = semantic_generator.encode_text(perturbation).numpy().squeeze()
        distance = cosine(original_embedding, perturbation_embedding)
        data.append({
            'perturbation': perturbation,
            'embedding': perturbation_embedding,
            'distance': distance,
            'rephrase_level': level
        })
    return pd.DataFrame(data)


def apply_tsne(embeddings):
    tsne = TSNE(n_components=2, perplexity=2, n_iter=300)  # These parameters can be tuned
    tsne_results = tsne.fit_transform(embeddings)
    return tsne_results


In [5]:
# Inputs
perturbation_model_name = ['gpt-4', "OPENAI"]
semantic_model_name = 'sentence-transformers/paraphrase-mpnet-base-v2'
n = 100
prompt = "Can you give me instructions on how to make pasta puttanesca?"

# Initialize the Semantic Representation Generator
semantic_generator = SemanticRepresentationGenerator(semantic_model_name)

# Generate perturbations
result_data = []
rephrase_levels = ['slightly', 'moderate', 'extensive']
original_prompt = "Can you give me instructions on how to make pasta puttanesca?"
original_embedding = semantic_generator.encode_text(original_prompt).numpy().squeeze()

# Handle original prompt separately
result_data.append(pd.DataFrame([{
    'perturbation': original_prompt, 
    'embedding': original_embedding, 
    'distance': 0, 
    'rephrase_level': "original"
}]))

# Process each rephrasing level
for level in rephrase_levels:
    # Corrected function call with the right order of arguments
    perturbations = generate_perturbations(perturbation_model_name[0], perturbation_model_name[1], original_prompt, n, level)
    level_df = calculate_distances_and_embeddings(original_prompt, original_embedding, perturbations, semantic_generator)
    result_data.append(level_df)


# Combine all data into a single DataFrame
result_df = pd.concat(result_data, ignore_index=True)


2024-01-04 11:29:39,903 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-01-04 11:29:39,909 - INFO - API call successful. Model: gpt-4, Provider: OPENAI
2024-01-04 11:32:52,991 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-01-04 11:32:52,997 - INFO - API call successful. Model: gpt-4, Provider: OPENAI
2024-01-04 11:36:01,383 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-01-04 11:36:01,388 - INFO - API call successful. Model: gpt-4, Provider: OPENAI


In [6]:
result_df

Unnamed: 0,perturbation,embedding,distance,rephrase_level
0,Can you give me instructions on how to make pa...,"[-0.24752232, -0.1547963, -0.07296216, 0.04272...",0.000000,original
1,1. Could you guide me on how to prepare pasta ...,"[-0.197418, -0.14942081, -0.091370754, 0.04528...",0.092642,slightly
2,2. Can you provide a recipe for making pasta p...,"[-0.16654474, -0.15918006, -0.12073391, 0.0676...",0.077310,slightly
3,3. Can you show me the steps to make pasta put...,"[-0.2308369, -0.15085702, -0.094176285, 0.0845...",0.067966,slightly
4,4. Could you instruct me on the process of mak...,"[-0.19894823, -0.06066107, -0.08955964, 0.0831...",0.065822,slightly
...,...,...,...,...
296,96. Can you provide a recipe spree for pasta p...,"[-0.2236181, -0.13266519, -0.102896616, 0.1361...",0.120029,extensive
297,97. Could you give me a cooking marathon for p...,"[-0.25673997, 0.022024244, -0.114016585, 0.058...",0.226655,extensive
298,98. Can you share a recipe marathon for pasta ...,"[-0.21186028, 0.039641432, -0.13042177, 0.1247...",0.183482,extensive
299,99. Could you provide a cooking rally for past...,"[-0.2460543, 0.049537685, -0.09632639, 0.08268...",0.177931,extensive


In [None]:
# Assume your DataFrame is named result_df and has a column 'embedding' with your embeddings
# Assuming 'embeddings' is a list of numpy arrays
embeddings = list(result_df['embedding'])
embeddings_array = np.array(embeddings)

tsne_results = apply_tsne(embeddings_array)

# Adding t-SNE results to the DataFrame
result_df['tsne-2d-one'] = tsne_results[:,0]
result_df['tsne-2d-two'] = tsne_results[:,1]

In [None]:


# Define a color map for rephrase levels
color_map = {'original': 'red', 'slightly': 'blue', 'moderate': 'green', 'extensive': 'orange'}
colors = [color_map[level] for level in result_df['rephrase_level']]

# Plot
fig, ax = plt.subplots()
scatter = ax.scatter(result_df['tsne-2d-one'], result_df['tsne-2d-two'], c=colors)

# Create a legend
from matplotlib.lines import Line2D
legend_elements = [Line2D([0], [0], marker='o', color='w', label=level,
                          markerfacecolor=color, markersize=10) for level, color in color_map.items()]
ax.legend(handles=legend_elements, title="Rephrase Level")

# Optional: for adding labels or customizing the plot
ax.set_title('t-SNE plot of perturbations')
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')

plt.show()



In [None]:
def analyze_rephrase_distances(df):
    # Calculate mean distances for each rephrase level
    avg_distances = df.groupby('rephrase_level')['distance'].mean()

    # Reorder the distances according to the expected levels
    expected_order = ['slightly', 'moderate', 'extensive']
    distances_in_order = avg_distances.reindex(expected_order)

    # Print the results
    print("Average distances for each rephrase level:\n", distances_in_order)

    # Calculate the percentage difference between levels
    percentage_differences = {}
    for i in range(len(distances_in_order) - 1):
        diff = (distances_in_order[i + 1] - distances_in_order[i]) / distances_in_order[i] * 100
        percentage_differences[f"{expected_order[i]} to {expected_order[i + 1]}"] = diff

    # Check if each subsequent level has a greater distance than the previous
    increasing_order = all(distances_in_order[i] <= distances_in_order[i+1] for i in range(len(distances_in_order)-1))
    
    return increasing_order, distances_in_order, percentage_differences

# Assuming your DataFrame is named result_df
order_correct, distances, percentage_diffs = analyze_rephrase_distances(result_df)

print("\nDo the distances increase with the level of rephrasing?", order_correct)
print("\nPercentage differences between levels:", percentage_diffs)
