In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Load environment variables from the .env file that find_dotenv() locates.
# OPENAI_API_KEY is expected to be among them; it is read with os.getenv below.
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

True

In [3]:
import os
import openai
from retry import retry

# Hand the key to the openai client. os.getenv returns None when the variable
# is unset, so a missing key is not reported at this point.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [4]:
from datasets import load_dataset

# Load the "sst2" dataset; its "train" and "validation" splits are used below.
dataset = load_dataset("sst2")

Found cached dataset sst2 (/home/survival/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Train split: pool of labelled sentences the few-shot examples are drawn from.
trn_data = dataset["train"]
# Validation split: the sentences that get classified and scored below.
val_data = dataset["validation"]

In [6]:
# Unpack the training split into two parallel lists: raw sentences and
# human-readable labels. Label id 0 becomes 'negative'; every other id is
# treated as 'positive'.
sents_trn = []
labels_trn = []
for example in trn_data:
    sents_trn.append(example["sentence"])
    labels_trn.append('negative' if example["label"] == 0 else 'positive')

In [7]:
# Unpack the validation split into two parallel lists: raw sentences and
# human-readable gold labels. Label id 0 becomes 'negative'; every other id
# is treated as 'positive'.
sents = []
labels = []
for example in val_data:
    sents.append(example["sentence"])
    labels.append('negative' if example["label"] == 0 else 'positive')

In [8]:
import pandas as pd

# All training sentences with their readable labels in one frame.
results = pd.DataFrame({"sentence": sents_trn, "label": labels_trn})

# One sub-frame per class. Both keep the index labels of `results`, so rows
# must be addressed by position (.iloc) to line up with per-class embeddings.
label_column = results["label"]
results_positive = results[label_column == "positive"]
results_negative = results[label_column == "negative"]

In [9]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding client; my_classifier uses it to embed each query sentence.
# Constructed with default settings (no model or key passed explicitly).
embedder = OpenAIEmbeddings()

In [10]:
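# One-off step: embeds the training sentences of each class and caches the result as pickles under ./outputs/.
# Uncomment and run it only when those cache files do not exist yet.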
# embeddings_positive = embedder.embed_documents(list(results_positive["sentence"]))
# embeddings_negative = embedder.embed_documents(list(results_negative["sentence"]))
# df_embeddings_positive = pd.DataFrame(embeddings_positive)
# df_embeddings_negative = pd.DataFrame(embeddings_negative)
# df_embeddings_positive.to_pickle("./outputs/openai_embeddings_positive.pkl")
# df_embeddings_negative.to_pickle("./outputs/openai_embeddings_negative.pkl")

In [11]:
def _load_or_embed(sentences, cache_path):
    """Return the embeddings of ``sentences`` as a DataFrame, using a pickle cache.

    Args:
        sentences: Iterable of sentences to embed, one embedding row each.
        cache_path: Pickle file holding (or to hold) the embeddings frame.

    Returns:
        The frame read from ``cache_path`` when that file exists; otherwise a
        frame freshly computed with ``embedder.embed_documents`` and written
        to ``cache_path`` before being returned.
    """
    if os.path.exists(cache_path):
        # NOTE: unpickling can execute arbitrary code -- only load cache files
        # that were produced by this notebook.
        return pd.read_pickle(cache_path)
    # Cache miss: this sends every sentence to the embedding API.
    frame = pd.DataFrame(embedder.embed_documents(list(sentences)))
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    frame.to_pickle(cache_path)
    return frame


# Rows follow the order of results_positive / results_negative.
df_embeddings_positive = _load_or_embed(
    results_positive["sentence"], "./outputs/openai_embeddings_positive.pkl")
df_embeddings_negative = _load_or_embed(
    results_negative["sentence"], "./outputs/openai_embeddings_negative.pkl")

In [12]:
# One plain list of floats per sentence, in frame row order.
# .values.tolist() converts the whole frame in a single pass instead of
# materialising a Series per row through .iloc.
embeddings_positive = df_embeddings_positive.values.tolist()
embeddings_negative = df_embeddings_negative.values.tolist()

In [13]:
# Embedding dimensionality, read off the first positive vector; every
# AnnoyIndex below is created with this size.
f = len(embeddings_positive[0])

In [14]:
from annoy import AnnoyIndex

# Index over the positive embeddings using the 'angular' metric.
# The item id is the vector's position in embeddings_positive.
t_positive = AnnoyIndex(f, 'angular')
for item_id, vector in enumerate(embeddings_positive):
    t_positive.add_item(item_id, vector)

In [15]:
# Index over the negative embeddings using the 'angular' metric.
# The item id is the vector's position in embeddings_negative.
t_negative = AnnoyIndex(f, 'angular')
for item_id, vector in enumerate(embeddings_negative):
    t_negative.add_item(item_id, vector)

In [16]:
# Build the search structure of each index.
# NOTE(review): 1000 looks hand-picked -- presumably trading build time and
# index size for recall; confirm before changing.
t_positive.build(1000) # 1000 trees
t_negative.build(1000) # 1000 trees

True

In [17]:
# Write both built indexes to disk, using paths relative to the working directory.
t_positive.save('t_positive.ann')
t_negative.save('t_negative.ann')

True

In [18]:
# Re-open the saved indexes into fresh AnnoyIndex objects created with the
# same dimensionality and metric; my_classifier queries these u_* handles.
u_positive = AnnoyIndex(f, 'angular')
u_positive.load('t_positive.ann')
u_negative = AnnoyIndex(f, 'angular')
u_negative.load('t_negative.ann')

True

In [20]:
@retry(tries=10, delay=3)
def my_classifier(text: str, n_examples: int) -> str:
    """Classify the sentiment of ``text`` with a few-shot gpt-3.5-turbo prompt.

    The query is embedded, its nearest training sentences are looked up in the
    positive and in the negative Annoy index, and those sentences are placed
    in the prompt as labelled examples (alternating positive, negative) ahead
    of the question about ``text``.

    Args:
        text: Sentence to classify.
        n_examples: Number of neighbours requested from *each* index, so up to
            ``2 * n_examples`` examples end up in the prompt.

    Returns:
        The content of the single token the model generates; the logit bias
        steers it towards `positive` or `negative`.

    Note:
        The whole function is wrapped in ``@retry``: a failure re-runs all of
        it (embedding request included) up to 10 times, 3 seconds apart.
    """
    query_embedding = embedder.embed_query(text)
    neighbors_positive = u_positive.get_nns_by_vector(query_embedding,
                                                      n_examples)
    neighbors_negative = u_negative.get_nns_by_vector(query_embedding,
                                                      n_examples)

    prompt_parts = [
        "Your task is to determine what is the sentiment conveyed by a text. Here are some examples:\n\n"
    ]
    # Walk the neighbours actually returned rather than range(n_examples): an
    # index may yield fewer items than requested, and indexing past the end
    # would raise an IndexError that @retry would only repeat. zip keeps the
    # two classes balanced by stopping at the shorter list.
    for pos_idx, neg_idx in zip(neighbors_positive, neighbors_negative):
        # Annoy item ids are row positions in the per-class frames, hence .iloc.
        for example in (results_positive.iloc[pos_idx],
                        results_negative.iloc[neg_idx]):
            prompt_parts.append(
                f"Text: '{example['sentence']}'\n"
                f"The sentiment conveyed is: {example['label']}\n\n")
    prompt_parts.append(
        "Given the following text, what is the sentiment conveyed?:\n\n"
        f"Text: '{text}'\nThe sentiment conveyed is:")
    prompt = "".join(prompt_parts)

    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=[{
            'role': 'system',
            'content': 'You are a helpful sentiment analyzer assistant of movie reviews'
        }, {
            'role': 'user',
            'content': prompt
        }],
        logit_bias={
            '31587': 100,  # 31587 is the token for `positive`
            '43324': 100  # 43324 is the token for `negative`
        },
        # A single token is enough: the answer is one of the two biased tokens.
        max_tokens=1,
        temperature=0)
    return response.choices[0].message.content

In [21]:
# i = 0
# sents[i], labels[i], my_classifier(sents[i], n_examples=2)

In [22]:
import time
import numpy as np

In [23]:
# Neighbours requested from each class index per query, i.e. up to
# 2 * n_examples few-shot examples in every prompt.
n_examples = 3

In [24]:
# Classify every validation sentence, one after another, and time the pass.
# Each my_classifier call issues an embedding request and a chat completion.
start_time = time.time()
preds = [my_classifier(sent, n_examples) for sent in sents]
end_time = time.time()

In [25]:
# Elapsed wall-clock seconds for the full validation pass.
end_time - start_time

1194.4010379314423

In [26]:
# Accuracy: share of validation sentences whose prediction equals the gold label.
np.mean([pred == gold for pred, gold in zip(preds, labels)])

0.9552752293577982

In [27]:
# Validation sentences with gold labels and model predictions side by side.
partial = pd.DataFrame({"sentence": sents, "label": labels, "preds": preds})

In [28]:
# Keep only the rows where the prediction disagrees with the gold label.
mistakes = partial[partial["label"] != partial["preds"]]

In [29]:
# Persist the misclassified rows for inspection. The neighbour count in the
# file name comes from n_examples, so a run with a different setting does not
# overwrite or mislabel this one's output.
mistakes.to_csv(f"./outputs/mistakes_{n_examples}_neighbors_openai_embeddings.csv", index=False)