In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Load variables from a .env file into the process environment.
# OPENAI_API_KEY is expected to be defined there (presumably read by the
# openai client used in later cells).
from dotenv import load_dotenv
load_dotenv() 

True

In [3]:
import tiktoken
import openai

In [4]:
# Look up the token ids of the two label words under the gpt-3.5-turbo
# encoding; the displayed tuple is (ids for "positive", ids for "negative").
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
tuple(enc.encode(word) for word in ("positive", "negative"))

([31587], [43324])

In [5]:
# Load the "sst2" dataset through the Hugging Face `datasets` library
# (https://github.com/huggingface/datasets).
# If the package is missing, install it first: !pip install datasets
from datasets import load_dataset

dataset = load_dataset("sst2")

Found cached dataset sst2 (/home/survival/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Keep only the validation split and display its summary.
val_data = dataset["validation"]
val_data

Dataset({
    features: ['idx', 'sentence', 'label'],
    num_rows: 872
})

In [7]:
# Collect the validation sentences and their label names as parallel lists.
# Label id 0 maps to 'negative'; any other id is treated as 'positive'.
# Iterating the dataset directly reads each row once instead of indexing
# val_data twice per example.
sents, labels = [], []
for row in val_data:
    sents.append(row["sentence"])
    labels.append('negative' if row["label"] == 0 else 'positive')

In [8]:
# Pick the example at index 5 (sentence and its label) to experiment with.
sent, label = sents[5], labels[5]
sent, label

('although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women . ',
 'positive')

In [9]:
# repr() makes whitespace explicit, e.g. the trailing space in this sentence.
repr(sent)

"'although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women . '"

In [10]:
# Token ids of the example sentence under the encoding loaded above.
enc.encode(sent)

[37241,
 326,
 4535,
 449,
 28485,
 323,
 264,
 2478,
 81697,
 5092,
 29727,
 1174,
 279,
 4632,
 374,
 264,
 10625,
 11559,
 6129,
 1427,
 520,
 3995,
 3278,
 662,
 220]

In [11]:
def my_classifier(text: str) -> str:
    """Classify the sentiment of ``text`` with one chat-completion request.

    The system message asks whether the text is positive or negative, and
    ``text`` is sent as the user message. Both label tokens receive a logit
    bias of 100 and the reply is capped at a single token, so the reply is
    expected to be one of the two label words.

    Args:
        text: The sentence to classify.

    Returns:
        The message content of the first choice in the response.
    """
    system_message = {
        'role': 'system',
        'content': 'Given this text, what is the sentiment conveyed? Is it positive or negative?',
    }
    user_message = {'role': 'user', 'content': text}
    label_token_bias = {
        '31587': 100,    # 31587 is the token for `positive`
        '43324': 100,    # 43324 is the token for `negative`
    }
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=[system_message, user_message],
        logit_bias=label_token_bias,
        max_tokens=1,
        temperature=0,
    )
    return response.choices[0].message.content

In [12]:
# Try the classifier on the single example sentence.
my_classifier(sent)

'positive'

In [20]:
# Label stored for the example sentence, for comparison with the prediction.
label

'positive'

In [14]:
import time

In [15]:
# Classify the first 100 validation sentences, one API call per sentence.
# A failed call is recorded as NaN so that preds stays aligned with sents
# and the gaps can be located afterwards with pd.isna. The 5-second pause
# after a failure is presumably a back-off before the next request.
preds = []
for sent in sents[:100]:
    try:
        preds.append(my_classifier(sent))
    except Exception:
        # float("nan") rather than np.nan: numpy is only imported in a later
        # cell, so np.nan would raise NameError here on a fresh kernel.
        # Catching Exception instead of using a bare except keeps a kernel
        # interrupt (KeyboardInterrupt) able to stop the loop.
        preds.append(float("nan"))
        time.sleep(5)

In [16]:
# Compare how many predictions were collected with how many labels exist.
len(preds), len(labels)

(100, 872)

In [17]:
import pandas as pd

In [18]:
# Positions in preds that hold a missing value instead of a predicted label.
missing = [position for position in range(len(preds)) if pd.isna(preds[position])]
missing

[]

In [19]:
import numpy as np

# Accuracy over the scored examples: the fraction of predictions that equal
# the label at the same position.
is_correct = [labels[position] == prediction for position, prediction in enumerate(preds)]
np.mean(is_correct)

0.94

In [21]:
# Gather sentence, label and prediction into one table, one row per scored
# sentence. The slice length is taken from preds instead of repeating the
# literal 100, so the columns stay the same length if the number of scored
# sentences changes.
n_scored = len(preds)
results = pd.DataFrame(
    {
        "sentence": sents[:n_scored],
        "label": labels[:n_scored],
        "predictions": preds,
    }
)

In [22]:
# Preview the first ten rows of the results table.
results.head(10)

Unnamed: 0,sentence,label,predictions
0,it 's a charming and often affecting journey .,positive,positive
1,unflinchingly bleak and desperate,negative,negative
2,allows us to hope that nolan is poised to emba...,positive,positive
3,"the acting , costumes , music , cinematography...",positive,positive
4,"it 's slow -- very , very slow .",negative,negative
5,although laced with humor and a few fanciful t...,positive,positive
6,a sometimes tedious film .,negative,negative
7,or doing last year 's taxes with your ex-wife .,negative,negative
8,you do n't have to know about music to appreci...,positive,positive
9,"in exactly 89 minutes , most of which passed a...",negative,negative


In [24]:
# Save the results table as CSV without the index column. The output
# directory is created first, because writing fails if ./outputs is missing.
from pathlib import Path

Path("./outputs").mkdir(parents=True, exist_ok=True)
results.to_csv("./outputs/SST2_dataset_results.csv", index=False)