# OpenAI's GPT 3.5

In [20]:
import csv
import os
import re
import time
from ast import literal_eval

import numpy as np
import openai
import pandas as pd
import tiktoken
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import trange

In [21]:
# Relative locations of the three STS split files (train / dev / test).
path_train, path_dev, path_test = (
    'data/sts-train.csv',
    'data/sts-dev.csv',
    'data/sts-test.csv',
)

In [22]:
# Names assigned, in file order, to the first seven fields of every STS row.
columns = [
    'genre',
    'file',
    'year',
    'index',
    'score',
    'sentence1',
    'sentence2',
]

In [23]:
# All three splits share one layout, so they are parsed with identical options:
# tab-separated, no header row, only the first seven fields kept, and quote
# characters treated as ordinary text (csv.QUOTE_NONE).
sts_read_options = dict(
    sep='\t',
    usecols=range(7),
    header=None,
    quoting=csv.QUOTE_NONE,
    names=columns,
    encoding='UTF-8',
)
df_train = pd.read_csv(path_train, **sts_read_options)
df_dev = pd.read_csv(path_dev, **sts_read_options)
df_test = pd.read_csv(path_test, **sts_read_options)

# Pre-processing

In [24]:
def pre_processing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of an STS frame; the frame passed in is not modified.

    Cleaning steps:
      * ``genre``: every ``'main-'`` substring is removed, then the exact
        value ``'forum'`` is renamed to ``'forums'``.
      * ``year``: every non-digit character is removed
        (e.g. ``'2012test'`` becomes ``'2012'``).
      * ``score``: min-max scaled to the range [0, 1] using the minimum and
        maximum found in *this* frame.

    NOTE(review): because the scaling bounds come from the frame itself, the
    result equals ``score / 5`` only when the frame contains both a 0 and a 5.
    The final evaluation divides predictions by 5, so confirm that every split
    spans the full 0-5 range.

    Args:
        df: frame with at least the columns ``genre``, ``year`` and ``score``.

    Returns:
        A new DataFrame with the same columns, column order and index as ``df``.
    """
    score = df['score']
    lowest = score.min()
    span = score.max() - lowest
    # A constant (or empty / all-NaN) column has no usable range; dividing by 1
    # maps a constant column to 0.0 instead of producing a division by zero.
    if not span > 0:
        span = 1.0

    # assign() builds a new frame, so re-running a cell never re-processes
    # data that an earlier cell already displayed or cleaned.
    return df.assign(
        genre=df['genre'].replace('main-', '', regex=True).replace('forum', 'forums'),
        year=df['year'].replace(r'\D', '', regex=True),
        score=(score - lowest) / span,
    )

In [25]:
# Clean every split with the same routine; each name is rebound to its cleaned frame.
df_train, df_dev, df_test = (
    pre_processing(split) for split in (df_train, df_dev, df_test)
)
# Train and dev stacked into a single frame with a fresh 0..n-1 index.
df_train_dev = pd.concat([df_train, df_dev], ignore_index=True)

In [26]:
def evaluate(scores: np.ndarray, amount=len(df_test)) -> None:
    """Print R2, MAE and RMSE of predictions against the first `amount` test scores.

    Args:
        scores: predicted similarity scores, one per test row, in test-set row
            order and on the same scale as ``df_test['score']``.
        amount: number of leading rows of ``df_test`` to compare against.
            The default is computed once, when this cell is executed; if
            ``df_test`` is replaced afterwards, re-run this cell or pass
            ``amount`` explicitly.

    Returns:
        None. The three metrics are printed with five decimals.
    """
    actual_scores = df_test['score'].iloc[:amount].to_numpy()
    # The `squared=False` argument of mean_squared_error is deprecated and was
    # removed in recent scikit-learn releases; taking the square root of the
    # MSE gives the same RMSE on every version.
    rmse = np.sqrt(mean_squared_error(actual_scores, scores))
    print(f"R2: {r2_score(actual_scores, scores):.5f}")
    print(f"MAE: {mean_absolute_error(actual_scores, scores):.5f}")
    print(f"RMSE: {rmse:.5f}")

# Asking ChatGPT

In [27]:
# Read the key from the environment so that it is never stored in the notebook
# (or in its saved outputs). Export OPENAI_API_KEY before starting Jupyter; a
# missing variable fails here with a KeyError rather than on the first request.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [28]:
# Opening part of the prompt: states the task and the 0-5 range of the score.
# NOTE(review): despite the name, this text is concatenated into the single
# "user" message built by perform_request; it is not sent with a "system" role.
# The wording is left untouched because editing it changes the prompt the model receives.
system_content = """
Your task is to compute the similarity of multiple pair of sentences.
The similarity is a real number that takes range from 0 to 5.
"""

In [29]:
# Prompt section describing what each whole-number score from 0.0 to 5.0 means.
# The text tells the model these are guidelines, not labels, so any real value
# in the range is an acceptable answer.
score_guidelines = """
To determine the score, follow the guidelines delimited by triple backticks. Note that they are not labels, but guidelines. You can answer with any score as long as it is a real number and takes range from 0.0 to 5.0.
```
- Score 5.0: the two sentences are completely equivalent, as they mean the same thing.
- Score 4.0: the two sentences are mostly equivalent, but some unimportant details differ.
- Score 3.0: the two sentences are roughly equivalent, but some important information differs/missing.
- Score 2.0: the two sentences are not equivalent, but share some details.
- Score 1.0: the two sentences are not equivalent, but are on the same topic.
- Score 0.0: the the two sentences are completely dissimilar.
```
"""

In [30]:
# Prompt section showing the layout in which sentence pairs are presented:
# a "[Pair k]" header followed by the two quoted sentences, one per line.
# concat_sentences produces its pairs in this same layout.
input_format = """
You will receive the multiple pair of sentences in the following format, delimited by triple backticks:
```
[Pair 1]
- "First sentence of the first pair."
- "Second sentence of the first pair."

[Pair 2]
- "First sentence of the second pair."
- "Second sentence of the second pair."

[Pair 3]
- "First sentence of the third pair."
- "Second sentence of the third pair."
```
"""

In [31]:
# Prompt section with three worked examples (sentence pair plus its score)
# that show the model the expected scoring behaviour.
few_shots_examples = """
The following are few-shot sample pairs of sentences and their similarity score:

Example 1:
- "A cat is playing a piano."
- "A man is playing a guitar."
Score: 0.6

Example 2:
- "Runners race around a track."
- "Runners compete in a race."
Score: 3.2

Example 3:
- "A person is slicing a tomato."
- "A person is slicing some meat."
Score: 1.75
"""

In [32]:
# Prompt section asking for the answer as one list holding a score per pair,
# in the same order as the pairs were given, with a sample answer.
output_format = """
Your answer must contain a a list of scores that has one element for each pair received. The first element of the list must be the similarity score of the first pair of sentences, the second element must be the similarity score of the second pair of sentences, and so on and so forth.

An example of an answer is delimited in triple backticks below:
```[ 2.7, 3.2, 1.3 ]```

"""

In [33]:
def concat_sentences(start: int, amount: int = 15, df: "pd.DataFrame | None" = None) -> str:
    """Build the prompt section listing the sentence pairs to be scored.

    Args:
        start: zero-based row position of the first pair to include.
        amount: maximum number of consecutive pairs to include.
        df: frame with ``sentence1`` and ``sentence2`` columns to read the
            pairs from. Defaults to the notebook-level ``df_test``.

    Returns:
        A header line followed by one ``[Pair k]`` block per selected row,
        with ``k`` counting from 1. When fewer than ``amount`` rows remain
        after ``start``, only the remaining rows are included.

    Raises:
        ValueError: if ``start`` is negative.
    """
    if start < 0:
        raise ValueError(f"start must be non-negative, got {start}")

    source = df_test if df is None else df
    # Positional slicing does not depend on the index labels of the frame and
    # stops at the last row instead of failing on a short final batch.
    rows = source.iloc[start:start + amount]

    pair_blocks = [
        f'\n[Pair {number}]\n- "{first}"\n- "{second}"\n'
        for number, (first, second) in enumerate(zip(rows['sentence1'], rows['sentence2']), start=1)
    ]
    header = "The following are the sentences you need to compute the similarity score for:"
    return header + "".join(pair_blocks)

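GPT-3.5 (`gpt-3.5-turbo`) can handle at most 4096 tokens in a single request, and that budget is shared by the prompt and the model's answer.
We therefore use a tokenizer to count the tokens of the prompt and make sure it stays below the limit, with some room left for the answer.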

In [34]:
# Tokenizer that tiktoken associates with "gpt-3.5-turbo"; used to count the
# tokens of a prompt before it is sent.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [35]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send `prompt` as a single user message and return the reply text.

    The request is made with ``temperature=0``. Only the content of the
    first returned choice is used.
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]

In [36]:
def perform_request(sentences: str) -> list[float]:
    """Ask the model to score the given sentence pairs and return the scores.

    The full prompt is the concatenation of the fixed prompt sections
    (task, guidelines, input format, few-shot examples, output format)
    followed by `sentences`.

    Args:
        sentences: prompt section listing the pairs to score.

    Returns:
        The scores found in the model's reply, converted to ``float``, in the
        order the model gave them.

    Raises:
        Exception: if the prompt is longer than 3900 tokens.
        ValueError: if the reply contains no bracketed list, or the list holds
            something that is not a number.
        SyntaxError: if the bracketed text is not a valid Python literal.
    """
    # Concatenate all the prompt sections into a single message
    prompt = system_content + score_guidelines + input_format + few_shots_examples + output_format + sentences

    # 3900 leaves some headroom below the 4096-token limit for the model's answer
    if len(tokenizer.encode(prompt)) > 3900:
        raise Exception("The number of tokens in the message exceeds the limit.")

    response = get_completion(prompt)

    # The prompt shows the sample answer wrapped in triple backticks, so the
    # reply may come back wrapped the same way or surrounded by extra text.
    # Parse only the first flat "[...]" span instead of the whole reply.
    list_match = re.search(r"\[[^\[\]]*\]", response)
    if list_match is None:
        raise ValueError(f"No list of scores found in the model response: {response!r}")

    parsed = literal_eval(list_match.group(0))
    return [float(score) for score in parsed]

In [37]:
# Predicted scores collected across requests, in test-set order.
# Re-running this cell discards every prediction gathered so far.
pred_scores: list[float] = []

In [50]:
# Resume from the first sentence that has no prediction yet. On a fresh kernel
# this is 0; after an interrupted run, re-executing the cell continues where it
# stopped instead of appending duplicate scores from the beginning.
start_sentence = len(pred_scores)
# NOTE(review): only the first 1365 test sentences are scored; if df_test has
# more rows, the remainder is never evaluated. Confirm this is intended.
n_sentences = 1365
sentences_per_request = 15

for i in trange(start_sentence, n_sentences, sentences_per_request):
    # The last batch may be shorter when n_sentences is not a multiple of the batch size.
    batch_size = min(sentences_per_request, n_sentences - i)
    message = concat_sentences(i, amount=batch_size)
    response = perform_request(message)
    print(f"Similarity of sentences from {i + 1} to {i + batch_size}: {response}")
    # A reply with the wrong number of scores would shift every later
    # prediction against its gold score, so stop before storing it.
    if len(response) != batch_size:
        raise Exception("The number of scores returned is not equal to the number of sentences.")
    pred_scores.extend(response)
    # Pause between requests (presumably to stay under the API rate limit).
    time.sleep(5)

  0%|          | 0/13 [00:00<?, ?it/s]

Similarity of sentences from 1171 to 1185: [3.5, 4.5, 3.5, 4.5, 1.0, 1.0, 4.5, 2.5, 4.0, 0.5, 1.0, 4.5, 2.0, 2.5, 1.0]
Similarity of sentences from 1186 to 1200: [4.0, 4.5, 4.0, 3.5, 4.5, 3.5, 4.0, 2.5, 2.0, 1.5, 1.0, 4.5, 4.5, 3.5, 4.0]
Similarity of sentences from 1201 to 1215: [3.5, 2.5, 4.5, 4.5, 4.8, 4.0, 5.0, 3.0, 1.5, 2.8, 3.5, 5.0, 4.5, 2.0, 4.5]
Similarity of sentences from 1216 to 1230: [4.5, 2.5, 3.5, 3.8, 2.2, 2.5, 3.8, 3.5, 3.2, 2.8, 3.5, 4.5, 4.5, 4.5, 3.5]
Similarity of sentences from 1231 to 1245: [4.0, 3.5, 3.5, 4.2, 2.5, 3.8, 3.0, 2.0, 0.5, 1.5, 2.0, 2.8, 3.5, 2.5, 1.0]
Similarity of sentences from 1246 to 1260: [4.5, 4.2, 1.2, 0.8, 4.5, 4.5, 4.0, 4.5, 1.5, 0.5, 4.8, 2.5, 4.8, 3.5, 4.5]
Similarity of sentences from 1261 to 1275: [3.5, 2.8, 4.5, 1.2, 3.8, 3.5, 4.5, 4.0, 1.0, 1.5, 3.5, 4.0, 2.0, 4.5, 4.2]
Similarity of sentences from 1276 to 1290: [2.5, 1.8, 4.5, 1.2, 4.2, 1.0, 1.5, 2.5, 4.0, 1.5, 4.5, 4.5, 1.5, 1.0, 2.5]
Similarity of sentences from 1291 to 1305: [4.5,

In [51]:
# Sanity check: size of the last response and total number of predictions collected.
len(response), len(pred_scores)

(15, 1365)

In [52]:
# The model answers on the 0-5 scale requested in the prompt, while the gold
# scores were rescaled during pre-processing; dividing by 5 puts the
# predictions on a 0-1 scale before they are compared.
scaled_predictions = np.asarray(pred_scores) / 5
evaluate(scaled_predictions, amount=len(pred_scores))

R2: 0.50762
MAE: 0.16446
RMSE: 0.21347
