# Generating Chat GPT Predictions

## Imports and Constants

In [1]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import openai
import os
import pandas as pd
import time

In [3]:
# TODO: import these from token.json when we move this notebook to our github code

openai.organization = "REDACTED"
openai.api_key = "REDACTED"

In [4]:
TRAIN_FILE = "full_train.csv"
TEST_FILE = "full_test.csv"

In [68]:
ZERO_LABEL_KEYWORD = "real"
ONE_LABEL_KEYWORD = "fake"

In [5]:
MAXIMUM_NUM_CHAT_GPT_MESSAGES = 2048 # maximum number of messages
NUM_REQUIRED_CHAT_GPT_MESSAGES = 2 # number of structuring messages we must include to Chat-GPT


MAX_TRAIN_ROWS = (MAXIMUM_NUM_CHAT_GPT_MESSAGES - NUM_REQUIRED_CHAT_GPT_MESSAGES) // 150

## Mounting Google Drive

In [6]:
GOOGLE_DRIVE_MOUNT_PATH_PREFIX = '/content/drive'
MY_CS152_DATA_FILE_PATH = "drive/MyDrive/Senior/SenSpr/CS152/CS152 Group Project/Milestone 3/Code/Data/" # NOTE: you have to modify this to fit wherever the CS152 Group Project/Milestone 3/Code/Data is in your Google Drive

In [7]:
from google.colab import drive
drive.mount(GOOGLE_DRIVE_MOUNT_PATH_PREFIX)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
cd $MY_CS152_DATA_FILE_PATH

/content/drive/MyDrive/Senior/SenSpr/CS152/CS152 Group Project/Milestone 3/Code/Data


## Reading in data

In [9]:
train_df = pd.read_csv(TRAIN_FILE)
test_df = pd.read_csv(TEST_FILE)

In [10]:
print(len(train_df))
print(len(test_df))

8531
2133


## Transforming data for input to Chat GPT

In [11]:
# reference structure
response = openai.ChatCompletion.create(
model="gpt-4",
messages=[
{"role": "system", "content": "You are a content moderation system. Classify each input as either threatening or not-threatening."},
{"role": "user", "content": "I am going to kill you."},
{"role": "assistant", "content": "Threatening"},
{"role": "user", "content": "I love you"},
{"role": "assistant", "content": "Not-threatening"},
{"role": "user", "content": "You're a great"}]
)

output = response['choices'][0]['message']['content']
print(output)

Not-threatening


In [45]:
messages = [{"role": "system", "content": "You are a content moderation system. Classify input as either 'real' or 'fake'. Do not use more than one word."}]

### Training Data Prompts

In [46]:
for index, row in train_df.head(MAX_TRAIN_ROWS).iterrows():
  messages.append({"role": "user", "content": f"{row['text']}"})
  messages.append({"role": "assistant", "content": f"{row['label']}"})

In [47]:
print(len(messages))

27


### Generate Test Predictions

In [48]:
# we now only pass through one at a time
# messages.append({"role": "user", "content": "Now generate labels for each of the following messages."})

In [49]:
idx_to_prediction = {}

In [57]:
while len(idx_to_prediction) != len(test_df):
  print(len(idx_to_prediction))

  try:
    for index, row in test_df.iterrows():
      if index in idx_to_prediction:
        continue
      elif index == 1996:
        continue
      new_messages = messages[:]
      new_messages.append({"role": "user", "content": f"{row['text']}"})  

      response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=new_messages
      )
      
      idx_to_prediction[index] = response['choices'][0]['message']['content']
  except Exception as e:
    print(e)
    time.sleep(1)

1996
<empty message>
1996
<empty message>
1996
<empty message>
1996
<empty message>
1996
<empty message>
1996
<empty message>
1996
<empty message>


KeyboardInterrupt: ignored

In [58]:
print(len(idx_to_prediction))

1996


In [61]:
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=new_messages
)

AuthenticationError: ignored

## Parse Output

In [32]:
import json

# with open('gpt_preds_2.json', 'w') as fp:
#     json.dump(idx_to_prediction, fp)

In [42]:
old_preds = json.loads("gpt_preds.json")

JSONDecodeError: ignored

In [62]:
# get the set of predictions created by gpt
output_results = set(idx_to_prediction.values())
print(len(output_results))
for output in list(output_results)[:10]:
  print(output)

70
It is recommended to consult with a medical professional to determine an appropriate treatment plan based on individual health factors and the severity of COVID-19 symptoms.
This input is unclear and seems to be incomplete. Can you please provide more context so that I can classify it accurately?
I'm sorry but the statement is not making complete sense and it's difficult to determine if it is real or fake. Can you please clarify or provide more context?
It's a controversial statement and the concept of herd immunity has not been conclusively established for COVID-19. Most experts agree that the best way to slow the transmission of the virus is through measures like social distancing, mask-wearing, and widespread testing and contact tracing.
The information about 32 people in a single bus being diagnosed with Hantavirus in China is real, but it has no relation with COVID-19. Hantavirus is a type of virus found in rats and mice that can cause a deadly disease called Hantavirus Pulmona

In [63]:
def clean_pred(pred):
  cleaned = pred.lower()
  cleaned = pred.strip()
  cleaned = ''.join([i for i in cleaned if i.isalpha()])
  return cleaned

In [64]:
idx_to_cleaned_predict = {idx: clean_pred(pred) for idx, pred in idx_to_prediction.items()}

In [65]:
print(idx_to_cleaned_predict)

{0: 'real', 1: 'real', 2: 'fake', 3: 'real', 4: 'fake', 5: 'real', 6: 'real', 7: 'real', 8: 'real', 9: 'real', 10: 'real', 11: 'fake', 12: 'real', 13: 'real', 14: 'real', 15: 'real', 16: 'real', 17: 'fake', 18: 'real', 19: 'real', 20: 'real', 21: 'real', 22: 'real', 23: 'fake', 24: 'fake', 25: 'real', 26: 'fake', 27: 'fake', 28: 'fake', 29: 'real', 30: 'real', 31: 'real', 32: 'real', 33: 'real', 34: 'fake', 35: 'fake', 36: 'real', 37: 'real', 38: 'real', 39: 'real', 40: 'fake', 41: 'real', 42: 'fake', 43: 'real', 44: 'fake', 45: 'real', 46: 'fake', 47: 'real', 48: 'real', 49: 'real', 50: 'fake', 51: 'fake', 52: 'real', 53: 'real', 54: 'real', 55: 'real', 56: 'fake', 57: 'real', 58: 'real', 59: 'real', 60: 'fake', 61: 'Real', 62: 'real', 63: 'real', 64: 'fake', 65: 'real', 66: 'real', 67: 'fake', 68: 'real', 69: 'real', 70: 'real', 71: 'fake', 72: 'real', 73: 'real', 74: 'real', 75: 'real', 76: 'real', 77: 'fake', 78: 'real', 79: 'real', 80: 'real', 81: 'real', 82: 'real', 83: 'real', 8

In [92]:
idx_to_binary_label = {}

for idx, pred in idx_to_cleaned_predict.items():
  if pred == ZERO_LABEL_KEYWORD:
    idx_to_binary_label[idx] = 0
  elif pred == ONE_LABEL_KEYWORD:
    idx_to_binary_label[idx] = 1
  else:
    idx_to_binary_label[idx] = 0.5 # ambiguous response

In [93]:
np_test_labels = test_df[["label"]].to_numpy().flatten()

In [94]:
print(np_test_labels.shape)

(2133,)


In [95]:
full_np_preds = np.full(np_test_labels.shape, None)

In [96]:
for idx, label in idx_to_binary_label.items():
  full_np_preds[idx] = label

In [97]:
from collections import Counter

print(Counter(full_np_preds))

Counter({0: 1462, 1: 462, None: 137, 0.5: 72})


In [98]:
full_np_preds[full_np_preds == None] = -1

In [99]:
from collections import Counter

print(Counter(full_np_preds))

Counter({0: 1462, 1: 462, -1: 137, 0.5: 72})


In [100]:
np.savetxt("gpt_preds.csv", full_np_preds, delimiter=",")