In [1]:
import pandas as pd
import jsonlines as jsonl
from sklearn.metrics import classification_report

from openai import OpenAI
import openai
import os

import HelperChan
import GPTHelper


We start by acquiring and formatting our data to be used for fine tuning.

In [2]:
# Read in labeled posts
labeled_posts = pd.concat(
    [
        pd.read_csv('../data/first_pass_labels.csv', index_col='Unnamed: 0'),
        pd.read_csv('../data/lgbt_week_2_classified.csv', index_col='Unnamed: 0'),
        pd.read_csv('../data/lgbt_week_3_classified.csv', index_col='Unnamed: 0')
    ],
    ignore_index=True
).drop_duplicates('id')

# Read in all posts to allow for backref
all_posts = pd.concat(
    [
        pd.read_csv('../data/lgbt_week_1.csv'), pd.read_csv('../data/lgbt_week_2.csv'), pd.read_csv('../data/lgbt_week_3.csv')
    ], ignore_index=True
).drop_duplicates('id')

# Backref!!
labeled_posts['backref'] = labeled_posts['content'].apply(lambda x: HelperChan.content_with_back_reference(str(x), all_posts))

# Recommended lower bound for significant performance improvement according to OpenAI
train = labeled_posts.sample(n=50)
test = labeled_posts.drop(index=train.index)

messages = GPTHelper.create_example_messages(train, content_col='backref')

messages[:5]



[{'messages': [{'role': 'system',
    'content': 'You are a classifier which reads the raw content of posts from 4chan\'s /lgbt/ board, and says whether or not they discuss trans people or trans-related topics. If the post is about trans people, respond "Yes". Otherwise, respond "No".'},
   {'role': 'user',
    'content': '\n        Does the following message discuss trans people or trans-related topics? Yes or No.\n        \n        MESSAGE: <ref>">>34590225You have gender dysphoria, which transgenders have. You literally told a friend who affirmed you and then you got sad because you aren\'t a girl (yet).You want a womb and to not be seen as male. You think having a male body is gross. You feel bad being seen as a man.You are trans. This this is literally textbook levels type shit."</ref>\n:(\n        '},
   {'role': 'assistant', 'content': 'Yes'}]},
 {'messages': [{'role': 'system',
    'content': 'You are a classifier which reads the raw content of posts from 4chan\'s /lgbt/ board,

In [3]:
# Write our messages to a jsonl
with jsonl.open('../data/fine_tuning/output.jsonl', mode='w') as writer:
    for message in messages:
        writer.write(message)

In [4]:
openai.api_key = os.environ['OPENAI_API_KEY']

client = OpenAI(
    api_key=openai.api_key.strip()
)

# client.files.create(
#     file=open('../data/fine_tuning/output.jsonl', 'rb'),
#     purpose='fine-tune'
# )

The following code chunk was used to generate the fine tuning job. However, I *really* don't want to risk re-running the job, so it's commented out.

In [5]:
# client.fine_tuning.jobs.create(
#      training_file='file-gT3eiC36253G1jdgP8b2I4GM',
#      model='gpt-3.5-turbo-0125',
#      hyperparameters={
#          'n_epochs':3
#      }
#)

SyntaxError: unmatched ')' (4206112675.py, line 7)

We now get both base GPT and tuned GPT to label our posts.

In [6]:
def create_prompt_messages(posts, content_col='content', system_role=None):
    '''
    Given a dataframe of 4chan posts with content in their content_col,
    creates prompt messages for the posts.

    Inputs:
      posts (DataFrame): posts
      content_col (str): name of the column containing content. Defaults to 'content'
      system_role (str): name of the system_role, uses a prompt for classifiyn g4chan posts if None. Defautls None.

    Returns: list of prompt messages for GPT. 
    '''

    messages = []
    system_role = 'You are a classifier which reads the raw content of posts from 4chan\'s /lgbt/ board, and says whether or not they discuss trans people or trans-related topics. If the post is about trans people, respond "Yes". Otherwise, respond "No".'    
    for post in posts[content_col]:

        user_prompt = \
        f'''
        Does the following message discuss trans people or trans-related topics? Yes or No.
        
        MESSAGE: {post}
        '''

        message = \
            [
                {'role': 'system', 'content':system_role},
                {'role': 'user', 'content':user_prompt}
            ]
        
        messages.append(message)
    
    return messages

def call_gpt_on_prompts(client, prompts, model='gpt-3.5-turbo-0125'):
    '''
    Call GPT on a list of prompts and return a list of responses.
    
    Inputs:
      client (OpenAI): the client to call
      prompts (list): list of prompts
      model (str): list of dictionary prompts.

    Returns: list of completion messages
    '''
    responses = []
    for message in prompts:
        response = client.chat.completions.create(
            model=model,
            messages=message
        )
        responses.append(response.choices[0].message.content)
    
    return responses

In [None]:
prompts = create_prompt_messages(test, content_col='backref')

#untuned_resps = call_gpt_on_prompts(client, prompts)

Save our work in case we're cursed

In [None]:
test['untuned'] = untuned_resps

#test.to_csv('../data/test_with_untuned.csv')

In [None]:
#tuned_resps = call_gpt_on_prompts(client, prompts, model='ft:gpt-3.5-turbo-0125:personal::8zCxjHNl')

In [None]:
#test['tuned'] = tuned_resps

#test.to_csv('../data/test_with_both.csv')

In [None]:
test['untuned'] = test['untuned'].apply(lambda x: 1 if x=='Yes' else 0)
test['tuned'] = test['tuned'].apply(lambda x: 1 if x=='Yes' else 0)

In [None]:
test.head()

Unnamed: 0,subject,id,author,date,time,content,clean_content,refs,urls,classification,backref,untuned,tuned
0,post suifuel,34577209,Anonymous,2/7/2024,19:35:44,">>34571853If they stop after the first one, th...","\r\nIf they stop after the first one, they loo...",['>>34571853'],[],1,<ref>'How do these plastic surgery addicts loo...,1,1
1,34604558,34605860,)*Kassandra of Ellaphae|PSO2,2/10/2024,2:34:13,>>34605630thought someone might like that :3i ...,\r\nthought someone might like that :3i dont n...,['>>34605630'],[],0,<ref>'>>34605584that’s pretty hot actually'</r...,0,0
2,34606802,34611390,Anonymous,02/10/24,15:09:03,>>34606802>Is being a transbian bad?yesthey're...,\r\n>Is being a transbian bad?yesthey're all d...,['>>34606802'],[],1,"<ref>""Is being a transbian bad? Whenever I tel...",1,1
3,/chasergen/ black hole edition,34593140,Anonymous,2/9/2024,2:23:20,why do deranged transbians not stick to their ...,why do deranged transbians not stick to their ...,[],[],1,why do deranged transbians not stick to their ...,1,1
4,/mtfg/ huh?,34557268,"A.G.P. pilot ""naz"" Nullifier(...)",2/6/2024,1:44:31,heathers really resonates with mehttps://youtu...,heathers really resonates with mehttps://youtu...,[],['/watch?v=LIHPNqYiPSMalmost'],0,heathers really resonates with mehttps://youtu...,0,0


In [None]:
tuned_accuracy = (sum(test['tuned'] == test['classification'])) / len(test)
untuned_accuracy = (sum(test['untuned'] == test['classification'])) / len(test)

tuned_accuracy, untuned_accuracy

(0.88, 0.8622222222222222)

In [None]:
print(classification_report(test['classification'], test['tuned'], labels=[0,1]))

              precision    recall  f1-score   support

           0       0.87      0.92      0.89       244
           1       0.90      0.83      0.86       206

    accuracy                           0.88       450
   macro avg       0.88      0.88      0.88       450
weighted avg       0.88      0.88      0.88       450



In [None]:
print(classification_report(test['classification'], test['untuned'], labels=[0,1]))

              precision    recall  f1-score   support

           0       0.84      0.92      0.88       244
           1       0.89      0.80      0.84       206

    accuracy                           0.86       450
   macro avg       0.87      0.86      0.86       450
weighted avg       0.86      0.86      0.86       450



That is.. notably better than a supervised model performs. 

Guess that means I'm using GPT!

In [10]:
import warnings
from tqdm.auto import tqdm
tqdm.pandas()

with warnings.catch_warnings(action="ignore"):
    all_posts['backref'] = all_posts['content'].progress_apply(
        lambda x: HelperChan.content_with_back_reference(str(x), labeled_posts))

to_label = all_posts.sample(n=4250).reset_index(drop=True)

prompts = create_prompt_messages(to_label, content_col='backref')

100%|██████████| 186635/186635 [00:36<00:00, 5151.90it/s]


In [11]:
resps = call_gpt_on_prompts(client, prompts[:5], model='gpt-3.5-turbo-0125')

In [12]:
resps = call_gpt_on_prompts(client, prompts, model='gpt-3.5-turbo-0125')

KeyboardInterrupt: 

In [None]:
to_label['gpt_classification'] = resps
to_label.to_csv('../data/half_corpus_classified.csv', index=False)