In [150]:
from openai import OpenAI
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

In [151]:
# fail fast if the OpenAI API key has not been exported into the environment,
# then build the shared client used by the moderation helpers below
assert 'OPENAI_API_KEY' in os.environ
client = OpenAI()

In [152]:
# batched queries to the OpenAI moderation API
def modAPI(model, text=["Sample text goes here.", "Sample text again goes here."]):
    """Moderate a batch of texts in a single request.

    Parameters
    ----------
    model : str
        Name of the moderation model, forwarded to ``client.moderations.create``.
    text : list of str
        Texts to moderate; sent together as one ``input``.

    Returns
    -------
    list of tuple
        One ``(flagged, categories, category_scores)`` tuple per entry of
        ``response.results``, in the order the results are returned.
    """
    response = client.moderations.create(input=text, model=model)
    return [
        (result.flagged, result.categories, result.category_scores)
        for result in response.results
    ]

In [153]:
# load the four per-ideology comment files, keeping only the comment text
# ('body') and its existing 'flagged' label
far_left, far_right, left, right = (
    pd.read_csv(csv_path)[['body', 'flagged']]
    for csv_path in (
        './data/clean_combined_far-left.csv',
        './data/clean_combined_far-right.csv',
        './data/clean_combined_left.csv',
        './data/clean_combined_right.csv',
    )
)

In [154]:
# preview the first rows of the far-right frame to sanity-check the load
far_right.head()

Unnamed: 0,body,flagged
0,This is a good tactic. People are moved in dif...,1
1,Mussolini was seen as a great leader by all th...,1
2,[Multiculturalism for Israel!](,1
3,Mussolini was seen as a great leader by all th...,1
4,&gt; indoctrination coming from the guy who w...,1


In [155]:
# gather total flagged results: stack the far-* and moderate frames of each side.
# ignore_index=True gives each combined frame a fresh 0..N-1 index; without it
# both inputs keep their own row labels (each starting at 0), so the combined
# frame -- and any CSV written from it -- would contain duplicate index values.
left_flagged = pd.concat([far_left, left], ignore_index=True)
right_flagged = pd.concat([far_right, right], ignore_index=True)
# left_flagged.to_csv('./data/left_flagged.csv')
# right_flagged.to_csv('./data/right_flagged.csv')

In [156]:
def modRun(df, model='text-moderation-stable', batch_size=10):
    """Moderate every row of ``df['body']`` in fixed-size batches.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'body'`` column holding the texts to moderate.
    model : str, default 'text-moderation-stable'
        Moderation model name passed through to ``modAPI``.
    batch_size : int, default 10
        Number of rows sent per ``modAPI`` call.

    Returns
    -------
    list
        The concatenated results of all ``modAPI`` calls, in row order.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # chunker returns a generator, so tqdm cannot infer the length on its own;
    # passing the batch count lets the bar show percentage and an ETA.
    n_batches = (len(df) + batch_size - 1) // batch_size

    moderations = []
    for chunk in tqdm(chunker(df, batch_size), total=n_batches):
        moderations.extend(modAPI(model=model, text=chunk['body'].tolist()))
    return moderations

In [157]:
def postprocessing(df, moderations):
    """Attach moderation results to ``df`` and unpack them into columns.

    ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; must have as many rows as ``moderations`` has items.
    moderations : list of tuple
        One 3-tuple per row; the elements become the ``flagged_run``,
        ``categories`` and ``scores`` columns respectively.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the added ``moderation``, ``flagged_run``,
        ``categories`` and ``scores`` columns; the last two are converted to
        plain ``dict`` objects.
    """
    def _to_dict(pairs):
        return dict(pairs)

    df['moderation'] = moderations
    # reuse df's own index so the unpacked columns line up row for row
    unpacked = pd.DataFrame(df['moderation'].tolist(), index=df.index)
    df[['flagged_run', 'categories', 'scores']] = unpacked
    for column in ('categories', 'scores'):
        df[column] = df[column].apply(_to_dict)
    return df

In [158]:
def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    Returns a generator yielding ``seq[start:start + size]`` for
    ``start = 0, size, 2*size, ...``; the final slice may be shorter.
    """
    offsets = range(0, len(seq), size)
    return (seq[start:start + size] for start in offsets)

In [163]:
# Moderate every comment in left_flagged (modRun batches the requests).
# The recorded run below took over an hour, so avoid re-running this cell needlessly.
left_flagged_moderations = modRun(df=left_flagged, model='text-moderation-stable')
# right_flagged_moderations = modRun(df=right_flagged, model='text-moderation-stable')

5729it [1:09:24,  1.38it/s]


In [166]:
# Save the raw moderation results first so the output of the API run is kept on disk.
with open('./results/left_flagged_moderated_stable.pkl', 'wb') as handle:
    pickle.dump(left_flagged_moderations, handle)

# Unpack the results into columns on left_flagged (postprocessing modifies it in place) and export to CSV.
postprocessing(left_flagged, left_flagged_moderations).to_csv('./results/left_flagged_stable.csv')

In [165]:
# # reading and processing manually annotated ideology data
# annotated = pd.read_csv('./data/validation_ideology_classifier_explicit_subreddits_comments.csv')
# print(len(annotated))
# del annotated['prediction']
# del annotated['confidence']
# annotated.columns = ['text', 'label_id', 'subreddit']
# # drop rows that do not have an annotation
# # print(pd.unique(annotated['label_id']))
# annotated = annotated.loc[(annotated['label_id']  == '0') | (annotated['label_id']  == '1') | (annotated['label_id']  == '2')]
# label_map = {0: 'left', 1: 'right', 2: 'balanced'}
# annotated['label'] = annotated['label_id'].apply(lambda x: label_map[int(x)])
# annotated.to_csv('./data/processed_annotated_comments.csv')