In [48]:
from openai import OpenAI
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

In [8]:
# The OpenAI client is created without an explicit key, so fail early (with an
# actionable message) when OPENAI_API_KEY is missing from the environment.
assert 'OPENAI_API_KEY' in os.environ, (
    "OPENAI_API_KEY is not set; export it before running this notebook."
)
client = OpenAI()

In [45]:
# batched queries to the OpenAI moderation API
def modAPI(text=["Sample text goes here.", "Sample text again goes here."]):
    """Moderate a batch of texts with a single moderation request.

    Args:
        text: Texts passed as ``input`` to ``client.moderations.create``.

    Returns:
        A list with one ``(flagged, categories, category_scores)`` tuple per
        entry of the response's ``results``, in the order they were returned.
    """
    api_response = client.moderations.create(input=text)
    return [
        (result.flagged, result.categories, result.category_scores)
        for result in api_response.results
    ]

In [21]:
# Load the four per-leaning CSV files, keeping only the 'body' and 'flagged' columns.
far_left, far_right, left, right = (
    pd.read_csv(csv_path)[['body', 'flagged']]
    for csv_path in (
        './data/clean_combined_far-left.csv',
        './data/clean_combined_far-right.csv',
        './data/clean_combined_left.csv',
        './data/clean_combined_right.csv',
    )
)

In [22]:
# Peek at the first five rows to sanity-check the loaded columns.
far_right.head(n=5)

Unnamed: 0,body,flagged
0,This is a good tactic. People are moved in dif...,1
1,Mussolini was seen as a great leader by all th...,1
2,[Multiculturalism for Israel!](,1
3,Mussolini was seen as a great leader by all th...,1
4,&gt; indoctrination coming from the guy who w...,1


In [23]:
# Collect the rows with flagged == 1 on each side of the spectrum and save them.
# The concatenation keeps each source frame's own row labels.
left_flagged = pd.concat(
    [frame.loc[frame['flagged'] == 1] for frame in (far_left, left)]
)
right_flagged = pd.concat(
    [frame.loc[frame['flagged'] == 1] for frame in (far_right, right)]
)
left_flagged.to_csv('./data/left_flagged.csv')
right_flagged.to_csv('./data/right_flagged.csv')

In [59]:
def modRun(df, batch_size=10):
    """Run ``modAPI`` over every row of ``df`` in fixed-size batches.

    Args:
        df: DataFrame with a ``body`` column holding the texts to moderate.
        batch_size: Number of rows sent per ``modAPI`` call. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        A flat list of the per-text results returned by ``modAPI``, in row order.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    # chunker() returns a generator, so tqdm cannot infer how many batches there
    # are; pass the count (ceil division) to get a percentage and an ETA.
    n_batches = -(-len(df) // batch_size)
    moderations = []
    for chunk in tqdm(chunker(df, batch_size), total=n_batches):
        moderations.extend(modAPI(chunk['body'].tolist()))
    return moderations

In [64]:
def postprocessing(df, moderations):
    """Attach moderation results to ``df`` as extra columns.

    Note that ``df`` is modified in place and the same object is returned.

    Args:
        df: DataFrame to annotate; one row per moderated text.
        moderations: One ``(flagged, categories, scores)`` tuple per row of
            ``df``, in row order (the shape produced by ``modAPI``).

    Returns:
        ``df`` with four added columns: ``moderation`` (the raw tuple),
        ``flagged_run``, ``categories`` and ``scores``; the last two are
        converted to plain dicts.

    Raises:
        ValueError: If ``moderations`` and ``df`` differ in length.
    """
    result_columns = ['flagged_run', 'categories', 'scores']
    if len(moderations) != len(df):
        raise ValueError(
            f"got {len(moderations)} moderation results for {len(df)} rows"
        )
    df['moderation'] = moderations
    # Name the columns explicitly: with an empty frame the constructor would
    # otherwise produce zero columns and the multi-column assignment would fail.
    unpacked = pd.DataFrame(
        df['moderation'].tolist(), index=df.index, columns=result_columns
    )
    df[result_columns] = unpacked
    # dict() is applied to each cell so the stored values are plain dicts.
    df['categories'] = df['categories'].apply(lambda x: dict(x))
    df['scores'] = df['scores'].apply(lambda x: dict(x))
    return df

In [38]:
def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    Args:
        seq: Any object supporting ``len()`` and slicing.
        size: Slice length; also the step between slice start positions.

    Returns:
        A generator yielding ``seq[start:start + size]`` for each start offset.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

In [None]:
# Moderate both flagged sets. left_flagged_moderations must be assigned here:
# the cell that pickles and post-processes the results reads it, and on a fresh
# kernel no other live code defines it. Each call hits the API batch by batch,
# so expect a long run.
left_flagged_moderations = modRun(left_flagged)
right_flagged_moderations = modRun(right_flagged)

In [49]:
# left_flagged_moderations = []
# for chunk in tqdm(chunker(left_flagged,10)):
#     left_flagged_moderations += modAPI(chunk['body'].tolist())

3163it [1:00:36,  1.15s/it]


In [67]:
# Persist the moderation results for both sides: the raw per-text results as a
# pickle, and the post-processed table as a CSV. Building the tuple first means
# a missing result variable fails before any output file is opened.
for side, flagged_df, moderations in (
    ('left', left_flagged, left_flagged_moderations),
    ('right', right_flagged, right_flagged_moderations),
):
    with open(f'./data/{side}_flagged_moderated.pkl', 'wb') as handle:
        pickle.dump(moderations, handle)

    postprocessing(flagged_df, moderations).to_csv(f'./data/{side}_flagged_moderated.csv')