# Library

In [1]:
from detoxify import Detoxify

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np
import scipy.stats as sts

In [4]:
# personal library
from my_detoxify import detox #, detox_loop, flag_base, flag_unbiased, clean

ModuleNotFoundError: No module named 'my_detoxify'

# Tutorial

In [None]:
# each model takes in either a string or a list of strings
results = Detoxify('original').predict('example text')

In [None]:
results = Detoxify('unbiased').predict(['example text 1','example text 2'])

In [None]:
results = Detoxify('multilingual').predict(['example text','exemple de texte','texto de ejemplo','testo di esempio','texto de exemplo','örnek metin','пример текста'])

In [None]:
# to specify the device the model will be allocated on (defaults to cpu), accepts any torch.device input
model = Detoxify('original', device='cuda')

In [None]:
import pandas as pd

In [None]:
results_df = pd.DataFrame(results).round(8)

In [None]:
results_df

In [None]:
#Get Toxicity Mean
results_df['toxicity'].mean()

In [None]:
#Get all column Mean
results_df.mean()

# Function

In [3]:
def detox_loop(df,model):
    detox = Detoxify(model, device='cuda')
    i = 0
    n = len(df)
    df_res = pd.DataFrame()
    while i < n:
        res = detox.predict(df[i:i+100])
        f = pd.DataFrame(res,df[i:i+100]).round(5)
        df_res = df_res.append(f)
        i = i + 100
    return df_res

In [4]:
# def flag_base(df_res,thresh):
#     [float(i) for i in thresh]
#     df_res['toxicity_flag'] = np.where(df_res['toxicity']>thresh[0],1,0)
#     df_res['severe_toxicity_flag'] = np.where(df_res['severe_toxicity']>thresh[1],1,0)
#     df_res['obscene_flag'] = np.where(df_res['obscene']>thresh[2],1,0)
#     df_res['threat_flag'] = np.where(df_res['threat']>thresh[3],1,0)
#     df_res['insult_flag'] = np.where(df_res['insult']>thresh[4],1,0)
#     df_res['identity_attack_flag'] = np.where(df_res['identity_attack']>thresh[5],1,0)
#     return df_res

In [5]:
# def flag_unbiased(df_res,thresh):
#     [float(i) for i in thresh]
#     df_res['toxicity_flag'] = np.where(df_res['toxicity']>thresh[0],1,0)
#     df_res['severe_toxicity_flag'] = np.where(df_res['severe_toxicity']>thresh[1],1,0)
#     df_res['obscene_flag'] = np.where(df_res['obscene']>thresh[2],1,0)
#     df_res['threat_flag'] = np.where(df_res['threat']>thresh[3],1,0)
#     df_res['insult_flag'] = np.where(df_res['insult']>thresh[4],1,0)
#     df_res['identity_attack_flag'] = np.where(df_res['identity_attack']>thresh[5],1,0)
#     df_res['sexual_explicit_flag'] = np.where(df_res['sexual_explicit']>thresh[6],1,0)
#     return df_res

In [6]:
def clean(df,model,thresh, print_res):
    df=list(df['body'].values.flatten())
    res = detox_loop(df=df,model=model)
    return res
    # if model == 'original':
    #     res = flag_base(df_res=res, thresh = thresh)
    # else:
    #     res = flag_unbiased(df_res=res, thresh = thresh)
    # if print_res == True:
    #     df_mean = res.mean()
    #     return res,df_mean
    # else:
    #     return res

In [20]:
def detox(source_dir,model, thresh, print_res, save_dir):
    #keeping separate pre-post list
    df = pd.read_csv(source_dir)
    df_pre = df[df['post']==0]
    df_post = df[df['post']==1]
    #pre
    if print_res == True:
        pre_res,pre_mean = clean(df=df_pre,model=model, thresh=thresh, print_res = True)
        print(f'Pre: {pre_mean}')
    else:
        pre_res = clean(df=df_pre,model=model, thresh=thresh, print_res = False)
    pre_res = pd.concat([df_pre.reset_index(drop=True),pre_res.reset_index(drop=True)], axis = 1)
    #post
    if print_res == True:
        post_res,post_mean = clean(df=df_post,model=model, thresh=thresh, print_res = True)
        print(f'Pre: {post_mean}')
    else:
        post_res = clean(df=df_post,model=model, thresh=thresh, print_res = False)
    post_res = pd.concat([df_post.reset_index(drop=True),post_res.reset_index(drop=True)], axis = 1)
    if save_dir != False:
        pre_res['post'] = 0
        post_res['post'] = 1
        res = pd.concat([pre_res,post_res], ignore_index= True)
        res.to_csv(save_dir,encoding = 'utf-8-sig')
    return pre_res,post_res

In [18]:
def detox_df(df,model, thresh, print_res, save_dir):
    #keeping separate pre-post list
    if print_res == True:
        res,mean = clean(df=df,model=model, thresh=thresh, print_res = True)
        print(f'Pre: {mean}')
    else:
        res = clean(df=df, model=model, thresh=thresh, print_res = False)
    res = pd.concat([df.reset_index(drop=True),res.reset_index(drop=True)], axis = 1)
    if save_dir != False:
        res.to_csv(save_dir,encoding = 'utf-8-sig')
    return df

In [10]:
def compare(pre_res,post_res,model):
    tox = sts.ttest_ind(pre_res['toxicity'], post_res['toxicity'])
    print(f'Toxicity: {tox}')
    sev_tox = sts.ttest_ind(pre_res['severe_toxicity'], post_res['severe_toxicity'])
    print(f'Severe Toxicity: {sev_tox}')
    obscene = sts.ttest_ind(pre_res['obscene'], post_res['obscene'])
    print(f'Obscene: {obscene}')
    threat = sts.ttest_ind(pre_res['threat'], post_res['threat'])
    print(f'Threat: {threat}')
    insult = sts.ttest_ind(pre_res['insult'], post_res['insult'])
    print(f'Insult: {insult}')
    identity = sts.ttest_ind(pre_res['identity_attack'], post_res['identity_attack'])
    print(f'Identity Attack: {identity}')
    tox_f = sts.ttest_ind(pre_res['toxicity_flag'], post_res['toxicity_flag'])
    print(f'Toxicity Flag: {tox_f}')
    sev_tox_f = sts.ttest_ind(pre_res['severe_toxicity_flag'], post_res['severe_toxicity_flag'])
    print(f'Severe Toxicity Flag: {sev_tox_f}')
    obscene_f = sts.ttest_ind(pre_res['obscene_flag'], post_res['obscene_flag'])
    print(f'Obscene Flag: {obscene_f}')
    threat_f = sts.ttest_ind(pre_res['threat_flag'], post_res['threat_flag'])
    print(f'Threat Flag: {threat_f}')
    insult_f = sts.ttest_ind(pre_res['insult'], post_res['insult'])
    print(f'Insult Flag: {insult_f}')
    identity_f = sts.ttest_ind(pre_res['identity_attack_flag'], post_res['identity_attack_flag'])
    print(f'Identity Attack Flag: {identity}')
    if model == 'unbiased':
        sexual_exp = sts.ttest_ind(pre_res['sexual_explicit'], post_res['sexual_explicit'])
        print(f'Sexual Explicit: {sexual_exp}')
        sexual_exp_f = sts.ttest_ind(pre_res['sexual_explicit_flag'], post_res['sexual_explicit_flag'])
        print(f'Sexual Explicit Flag: {sexual_exp_f}')

# Apply

## r/femaledatingstrategy as focal community

In [44]:
thresh = [.5,.5,.5,.5,.5,.5,.5]
thresh = [float(i) for i in thresh]
fds_base_pre,fds_base_post = detox(source_dir= './data/fds/fds_clean_comments.csv', model = 'original',thresh = thresh, print_res = False, save_dir =  './data/fds/fds_res.csv')

In [None]:
#compare
compare(fds_base_pre,fds_base_post, model = 'original')

In [15]:
thresh = [.5,.5,.5,.5,.5,.5,.5]
fds_add_pre,fds_add_post = detox(source_dir= "E:/gihub-data/redditbots/fds/fds_comments_supl.csv", model = 'original',thresh = thresh, print_res = False, save_dir =  "E:/gihub-data/redditbots/fds/fds_comment_res_add.csv")

### r/Feminism as control

In [43]:
thresh = [.5,.5,.5,.5,.5,.5]
thresh = [float(i) for i in thresh]
feminism_base_pre,feminism_base_post = detox(source_dir='./data/control-fds/feminism_clean.csv', model = 'original',thresh = thresh, print_res = False, save_dir = './data/control-fds/feminism_res.csv')

In [None]:
## additional for r/fds
feminism_base_pre,feminism_base_post = detox(source_dir='./data/control-fds/feminism_clean.csv', model = 'original',thresh = thresh, print_res = False, save_dir = './data/control-fds/feminism_res.csv')

### r/TwoXChromosomes as control

In [46]:
thresh = [.5,.5,.5,.5,.5,.5,.5]
thresh = [float(i) for i in thresh]
twoX_base_pre,twoX_base_post = detox(source_dir='./data/control-fds/TwoXChromosomes_clean.csv', model = 'original',thresh = thresh, print_res = False, save_dir = './data/control-fds/twoX_res.csv')

### r/WitchesVSPatriarchy as control

In [47]:
thresh = [.5,.5,.5,.5,.5,.5,.5]
thresh = [float(i) for i in thresh]
wvsp_base_pre,wvsp_base_post = detox(source_dir='./data/control-fds/wvsp_clean.csv', model = 'original',thresh = thresh, print_res = False, save_dir = './data/control-fds/wvsp_res.csv')

### r/MGTOW as control

In [48]:
thresh = [.5,.5,.5,.5,.5,.5,.5]
thresh = [float(i) for i in thresh]
mgtow_base_pre,mgtow_base_post = detox(source_dir='./data/control-fds/MGTOW_clean.csv', model = 'original',thresh = thresh, print_res = False, save_dir = './data/control-fds/MGTOW_res.csv')

### r/TheRedPill as control

In [49]:
thresh = [.5,.5,.5,.5,.5,.5,.5]
thresh = [float(i) for i in thresh]
trp_base_pre,trp_base_post = detox(source_dir='./data/control-fds/TheRedPill_clean.csv', model = 'original',thresh = thresh, print_res = False, save_dir = './data/control-fds/TheRedPill_res.csv')

### r/TrollXChromosomes as control

In [13]:
thresh = [.5,.5,.5,.5,.5,.5,.5]
thresh = [float(i) for i in thresh]
trollX_base_pre, trollX_base_post = detox(source_dir='./data/control-fds/trollX_clean.csv', model = 'original',thresh = thresh, print_res = False, save_dir = './data/control-fds/trollX_res.csv')

## r/Purple

In [10]:
thresh = [.5,.5,.5,.5,.5,.5,.5]
thresh = [float(i) for i in thresh]
purple = detox(source_dir="E:/gihub-data/redditbots/control-fds/purple.csv", model = 'original',thresh = thresh, print_res = False, save_dir = "E:/gihub-data/redditbots/control-fds/purple_res.csv")

  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.ap

## r/GenderCritical

In [11]:
thresh = [.5,.5,.5,.5,.5,.5,.5]
thresh = [float(i) for i in thresh]
gender = detox(source_dir="E:/gihub-data/redditbots/control-fds/gendercritical.csv", model = 'original',thresh = thresh, print_res = False, save_dir = "E:/gihub-data/redditbots/control-fds/gendercritical_res.csv")

  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.ap

## r/relationship_advice

In [15]:
reladvice = pd.read_csv("E:/gihub-data/redditbots/control-fds/relationshipadvice.csv")

  reladvice = pd.read_csv("E:/gihub-data/redditbots/control-fds/relationshipadvice.csv")


In [16]:
gender1= reladvice.iloc[:4000]

In [19]:
thresh = [.5,.5,.5,.5,.5,.5,.5]
thresh = [float(i) for i in thresh]
reladvice_res = detox_df(reladvice, model = 'original', thresh = thresh, print_res = False, save_dir = "E:/gihub-data/redditbots/control-fds/relationshipadvice_res.csv")

  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.ap

ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

## r/wgotw as focal community

Apr 29 2021

In [21]:
thresh = [.5,.5,.5,.5,.5,.5,.5]
thresh = [float(i) for i in thresh]
wgtow_base_pre, wgtow_base_post = detox(source_dir='./data/wgtow/wgtow_clean_comments.csv', model = 'original',thresh = thresh, print_res = False, save_dir = './data/wgtow/wgtow_res.csv')

## r/WitchesVSPatriarchy as focal community

In [23]:
thresh = [.5,.5,.5,.5,.5,.5,.5]
thresh = [float(i) for i in thresh]
wvsp_base_pre, wvsp_base_post = detox(source_dir='./data/witchesvspatriarchy/wvsp_clean_comments.csv', model = 'original',thresh = thresh, print_res = False, save_dir = './data/witchesvspatriarchy/wvps_res.csv')

## r/purplepill

In [21]:
thresh = [.5,.5,.5,.5,.5,.5,.5]
thresh = [float(i) for i in thresh]
purple_base_pre, purple_base_post = detox(source_dir="E:/gihub-data/redditbots/control-fds/purple_clean.csv", model = 'original', thresh = thresh, print_res = False, save_dir = "E:/gihub-data/redditbots/control-fds/purple_res.csv")

KeyError: 'toxicity'

In [23]:
df = pd.read_csv("E:/gihub-data/redditbots/control-fds/purple_clean.csv")
res = clean(df=df, model = 'original', thresh = thresh, print_res = True)

  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.append(f)
  df_res = df_res.ap

## Addition for

# Manual for reference

#### Before comments

In [None]:
#Import BEFORE data
fds_b4_1mo =  pd.read_csv('./data/fds_comments_before_1mo.csv')


In [None]:
#Leave out all 'removed' and 'deleted' comments
fds_b4_1mo_df = fds_b4_1mo[(fds_b4_1mo['body'] != '[deleted]') & (fds_b4_1mo['body'] != '[removed]') & (fds_b4_1mo['author'] != 'AutoModerator')]

#Flatten to list of string
fds_b4_1mo_body = list(fds_b4_1mo_df['body'].values.flatten())

#### After comments

In [None]:
#Import AFTER data
fds_aft_1mo =  pd.read_csv('./data/fds_comments_after_1mo.csv')

In [None]:
#Leave out all 'removed' and 'deleted' comments
fds_aft_1mo_df = fds_aft_1mo[(fds_aft_1mo['body'] != '[deleted]') & (fds_aft_1mo['body'] != '[removed]') & (fds_aft_1mo['author'] !='AutoModerator')]

#Flatten to list of string
fds_aft_1mo_body = list(fds_aft_1mo_df['body'].values.flatten())

## Cleaning Submission
Concerns:
- Structure of a submission: title, body (`self_text`), images/video link (`domain`)

In [None]:
#Import SUBMISSION BEFORE
fds_b4_1mo_sub = pd.read_csv('./data/fds_subm_before_1mo.csv')

In [None]:
#Number of no content submissions
fds_b4_1mo_sub['selftext'].isnull().sum()

In [None]:
#Percentage of no content submissions
fds_b4_1mo_sub['selftext'].isnull().sum()/len(fds_b4_1mo_sub['selftext'] != '[removed]')*100

In [None]:
# Condition: domain = i.reddit.it OR imugur AND selftext = ""
len(fds_b4_1mo_sub[((fds_b4_1mo_sub['domain'] == 'i.redd.it') | (fds_b4_1mo_sub['domain'] == 'i.imgur.com') |  (fds_b4_1mo_sub['domain'] == 'imgur.com'))  & fds_b4_1mo_sub['selftext'].isnull()]) 

In [None]:
# Condition: domain = i.reddit.it OR imugur. So basically they are the same.
len(fds_b4_1mo_sub[(fds_b4_1mo_sub['domain'] == 'i.redd.it') | (fds_b4_1mo_sub['domain'] == 'i.imgur.com') |  (fds_b4_1mo_sub['domain'] == 'imgur.com')]) 

In [None]:
#Number of image submissions
len(fds_b4_1mo_sub[(fds_b4_1mo_sub['domain'] == 'self.FemaleDatingStrategy')]) 

In [None]:
fds_b4_1mo_sub_df = fds_b4_1mo_sub[fds_b4_1mo_sub['selftext']]

In [None]:
# Save only submissions has written content and not deleted or removed
fds_b4_1mo_sub_test = fds_b4_1mo_sub[(fds_b4_1mo_sub['selftext'].astype(bool)) | (fds_b4_1mo_sub['selftext'] == '[removed]')]

## Set up Model - Baseline

In [None]:
#Set pre-trained model and run on GPU
originalmodel = Detoxify('original', device='cuda')

### Example

In [None]:
#Run the first batch
res = originalmodel.predict(fds_b4_1mo_body[0:100])

In [None]:
#connect with body text
pd.DataFrame(res,fds_b4_1mo_body[0:100]).round(5)

### One month

#### Before

In [None]:
i = 0
n = len(fds_b4_1mo_body)
fds_b4_1mo_body_res_base = pd.DataFrame()
while i < n:
    res = originalmodel.predict(fds_b4_1mo_body[i:i+100])
    f = pd.DataFrame(res,fds_b4_1mo_body_res_base[i:i+100]).round(5)
    fds_b4_1mo_body_res_base = pd.concat([fds_b4_1mo_body_res_base,f])
    i = i + 100

In [None]:
#Add flag to each comments
fds_b4_1mo_body_res_base['toxicity_flag'] = np.where(fds_b4_1mo_body_res_base['toxicity']>0.5,1,0)
fds_b4_1mo_body_res_base['severe_toxicity_flag'] = np.where(fds_b4_1mo_body_res_base['severe_toxicity']>0.01,1,0)
fds_b4_1mo_body_res_base['obscene_flag'] = np.where(fds_b4_1mo_body_res_base['obscene']>0.5,1,0)
fds_b4_1mo_body_res_base['threat_flag'] = np.where(fds_b4_1mo_body_res_base['threat']>0.5,1,0)
fds_b4_1mo_body_res_base['insult_flag'] = np.where(fds_b4_1mo_body_res_base['insult']>0.5,1,0)
fds_b4_1mo_body_res_base['identity_attack_flag'] = np.where(fds_b4_1mo_body_res_base['identity_attack']>0.5,1,0)
print(fds_b4_1mo_body_res_base.mean())

#### After

In [None]:
i = 0
n = len(fds_aft_1mo_body)
fds_aft_1mo_body_res_base = pd.DataFrame()
while i < n:
    res = unbiasedmodel.predict(fds_aft_1mo_body_res_base[i:i+100])
    f = pd.DataFrame(res,fds_aft_1mo_body_res_base[i:i+100]).round(5)
    fds_aft_1mo_body_res_base = pd.concat([fds_aft_1mo_body_res_base,f])
    i = i + 100

In [None]:
#Add flag to each comments
fds_aft_1mo_body_res_base['toxicity_flag'] = np.where(fds_aft_1mo_body_res_base['toxicity']>0.5,1,0)
fds_aft_1mo_body_res_base['severe_toxicity_flag'] = np.where(fds_aft_1mo_body_res_base['severe_toxicity']>0.01,1,0)
fds_aft_1mo_body_res_base['obscene_flag'] = np.where(fds_aft_1mo_body_res_base['obscene']>0.5,1,0)
fds_aft_1mo_body_res_base['threat_flag'] = np.where(fds_aft_1mo_body_res_base['threat']>0.0035,1,0)
fds_aft_1mo_body_res_base['insult_flag'] = np.where(fds_aft_1mo_body_res_base['insult']>0.5,1,0)
fds_aft_1mo_body_res_base['identity_attack_flag'] = np.where(fds_aft_1mo_body_res_base['identity_attack']>0.5,1,0)
fds_aft_1mo_body_res_base['sexual_explicit_flag'] = np.where(fds_aft_1mo_body_res_base['sexual_explicit']>0.5,1,0)
print(fds_aft_1mo_body_res_base.mean())

#### Prelim Regression

In [None]:
import scipy.stats as sts

In [None]:
fds_aft_1mo_body_res_base['post'] = 1
fds_b4_1mo_body_res_base['post'] = 0
fds = pd.concat([fds_aft_1mo_body_res_base, fds_b4_1mo_body_res_base])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['toxicity'], fds_aft_1mo_body_res_base['toxicity'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['severe_toxicity'], fds_aft_1mo_body_res_base['severe_toxicity'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['obscene'], fds_aft_1mo_body_res_base['obscene'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['threat'], fds_aft_1mo_body_res_base['threat'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['insult'], fds_aft_1mo_body_res_base['insult'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['identity_attack'], fds_aft_1mo_body_res_base['identity_attack'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['toxicity_flag'], fds_aft_1mo_body_res_base['toxicity_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['severe_toxicity_flag'], fds_aft_1mo_body_res_base['severe_toxicity_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['obscene_flag'], fds_aft_1mo_body_res_base['obscene_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['threat_flag'], fds_aft_1mo_body_res_base['threat_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['insult_flag'], fds_aft_1mo_body_res_base['insult_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['identity_attack_flag'], fds_aft_1mo_body_res_base['identity_attack_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['sexual_explicit_flag'], fds_aft_1mo_body_res_base['sexual_explicit_flag'])

## Set up Model - Unbiased Model

In [None]:
#Set pre-trained model and run o
unbiasedmodel = Detoxify('unbiased', device='cuda')

### One month

#### Before 1 month

In [None]:
i = 0
n = len(fds_b4_1mo_body)
fds_b4_1mo_body_res = pd.DataFrame()

In [None]:
while i < n:
    res = unbiasedmodel.predict(fds_b4_1mo_body[i:i+100])
    f = pd.DataFrame(res,fds_b4_1mo_body[i:i+100]).round(5)
    fds_b4_1mo_body_res = pd.concat([fds_b4_1mo_body_res,f])
    i = i + 100

In [None]:
#Add flag to each comments
fds_b4_1mo_body_res['toxicity_flag'] = np.where(fds_b4_1mo_body_res['toxicity']>0.5,1,0)
fds_b4_1mo_body_res['severe_toxicity_flag'] = np.where(fds_b4_1mo_body_res['severe_toxicity']>0.01,1,0)
fds_b4_1mo_body_res['obscene_flag'] = np.where(fds_b4_1mo_body_res['obscene']>0.5,1,0)
fds_b4_1mo_body_res['threat_flag'] = np.where(fds_b4_1mo_body_res['threat']>0.5,1,0)
fds_b4_1mo_body_res['insult_flag'] = np.where(fds_b4_1mo_body_res['insult']>0.5,1,0)
fds_b4_1mo_body_res['identity_attack_flag'] = np.where(fds_b4_1mo_body_res['identity_attack']>0.5,1,0)
fds_b4_1mo_body_res['sexual_explicit_flag'] = np.where(fds_b4_1mo_body_res['sexual_explicit']>0.5,1,0)
print(fds_b4_1mo_body_res.mean())

#### After 1 month comments

In [None]:
i = 0
n = len(fds_aft_1mo_body)
fds_aft_1mo_body_res = pd.DataFrame()
while i < n:
    res = unbiasedmodel.predict(fds_aft_1mo_body[i:i+100])
    f = pd.DataFrame(res,fds_aft_1mo_body[i:i+100]).round(5)
    fds_aft_1mo_body_res = pd.concat([fds_aft_1mo_body_res,f])
    i = i + 100

In [None]:
#Add flag to each comments
fds_aft_1mo_body_res['toxicity_flag'] = np.where(fds_aft_1mo_body_res['toxicity']>0.5,1,0)
fds_aft_1mo_body_res['severe_toxicity_flag'] = np.where(fds_aft_1mo_body_res['severe_toxicity']>0.01,1,0)
fds_aft_1mo_body_res['obscene_flag'] = np.where(fds_aft_1mo_body_res['obscene']>0.5,1,0)
fds_aft_1mo_body_res['threat_flag'] = np.where(fds_aft_1mo_body_res['threat']>0.0035,1,0)
fds_aft_1mo_body_res['insult_flag'] = np.where(fds_aft_1mo_body_res['insult']>0.5,1,0)
fds_aft_1mo_body_res['identity_attack_flag'] = np.where(fds_aft_1mo_body_res['identity_attack']>0.5,1,0)
fds_aft_1mo_body_res['sexual_explicit_flag'] = np.where(fds_aft_1mo_body_res['sexual_explicit']>0.5,1,0)
print(fds_aft_1mo_body_res.mean())

#### Score Difference

#### Prelim regression - 1 mo before and after

In [None]:
fds_aft_1mo_body_res['post'] = 1
fds_b4_1mo_body_res['post'] = 0
fds = pd.concat([fds_aft_1mo_body_res, fds_b4_1mo_body_res])

#### Score Difference

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res['toxicity'], fds_b4_1mo_body_res['toxicity'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res['severe_toxicity'], fds_b4_1mo_body_res['severe_toxicity'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res['obscene'], fds_b4_1mo_body_res['obscene'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res['threat'], fds_b4_1mo_body_res['threat'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res['insult'], fds_b4_1mo_body_res['insult'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res['identity_attack'], fds_b4_1mo_body_res['identity_attack'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res['sexual_explicit'], fds_b4_1mo_body_res['sexual_explicit'])

#### Perc Flag Difference

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res['toxicity_flag'], fds_b4_1mo_body_res['toxicity_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res['severe_toxicity_flag'], fds_b4_1mo_body_res['severe_toxicity_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res['obscene_flag'], fds_b4_1mo_body_res['obscene_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res['threat_flag'], fds_b4_1mo_body_res['threat_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res['insult_flag'], fds_b4_1mo_body_res['insult_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res['identity_attack_flag'], fds_b4_1mo_body_res['identity_attack_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res['sexual_explicit_flag'], fds_b4_1mo_body_res['sexual_explicit_flag'])

In [None]:
test =pd.read_csv('./data/Control/feminism_clean.csv')

In [None]:
test_body = list(test[test['post']==1]['body'].values.flatten())

In [None]:
test_extract = test[test['post']==1][0:100]

In [None]:
test_extract

In [None]:
test_res = Detoxify('unbiased',device = 'cuda').predict(test_extract['body'])

In [None]:
test_res_df = pd.DataFrame(test_res)

In [None]:
test_extract_df = pd.DataFrame(test_extract)

In [None]:
pd.concat([test_res_df, test_extract])