In [1]:
from detoxify import Detoxify

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

In [21]:
import numpy as np

In [95]:
import scipy.stats as sts

## Tutorial

In [11]:
# each model takes in either a string or a list of strings
results = Detoxify('original').predict('example text')

In [20]:
results = Detoxify('unbiased').predict(['example text 1','example text 2'])

In [14]:
results = Detoxify('multilingual').predict(['example text','exemple de texte','texto de ejemplo','testo di esempio','texto de exemplo','örnek metin','пример текста'])

In [13]:
# to specify the device the model will be allocated on (defaults to cpu), accepts any torch.device input
model = Detoxify('original', device='cuda')

In [3]:
import pandas as pd

In [4]:
results_df = pd.DataFrame(results).round(8)

In [15]:
results_df

Unnamed: 0,toxicity,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit
0,0.000196,0.000193,0.001263,0.000323,0.000883,0.000138,9e-05
1,0.000563,0.00481,0.030099,0.005532,0.026441,0.002134,0.001036
2,0.000628,0.003142,0.022164,0.003467,0.017627,0.001497,0.000616
3,0.000992,0.004267,0.033336,0.005408,0.028724,0.001951,0.000833
4,0.000612,0.001548,0.012159,0.001841,0.009405,0.000858,0.000362
5,0.000591,0.002832,0.021326,0.003134,0.017865,0.001327,0.000616
6,0.000535,0.003826,0.026462,0.00403,0.021902,0.001632,0.000747


In [19]:
#Get Toxicity Mean
results_df['toxicity'].mean()

0.0005879871428571428

In [20]:
#Get all column Mean
results_df.mean()

toxicity           0.000588
severe_toxicity    0.002945
obscene            0.020973
identity_attack    0.003391
insult             0.017550
threat             0.001362
sexual_explicit    0.000614
dtype: float64

# Function

In [76]:
def detox_loop(df,model):
    detox = Detoxify(model, device='cuda')
    i = 0
    n = len(df)
    df_res = pd.DataFrame()
    while i < n:
        res = detox.predict(df[i:i+100])
        f = pd.DataFrame(res,df[i:i+100]).round(5)
        df_res = pd.concat([df_res,f])
        i = i + 100
    return df_res

In [77]:
def flag_base(df_res,thresh):
    [float(i) for i in thresh]
    df_res['toxicity_flag'] = np.where(df_res['toxicity']>thresh[0],1,0)
    df_res['severe_toxicity_flag'] = np.where(df_res['severe_toxicity']>thresh[1],1,0)
    df_res['obscene_flag'] = np.where(df_res['obscene']>thresh[2],1,0)
    df_res['threat_flag'] = np.where(df_res['threat']>thresh[3],1,0)
    df_res['insult_flag'] = np.where(df_res['insult']>thresh[4],1,0)
    df_res['identity_attack_flag'] = np.where(df_res['identity_attack']>thresh[5],1,0)
    return df_res

In [78]:
def flag_unbiased(df_res,thresh):
    [float(i) for i in thresh]
    df_res['toxicity_flag'] = np.where(df_res['toxicity']>thresh[0],1,0)
    df_res['severe_toxicity_flag'] = np.where(df_res['severe_toxicity']>thresh[1],1,0)
    df_res['obscene_flag'] = np.where(df_res['obscene']>thresh[2],1,0)
    df_res['threat_flag'] = np.where(df_res['threat']>thresh[3],1,0)
    df_res['insult_flag'] = np.where(df_res['insult']>thresh[4],1,0)
    df_res['identity_attack_flag'] = np.where(df_res['identity_attack']>thresh[5],1,0)
    df_res['sexual_explicit_flag'] = np.where(df_res['sexual_explicit']>thresh[6],1,0)
    return df_res

In [79]:
def process(df,model,thresh):
    df=list(df['body'].values.flatten())
    res = detox_loop(df=df,model=model)
    if model == 'original':
        res = flag_base(df_res=res, thresh = thresh)
    else:
        res = flag_unbiased(df_res=res, thresh = thresh)
    df_mean = res.mean()
    return res,df_mean


In [149]:
def detox(source_dir,model, thresh):
    #keeping separate pre-post list
    df = pd.read_csv(source_dir)
    df_pre = df[df['post']==0]
    df_post = df[df['post']==1]
 
    #pre
    pre_res,pre_mean = process(df=df_pre,model=model, thresh=thresh)
    print(f'Pre: {pre_mean}')
    pre_res = pre_res.reset_index()
    pre_res.rename(columns ={'index':'body'})
    pre_res = pd.concat(df_pre,pre_res)

    #post
    post_res,post_mean = process(df=df_post,model=model, thresh=thresh)
    print(f'Post: {post_mean}')
    post_res = post_res.reset_index()
    post_res.rename(columns ={'index':'body'})
    post_res = pd.concat(df_post,post_res)

    return pre_res,post_res

In [112]:
def compare(pre_res,post_res,model):
    tox = sts.ttest_ind(pre_res['toxicity'], post_res['toxicity'])
    print(f'Toxicity: {tox}')

    sev_tox = sts.ttest_ind(pre_res['severe_toxicity'], post_res['severe_toxicity'])
    print(f'Severe Toxicity: {sev_tox}')

    obscene = sts.ttest_ind(pre_res['obscene'], post_res['obscene'])
    print(f'Obscene: {obscene}')

    threat = sts.ttest_ind(pre_res['threat'], post_res['threat'])
    print(f'Threat: {threat}')

    insult = sts.ttest_ind(pre_res['insult'], post_res['insult'])
    print(f'Insult: {insult}')

    identity = sts.ttest_ind(pre_res['identity_attack'], post_res['identity_attack'])
    print(f'Identity Attack: {identity}')

    tox_f = sts.ttest_ind(pre_res['toxicity_flag'], post_res['toxicity_flag'])
    print(f'Toxicity Flag: {tox_f}')

    sev_tox_f = sts.ttest_ind(pre_res['severe_toxicity_flag'], post_res['severe_toxicity_flag'])
    print(f'Severe Toxicity Flag: {sev_tox_f}')

    obscene_f = sts.ttest_ind(pre_res['obscene_flag'], post_res['obscene_flag'])
    print(f'Obscene Flag: {obscene_f}')

    threat_f = sts.ttest_ind(pre_res['threat_flag'], post_res['threat_flag'])
    print(f'Threat Flag: {threat_f}')

    insult_f = sts.ttest_ind(pre_res['insult'], post_res['insult'])
    print(f'Insult Flag: {insult_f}')

    identity_f = sts.ttest_ind(pre_res['identity_attack_flag'], post_res['identity_attack_flag'])
    print(f'Identity Attack Flag: {identity}')
    
    if model == 'unbiased':
        sexual_exp = sts.ttest_ind(pre_res['sexual_explicit'], post_res['sexual_explicit'])
        print(f'Sexual Explicit: {sexual_exp}')

        sexual_exp_f = sts.ttest_ind(pre_res['sexual_explicit_flag'], post_res['sexual_explicit_flag'])
        print(f'Sexual Explicit Flag: {sexual_exp_f}')

# Apply

## r/feminism

In [110]:
thresh = [.5,.5,.5,.5,.5,.5]
thresh = [float(i) for i in thresh]
feminism_base_pre,feminism_base_post = detox(source_dir='./data/Control/feminism_clean.csv', model = 'original',thresh = thresh)

Pre: toxicity                0.179478
severe_toxicity         0.008776
obscene                 0.089124
threat                  0.004224
insult                  0.051553
identity_attack         0.018389
toxicity_flag           0.162950
severe_toxicity_flag    0.001546
obscene_flag            0.087437
threat_flag             0.002208
insult_flag             0.036653
identity_attack_flag    0.006403
dtype: float64
Post: toxicity                0.168009
severe_toxicity         0.008474
obscene                 0.084894
threat                  0.003833
insult                  0.051067
identity_attack         0.015288
toxicity_flag           0.153433
severe_toxicity_flag    0.001343
obscene_flag            0.079402
threat_flag             0.001918
insult_flag             0.036824
identity_attack_flag    0.005754
dtype: float64


In [113]:
compare(feminism_base_pre,feminism_base_post, model = 'original')

Toxicity: Ttest_indResult(statistic=1.853600213519358, pvalue=0.06382659004506512)
Severe Toxicity: Ttest_indResult(statistic=0.3407265465916189, pvalue=0.7333167999434089)
Obscene: Ttest_indResult(statistic=0.8803149343337442, pvalue=0.3787104420976326)
Threat: Ttest_indResult(statistic=0.5320321206061094, pvalue=0.5947158825402368)
Insult: Ttest_indResult(statistic=0.1527983079947721, pvalue=0.8785604968650568)
Identity Attack: Ttest_indResult(statistic=2.1791876273363764, pvalue=0.029341610547347192)
Toxicity Flag: Ttest_indResult(statistic=1.2849888515759413, pvalue=0.1988267327967718)
Severe Toxicity Flag: Ttest_indResult(statistic=0.26388040511235145, pvalue=0.7918776571224689)
Obscene Flag: Ttest_indResult(statistic=1.432761808292858, pvalue=0.15195800980732838)
Threat Flag: Ttest_indResult(statistic=0.3154951937407536, pvalue=0.7523924583489356)
Insult Flag: Ttest_indResult(statistic=0.1527983079947721, pvalue=0.8785604968650568)
Identity Attack Flag: Ttest_indResult(statistic=

In [150]:
thresh = [.5,.5,.5,.5,.5,.5,.5]
thresh = [float(i) for i in thresh]
feminism_unb_pre,feminism_unb_post = detox(source_dir='./data/Control/feminism_clean.csv', model = 'unbiased',thresh = thresh)

Pre: toxicity                0.178182
severe_toxicity         0.003853
obscene                 0.076309
identity_attack         0.032643
insult                  0.073435
threat                  0.004031
sexual_explicit         0.059691
toxicity_flag           0.159859
severe_toxicity_flag    0.000442
obscene_flag            0.078384
threat_flag             0.002208
insult_flag             0.059837
identity_attack_flag    0.014352
sexual_explicit_flag    0.052329
dtype: float64


  pre_res = pd.concat(df_pre,pre_res)


TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"

In [116]:
compare(feminism_unb_pre,feminism_unb_post, model = 'unbiased')

Toxicity: Ttest_indResult(statistic=1.0941068948819561, pvalue=0.2739351526047687)
Severe Toxicity: Ttest_indResult(statistic=0.7168714888169526, pvalue=0.4734705832910765)
Obscene: Ttest_indResult(statistic=0.08581130396441883, pvalue=0.9316181839735099)
Threat: Ttest_indResult(statistic=0.08021507466557824, pvalue=0.9360678487879641)
Insult: Ttest_indResult(statistic=-1.6159819226597314, pvalue=0.10613060530521833)
Identity Attack: Ttest_indResult(statistic=2.8798243483258084, pvalue=0.00398764079906833)
Toxicity Flag: Ttest_indResult(statistic=0.11614791806836147, pvalue=0.9075377069099041)
Severe Toxicity Flag: Ttest_indResult(statistic=1.5175768201161466, pvalue=0.1291535654654599)
Obscene Flag: Ttest_indResult(statistic=-0.11593915209209593, pvalue=0.9077031559342021)
Threat Flag: Ttest_indResult(statistic=0.10432851489020296, pvalue=0.9169107976624281)
Insult Flag: Ttest_indResult(statistic=-1.6159819226597314, pvalue=0.10613060530521833)
Identity Attack Flag: Ttest_indResult(st

In [130]:
# This part needs to be in func
test = pd.read_csv('./data/Control/feminism_clean.csv')

In [131]:
test = test[test['post']==0]

In [134]:
test_body=list(test['body'].values.flatten())[1:100]

In [138]:
test_res = Detoxify('unbiased',device = 'cuda').predict(test_body)

In [139]:
test_res = pd.DataFrame(test_res,test_body)

In [140]:
test_res

Unnamed: 0,toxicity,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit
“buT MeN chOoSe tO wOrK thOsE jERbs”,0.146783,0.000125,0.023427,0.006523,0.031431,0.000591,0.003193
Are you my excuse to deem feminists anything other than hot-headed attention seekers lacking in self awareness?,0.106481,0.000040,0.001348,0.036017,0.024539,0.000339,0.003825
Then do it. \n\n“Everyone I’m leaving!”\n\n“Bye”,0.001507,0.000002,0.000111,0.000120,0.000194,0.000062,0.000029
"Wait ...let me get this straight. There are actually people who believe rape, gun violence, gang activity, and terrorism are strictly male issues? Is this satire? I can't comprehend how anyone would be dumb enough to buy this ideology. Ill take my ban now, thanks.",0.664137,0.000070,0.002260,0.003002,0.526946,0.000362,0.008923
And?,0.000583,0.000001,0.000023,0.000123,0.000129,0.000020,0.000009
...,...,...,...,...,...,...,...
"I'd like to refer you to rules 1,2 and 3 of this subreddit. One of them is it's forbidden to promote regressive agenda.",0.000379,0.000001,0.000024,0.000072,0.000097,0.000018,0.000013
"Again, BDSM and Kinks are not same thing and you don't have the guts to either understand or answer the question. \n\nWhat's the most comedic thing is you people don't even understand that equating rape with kinks and BDSM, you people actually are performing kink shaming. And then there are matters of women who got criticised for being a sexual dominatrix by people like you. People like you who don't know nothing about BDSM, kinks or fetishes dominate online spaces where they use their ill informed notions to shame victims, women and practitioners of kinks themselves when they speak out against rape in real life or porn. ""Oh, you hate rape porn, you must be a kink shamer""!!!",0.748611,0.002017,0.069258,0.053675,0.178638,0.003149,0.632282
I'd like to refer you to rules 4 and 5 of this subreddit.,0.000366,0.000001,0.000028,0.000065,0.000092,0.000019,0.000013
belly breaths! 5 second inhale... 5 second exhale...,0.001771,0.000003,0.000113,0.000209,0.000443,0.000034,0.000102


In [151]:
feminism_base_post = feminism_base_post.reset_index()['index']

In [152]:
feminism_base_post.pd.rename(columns ={'index':'body'})

TypeError: Series.rename() got an unexpected keyword argument 'columns'

# Analyze FDS

#### Before comments

In [5]:
#Import BEFORE data
fds_b4_1mo =  pd.read_csv('./data/fds_comments_before_1mo.csv')


In [20]:
#Leave out all 'removed' and 'deleted' comments
fds_b4_1mo_df = fds_b4_1mo[(fds_b4_1mo['body'] != '[deleted]') & (fds_b4_1mo['body'] != '[removed]') & (fds_b4_1mo['author'] != 'AutoModerator')]

#Flatten to list of string
fds_b4_1mo_body = list(fds_b4_1mo_df['body'].values.flatten())

#### After comments

In [7]:
#Import AFTER data
fds_aft_1mo =  pd.read_csv('./data/fds_comments_after_1mo.csv')

In [42]:
#Leave out all 'removed' and 'deleted' comments
fds_aft_1mo_df = fds_aft_1mo[(fds_aft_1mo['body'] != '[deleted]') & (fds_aft_1mo['body'] != '[removed]') & (fds_aft_1mo['author'] !='AutoModerator')]

#Flatten to list of string
fds_aft_1mo_body = list(fds_aft_1mo_df['body'].values.flatten())

## Cleaning Submission
Concerns:
- Structure of a submission: title, body (`self_text`), images/video link (`domain`)

In [35]:
#Import SUBMISSION BEFORE
fds_b4_1mo_sub = pd.read_csv('./data/fds_subm_before_1mo.csv')

In [13]:
#Number of no content submissions
fds_b4_1mo_sub['selftext'].isnull().sum()

1869

In [36]:
#Percentage of no content submissions
fds_b4_1mo_sub['selftext'].isnull().sum()/len(fds_b4_1mo_sub['selftext'] != '[removed]')*100

46.760070052539405

In [42]:
# Condition: domain = i.reddit.it OR imugur AND selftext = ""
len(fds_b4_1mo_sub[((fds_b4_1mo_sub['domain'] == 'i.redd.it') | (fds_b4_1mo_sub['domain'] == 'i.imgur.com') |  (fds_b4_1mo_sub['domain'] == 'imgur.com'))  & fds_b4_1mo_sub['selftext'].isnull()]) 

1136

In [41]:
# Condition: domain = i.reddit.it OR imugur. So basically they are the same.
len(fds_b4_1mo_sub[(fds_b4_1mo_sub['domain'] == 'i.redd.it') | (fds_b4_1mo_sub['domain'] == 'i.imgur.com') |  (fds_b4_1mo_sub['domain'] == 'imgur.com')]) 

1141

In [33]:
#Number of image submissions
len(fds_b4_1mo_sub[(fds_b4_1mo_sub['domain'] == 'self.FemaleDatingStrategy')]) 

2185

In [None]:
fds_b4_1mo_sub_df = fds_b4_1mo_sub[fds_b4_1mo_sub['selftext']]

In [34]:
# Save only submissions has written content and not deleted or removed
fds_b4_1mo_sub_test = fds_b4_1mo_sub[(fds_b4_1mo_sub['selftext'].astype(bool)) | (fds_b4_1mo_sub['selftext'] == '[removed]')]

## Set up Model - Baseline

In [None]:
#Set pre-trained model and run on GPU
originalmodel = Detoxify('original', device='cuda')

### Example

In [None]:
#Run the first batch
res = originalmodel.predict(fds_b4_1mo_body[0:100])

In [None]:
#connect with body text
pd.DataFrame(res,fds_b4_1mo_body[0:100]).round(5)

### One month

#### Before

In [None]:
i = 0
n = len(fds_b4_1mo_body)
fds_b4_1mo_body_res_base = pd.DataFrame()
while i < n:
    res = originalmodel.predict(fds_b4_1mo_body[i:i+100])
    f = pd.DataFrame(res,fds_b4_1mo_body_res_base[i:i+100]).round(5)
    fds_b4_1mo_body_res_base = pd.concat([fds_b4_1mo_body_res_base,f])
    i = i + 100

In [None]:
#Add flag to each comments
fds_b4_1mo_body_res_base['toxicity_flag'] = np.where(fds_b4_1mo_body_res_base['toxicity']>0.5,1,0)
fds_b4_1mo_body_res_base['severe_toxicity_flag'] = np.where(fds_b4_1mo_body_res_base['severe_toxicity']>0.01,1,0)
fds_b4_1mo_body_res_base['obscene_flag'] = np.where(fds_b4_1mo_body_res_base['obscene']>0.5,1,0)
fds_b4_1mo_body_res_base['threat_flag'] = np.where(fds_b4_1mo_body_res_base['threat']>0.5,1,0)
fds_b4_1mo_body_res_base['insult_flag'] = np.where(fds_b4_1mo_body_res_base['insult']>0.5,1,0)
fds_b4_1mo_body_res_base['identity_attack_flag'] = np.where(fds_b4_1mo_body_res_base['identity_attack']>0.5,1,0)
print(fds_b4_1mo_body_res_base.mean())

#### After

In [None]:
i = 0
n = len(fds_aft_1mo_body)
fds_aft_1mo_body_res_base = pd.DataFrame()
while i < n:
    res = unbiasedmodel.predict(fds_aft_1mo_body_res_base[i:i+100])
    f = pd.DataFrame(res,fds_aft_1mo_body_res_base[i:i+100]).round(5)
    fds_aft_1mo_body_res_base = pd.concat([fds_aft_1mo_body_res_base,f])
    i = i + 100

In [None]:
#Add flag to each comments
fds_aft_1mo_body_res_base['toxicity_flag'] = np.where(fds_aft_1mo_body_res_base['toxicity']>0.5,1,0)
fds_aft_1mo_body_res_base['severe_toxicity_flag'] = np.where(fds_aft_1mo_body_res_base['severe_toxicity']>0.01,1,0)
fds_aft_1mo_body_res_base['obscene_flag'] = np.where(fds_aft_1mo_body_res_base['obscene']>0.5,1,0)
fds_aft_1mo_body_res_base['threat_flag'] = np.where(fds_aft_1mo_body_res_base['threat']>0.0035,1,0)
fds_aft_1mo_body_res_base['insult_flag'] = np.where(fds_aft_1mo_body_res_base['insult']>0.5,1,0)
fds_aft_1mo_body_res_base['identity_attack_flag'] = np.where(fds_aft_1mo_body_res_base['identity_attack']>0.5,1,0)
fds_aft_1mo_body_res_base['sexual_explicit_flag'] = np.where(fds_aft_1mo_body_res_base['sexual_explicit']>0.5,1,0)
print(fds_aft_1mo_body_res_base.mean())

#### Prelim Regression

In [None]:
import scipy.stats as sts

In [None]:
fds_aft_1mo_body_res_base['post'] = 1
fds_b4_1mo_body_res_base['post'] = 0
fds = pd.concat([fds_aft_1mo_body_res_base, fds_b4_1mo_body_res_base])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['toxicity'], fds_aft_1mo_body_res_base['toxicity'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['severe_toxicity'], fds_aft_1mo_body_res_base['severe_toxicity'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['obscene'], fds_aft_1mo_body_res_base['obscene'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['threat'], fds_aft_1mo_body_res_base['threat'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['insult'], fds_aft_1mo_body_res_base['insult'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['identity_attack'], fds_aft_1mo_body_res_base['identity_attack'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['toxicity_flag'], fds_aft_1mo_body_res_base['toxicity_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['severe_toxicity_flag'], fds_aft_1mo_body_res_base['severe_toxicity_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['obscene_flag'], fds_aft_1mo_body_res_base['obscene_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['threat_flag'], fds_aft_1mo_body_res_base['threat_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['insult_flag'], fds_aft_1mo_body_res_base['insult_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['identity_attack_flag'], fds_aft_1mo_body_res_base['identity_attack_flag'])

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res_base['sexual_explicit_flag'], fds_aft_1mo_body_res_base['sexual_explicit_flag'])

## Set up Model - Unbiased Model

In [76]:
#Set pre-trained model and run o
unbiasedmodel = Detoxify('unbiased', device='cuda')

### One month

#### Before 1 month

In [81]:
i = 0
n = len(fds_b4_1mo_body)
fds_b4_1mo_body_res = pd.DataFrame()

In [82]:
while i < n:
    res = unbiasedmodel.predict(fds_b4_1mo_body[i:i+100])
    f = pd.DataFrame(res,fds_b4_1mo_body[i:i+100]).round(5)
    fds_b4_1mo_body_res = pd.concat([fds_b4_1mo_body_res,f])
    i = i + 100

In [86]:
#Add flag to each comments
fds_b4_1mo_body_res['toxicity_flag'] = np.where(fds_b4_1mo_body_res['toxicity']>0.5,1,0)
fds_b4_1mo_body_res['severe_toxicity_flag'] = np.where(fds_b4_1mo_body_res['severe_toxicity']>0.01,1,0)
fds_b4_1mo_body_res['obscene_flag'] = np.where(fds_b4_1mo_body_res['obscene']>0.5,1,0)
fds_b4_1mo_body_res['threat_flag'] = np.where(fds_b4_1mo_body_res['threat']>0.5,1,0)
fds_b4_1mo_body_res['insult_flag'] = np.where(fds_b4_1mo_body_res['insult']>0.5,1,0)
fds_b4_1mo_body_res['identity_attack_flag'] = np.where(fds_b4_1mo_body_res['identity_attack']>0.5,1,0)
fds_b4_1mo_body_res['sexual_explicit_flag'] = np.where(fds_b4_1mo_body_res['sexual_explicit']>0.5,1,0)
print(fds_b4_1mo_body_res.mean())

toxicity                0.239328
severe_toxicity         0.005412
obscene                 0.133117
identity_attack         0.016930
insult                  0.118515
threat                  0.003724
sexual_explicit         0.078196
toxicity_flag           0.234986
severe_toxicity_flag    0.089187
obscene_flag            0.141755
threat_flag             0.002022
insult_flag             0.101393
identity_attack_flag    0.005017
sexual_explicit_flag    0.070990
dtype: float64


#### After 1 month comments

In [43]:
i = 0
n = len(fds_aft_1mo_body)
fds_aft_1mo_body_res = pd.DataFrame()
while i < n:
    res = unbiasedmodel.predict(fds_aft_1mo_body[i:i+100])
    f = pd.DataFrame(res,fds_aft_1mo_body[i:i+100]).round(5)
    fds_aft_1mo_body_res = pd.concat([fds_aft_1mo_body_res,f])
    i = i + 100

In [60]:
#Add flag to each comments
fds_aft_1mo_body_res['toxicity_flag'] = np.where(fds_aft_1mo_body_res['toxicity']>0.5,1,0)
fds_aft_1mo_body_res['severe_toxicity_flag'] = np.where(fds_aft_1mo_body_res['severe_toxicity']>0.01,1,0)
fds_aft_1mo_body_res['obscene_flag'] = np.where(fds_aft_1mo_body_res['obscene']>0.5,1,0)
fds_aft_1mo_body_res['threat_flag'] = np.where(fds_aft_1mo_body_res['threat']>0.0035,1,0)
fds_aft_1mo_body_res['insult_flag'] = np.where(fds_aft_1mo_body_res['insult']>0.5,1,0)
fds_aft_1mo_body_res['identity_attack_flag'] = np.where(fds_aft_1mo_body_res['identity_attack']>0.5,1,0)
fds_aft_1mo_body_res['sexual_explicit_flag'] = np.where(fds_aft_1mo_body_res['sexual_explicit']>0.5,1,0)
print(fds_aft_1mo_body_res.mean())

toxicity                0.233624
severe_toxicity         0.004703
obscene                 0.131320
identity_attack         0.013421
insult                  0.110362
threat                  0.004406
sexual_explicit         0.078861
toxicity_flag           0.230385
severe_toxicity_flag    0.085282
obscene_flag            0.137574
threat_flag             0.074049
insult_flag             0.094226
identity_attack_flag    0.003203
post                    1.000000
sexual_explicit_flag    0.072635
dtype: float64


#### Score Difference

#### Prelim regression - 1 mo before and after

In [45]:
fds_aft_1mo_body_res['post'] = 1
fds_b4_1mo_body_res['post'] = 0
fds = pd.concat([fds_aft_1mo_body_res, fds_b4_1mo_body_res])

#### Score Difference

In [None]:
sts.ttest_ind(fds_aft_1mo_body_res['toxicity'], fds_b4_1mo_body_res['toxicity'])

In [64]:
sts.ttest_ind(fds_aft_1mo_body_res['severe_toxicity'], fds_b4_1mo_body_res['severe_toxicity'])

Ttest_indResult(statistic=-1.2859843700151743, pvalue=0.1984564801701852)

In [65]:
sts.ttest_ind(fds_aft_1mo_body_res['obscene'], fds_b4_1mo_body_res['obscene'])

Ttest_indResult(statistic=-1.119669542293619, pvalue=0.2628618011816403)

In [53]:
sts.ttest_ind(fds_aft_1mo_body_res['threat'], fds_b4_1mo_body_res['threat'])

Ttest_indResult(statistic=1.603121251103093, pvalue=0.10891633381862119)

In [178]:
sts.ttest_ind(fds_aft_1mo_body_res['insult'], fds_b4_1mo_body_res['insult'])

Ttest_indResult(statistic=-4.466846087339321, pvalue=7.959826618625523e-06)

In [179]:
sts.ttest_ind(fds_aft_1mo_body_res['identity_attack'], fds_b4_1mo_body_res['identity_attack'])

Ttest_indResult(statistic=-5.756451450590391, pvalue=8.652672465096693e-09)

In [73]:
sts.ttest_ind(fds_aft_1mo_body_res['sexual_explicit'], fds_b4_1mo_body_res['sexual_explicit'])

Ttest_indResult(statistic=0.29539620657489857, pvalue=0.7676928710610675)

#### Perc Flag Difference

In [74]:
sts.ttest_ind(fds_aft_1mo_body_res['toxicity_flag'], fds_b4_1mo_body_res['toxicity_flag'])

Ttest_indResult(statistic=-1.0097791287103937, pvalue=0.3126076562298086)

In [75]:
sts.ttest_ind(fds_aft_1mo_body_res['severe_toxicity_flag'], fds_b4_1mo_body_res['severe_toxicity_flag'])

Ttest_indResult(statistic=-1.2859843700151743, pvalue=0.1984564801701852)

In [68]:
sts.ttest_ind(fds_aft_1mo_body_res['obscene_flag'], fds_b4_1mo_body_res['obscene_flag'])

Ttest_indResult(statistic=-1.119669542293619, pvalue=0.2628618011816403)

In [69]:
sts.ttest_ind(fds_aft_1mo_body_res['threat_flag'], fds_b4_1mo_body_res['threat_flag'])

Ttest_indResult(statistic=31.529471453723872, pvalue=2.293974688841594e-215)

In [70]:
sts.ttest_ind(fds_aft_1mo_body_res['insult_flag'], fds_b4_1mo_body_res['insult_flag'])

Ttest_indResult(statistic=-2.2460639116165053, pvalue=0.02470572853072864)

In [71]:
sts.ttest_ind(fds_aft_1mo_body_res['identity_attack_flag'], fds_b4_1mo_body_res['identity_attack_flag'])

Ttest_indResult(statistic=-2.713755782402491, pvalue=0.0066555668333369185)

In [72]:
sts.ttest_ind(fds_aft_1mo_body_res['sexual_explicit_flag'], fds_b4_1mo_body_res['sexual_explicit_flag'])

Ttest_indResult(statistic=0.5894638190458906, pvalue=0.5555537378316449)

In [71]:
test =pd.read_csv('./data/Control/feminism_clean.csv')

In [88]:
list(test[test['post']==1]['body'].values.flatten())

['The comments on the OP are very disheartening.',
 "I was joking. They have a colossal advantage. Objectively.\n\nI'm just highlighting the hypocrisy.",
 "There are a few different examples, but here's one that I can actually name off the top of my head. I know a few people that are far left feminists that are asking for more money due to being women and suffering through a wage gap that has been debunked a good few times.",
 'wow so cute!',
 'Theres only two genders!',
 "&gt; always have virgin tight vagina.\n\nVaginas aren't deformed by penises.",
 '"Put light against light - you have nothing. Put dark against dark - you have nothing. It\'s the contrast of light and dark that each give the other one meaning." - Bob Ross',
 'I lost all respect for her when she praised Kavanaugh and Gorsuch. She said they were both "smart and very decent men".\n\nETA source in main comment: https://www.google.com/amp/s/amp.cnn.com/cnn/2019/07/26/politics/ruth-bader-ginsburg-kavanaugh-gorsuch/index.htm