In [1]:
from detoxify import Detoxify

  from .autonotebook import tqdm as notebook_tqdm


## Tutorial

In [11]:
# Detoxify(...).predict accepts either a single string or a list of strings.
# Here the 'original' model scores one string.
results = Detoxify('original').predict('example text')

In [20]:
# Score a list of two strings with the 'unbiased' model (overwrites `results`).
results = Detoxify('unbiased').predict(['example text 1','example text 2'])

In [14]:
# Score a list of seven example strings with the 'multilingual' model (overwrites `results`).
results = Detoxify('multilingual').predict(['example text','exemple de texte','texto de ejemplo','testo di esempio','texto de exemplo','örnek metin','пример текста'])

In [13]:
# Pass `device` to choose where the model is allocated (defaults to cpu);
# any torch.device input is accepted. Here the model is placed on 'cuda'.
model = Detoxify('original', device='cuda')

In [3]:
import pandas as pd

In [4]:
# Put the most recently assigned `results` into a DataFrame, rounded to 8 decimal places.
results_df = pd.DataFrame(results).round(8)

In [15]:
# Display the table of scores.
results_df

Unnamed: 0,toxicity,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit
0,0.000196,0.000193,0.001263,0.000323,0.000883,0.000138,9e-05
1,0.000563,0.00481,0.030099,0.005532,0.026441,0.002134,0.001036
2,0.000628,0.003142,0.022164,0.003467,0.017627,0.001497,0.000616
3,0.000992,0.004267,0.033336,0.005408,0.028724,0.001951,0.000833
4,0.000612,0.001548,0.012159,0.001841,0.009405,0.000858,0.000362
5,0.000591,0.002832,0.021326,0.003134,0.017865,0.001327,0.000616
6,0.000535,0.003826,0.026462,0.00403,0.021902,0.001632,0.000747


In [19]:
# Mean of the 'toxicity' column across all rows of results_df
results_df['toxicity'].mean()

0.0005879871428571428

In [20]:
# Column-wise mean of every column in results_df
results_df.mean()

toxicity           0.000588
severe_toxicity    0.002945
obscene            0.020973
identity_attack    0.003391
insult             0.017550
threat             0.001362
sexual_explicit    0.000614
dtype: float64

## Analyze FDS

In [204]:
# Load the BEFORE comments (1 month window)
fds_b4_1mo = pd.read_csv('./data/fds_comments_before_1mo.csv')

# Percentage of comments whose body is a '[deleted]' or '[removed]' placeholder
len(fds_b4_1mo[fds_b4_1mo['body'].isin(['[deleted]', '[removed]'])]) / len(fds_b4_1mo) * 100

8.734121215951463

In [201]:
# Leave out all '[removed]' and '[deleted]' placeholder comments and keep the
# remaining bodies as a flat list of strings for the model.
# Fixes two defects in the previous version: it referenced the undefined name
# `fds_b4_1mo_` (NameError), and it built the list from the unfiltered frame,
# so the placeholder comments were never actually dropped.
fds_b4_1mo_body = (
    fds_b4_1mo.loc[~fds_b4_1mo['body'].isin(['[deleted]', '[removed]']), 'body']
    .tolist()
)

In [205]:
# Load the AFTER comments (1 month window)
fds_aft_1mo = pd.read_csv('./data/fds_comments_after_1mo.csv')

# Percentage of comments whose body is a '[deleted]' or '[removed]' placeholder
len(fds_aft_1mo[fds_aft_1mo['body'].isin(['[deleted]', '[removed]'])]) / len(fds_aft_1mo) * 100

20.9742686806907

In [None]:
# Leave out all '[removed]' and '[deleted]' placeholder comments and keep the
# remaining bodies as a flat list of strings for the model.
# The previous version filtered into `fds_aft_1mo_body` and then immediately
# overwrote it with the bodies of the UNFILTERED frame, so the placeholder
# comments were never actually dropped.
fds_aft_1mo_body = (
    fds_aft_1mo.loc[~fds_aft_1mo['body'].isin(['[deleted]', '[removed]']), 'body']
    .tolist()
)

### Number of deleted comments - Other months

In [164]:
# Load the BEFORE comments - Jul
fds_jul = pd.read_csv('./data/fds_comments_before_4mo.csv')

# Percentage of comments whose body is a '[deleted]' or '[removed]' placeholder
len(fds_jul[fds_jul['body'].isin(['[deleted]', '[removed]'])]) / len(fds_jul) * 100

0.0

In [165]:
# Load the BEFORE comments - Aug
fds_aug = pd.read_csv('./data/fds_comments_before_3mo.csv')

# Percentage of comments whose body is a '[deleted]' or '[removed]' placeholder
len(fds_aug[fds_aug['body'].isin(['[deleted]', '[removed]'])]) / len(fds_aug) * 100

0.42087542087542085

In [167]:
# Load the BEFORE comments - Sep
fds_sep = pd.read_csv('./data/fds_comments_before_2mo.csv')

# Percentage of comments whose body is a '[deleted]' or '[removed]' placeholder
len(fds_sep[fds_sep['body'].isin(['[deleted]', '[removed]'])]) / len(fds_sep) * 100

3.892270798745619

In [169]:
# Load the AFTER comments - Dec
fds_dec = pd.read_csv('./data/fds_comments_after_2mo.csv')

# Percentage of comments whose body is a '[deleted]' or '[removed]' placeholder
len(fds_dec[fds_dec['body'].isin(['[deleted]', '[removed]'])]) / len(fds_dec) * 100

15.543538546866333

In [170]:
# Load the AFTER comments - Jan
fds_jan = pd.read_csv('./data/fds_comments_after_3mo.csv')

# Percentage of comments whose body is a '[deleted]' or '[removed]' placeholder
len(fds_jan[fds_jan['body'].isin(['[deleted]', '[removed]'])]) / len(fds_jan) * 100

12.443613605884504

### Set up model

In [67]:
# Load the pre-trained 'original' Detoxify model and allocate it on the GPU.
# NOTE(review): device='cuda' presumably requires a CUDA-capable machine;
# the tutorial section notes that the device defaults to cpu when not given.
originalmodel = Detoxify('original', device='cuda')

### Example

In [59]:
# Score only the first 100 comments as a trial batch
res = originalmodel.predict(fds_b4_1mo_body[:100])

In [60]:
# Use the comment text as the row index so every score row shows its source comment
pd.DataFrame(res, index=fds_b4_1mo_body[:100]).round(5)

Unnamed: 0,toxicity,severe_toxicity,obscene,threat,insult,identity_attack
fuking loser\r\n\r\n&amp;#x200B;\r\n\r\nNew rule: hold out for someone who can spell fuck.,0.99798,0.33718,0.98986,0.00390,0.91408,0.01373
"This. It’s ridiculous how they encourage a certain behavior and then try to punish the woman for it. Sometimes I wish I liked women, because men make me sick at times 😂",0.14266,0.00034,0.00317,0.00064,0.00253,0.00204
Have six cats and one man. Can confirm.,0.00071,0.00011,0.00017,0.00012,0.00019,0.00014
"Really? That's it I live with him so I dont get a say anymore? When people are unfulfilled in relationships ' just put up with it ' just leads to resentment and unhappiness on both sides. been there done that, not again.\r\n\r\nNo third way? Like maybe I should just take him out more? Though we'd end up doing what I want all the time so it's not a great solution. That's why i'm asking",0.00349,0.00010,0.00031,0.00012,0.00025,0.00017
"I think you already know the answer, although you stated you aren’t unhappy in the title, you seem unsatisfied according to your post. Why do you have to wait until things get better? Don’t waste your time. Meet another man who can fulfil your needs.",0.04644,0.00014,0.00077,0.00073,0.00157,0.00047
...,...,...,...,...,...,...
Amazing post! I need to reread it more. I agree. Thank you for sharing.,0.00052,0.00014,0.00020,0.00014,0.00018,0.00015
"This is great, high value women absolutely know how and when to say no. Practice if you need to.",0.00927,0.00012,0.00046,0.00015,0.00047,0.00045
It is. He posted it but can’t reply bc he’s banned.,0.00154,0.00009,0.00022,0.00010,0.00020,0.00014
MGTOW = zero value moids,0.00326,0.00009,0.00034,0.00009,0.00025,0.00015


### Level of toxicity by 1 month before and after introduction of bot

#### Before 1 month

In [133]:
# Empty result frame, number of BEFORE comments, and batch cursor for batch scoring
fds_b4_1mo_body_res = pd.DataFrame()
n = len(fds_b4_1mo_body)
i = 0

In [134]:
# Score every BEFORE comment in batches of 100 and index each score row by its
# comment text. The cell is self-contained: it derives the batch boundaries
# itself instead of relying on `i` / `n` left over from another cell, so
# re-running it always recomputes the full result rather than silently doing
# nothing. Batches are collected in a list and concatenated once, because
# calling pd.concat inside the loop re-copies all earlier rows on every pass.
b4_batch_frames = []
for start in range(0, len(fds_b4_1mo_body), 100):
    batch = fds_b4_1mo_body[start:start + 100]
    res = originalmodel.predict(batch)
    b4_batch_frames.append(pd.DataFrame(res, index=batch).round(5))

# pd.concat raises on an empty list, so fall back to an empty frame when there are no comments
fds_b4_1mo_body_res = pd.concat(b4_batch_frames) if b4_batch_frames else pd.DataFrame()

In [105]:
import numpy as np

In [196]:
# Flag each BEFORE comment whose score exceeds the per-category cut-off (1 = flagged, 0 = not)
b4_flag_cutoffs = {
    'toxicity': 0.5,
    'severe_toxicity': 0.01,
    'obscene': 0.5,
    'threat': 0.0035,
    'insult': 0.5,
    'identity_attack': 0.5,
}
for category, cutoff in b4_flag_cutoffs.items():
    fds_b4_1mo_body_res[category + '_flag'] = np.where(fds_b4_1mo_body_res[category] > cutoff, 1, 0)

# Column means: average score per category and share of flagged comments
print(fds_b4_1mo_body_res.mean())

toxicity                0.247975
severe_toxicity         0.011171
obscene                 0.144904
threat                  0.003408
insult                  0.089196
identity_attack         0.012623
toxicity_flag           0.233987
severe_toxicity_flag    0.137179
obscene_flag            0.147220
threat_flag             0.077765
insult_flag             0.071602
identity_attack_flag    0.003809
post                    0.000000
dtype: float64


#### After - 1 month

In [137]:
# Reload the AFTER comments (1 month window)
fds_aft_1mo = pd.read_csv('./data/fds_comments_after_1mo.csv')

# Keep only comments whose body is not a '[deleted]' / '[removed]' placeholder
fds_aft_1mo = fds_aft_1mo[~fds_aft_1mo['body'].isin(['[deleted]', '[removed]'])]

# Comment bodies as a flat list of strings
fds_aft_1mo_body = fds_aft_1mo['body'].tolist()

In [138]:
# Score every AFTER comment in batches of 100 and index each score row by its
# comment text. Batches are collected in a list and concatenated once, because
# calling pd.concat inside the loop re-copies all earlier rows on every pass.
aft_batch_frames = []
for start in range(0, len(fds_aft_1mo_body), 100):
    batch = fds_aft_1mo_body[start:start + 100]
    res = originalmodel.predict(batch)
    aft_batch_frames.append(pd.DataFrame(res, index=batch).round(5))

# pd.concat raises on an empty list, so fall back to an empty frame when there are no comments
fds_aft_1mo_body_res = pd.concat(aft_batch_frames) if aft_batch_frames else pd.DataFrame()

In [197]:
# Flag each AFTER comment whose score exceeds the per-category cut-off (1 = flagged, 0 = not)
aft_flag_cutoffs = {
    'toxicity': 0.5,
    'severe_toxicity': 0.01,
    'obscene': 0.5,
    'threat': 0.0035,
    'insult': 0.5,
    'identity_attack': 0.5,
}
for category, cutoff in aft_flag_cutoffs.items():
    fds_aft_1mo_body_res[category + '_flag'] = np.where(fds_aft_1mo_body_res[category] > cutoff, 1, 0)

# Column means: average score per category and share of flagged comments
print(fds_aft_1mo_body_res.mean())

toxicity                0.234422
severe_toxicity         0.010200
obscene                 0.141388
threat                  0.003919
insult                  0.080094
identity_attack         0.009663
toxicity_flag           0.220107
severe_toxicity_flag    0.129891
obscene_flag            0.143533
threat_flag             0.072314
insult_flag             0.061525
identity_attack_flag    0.002384
post                    1.000000
dtype: float64


#### Prelim regression - 1 mo before and after

In [140]:
# Tag every scored comment with its period: post = 0 for BEFORE rows, 1 for AFTER rows
fds_b4_1mo_body_res['post'] = 0
fds_aft_1mo_body_res['post'] = 1

# Stack both periods into one frame (AFTER rows first, then BEFORE rows)
fds = pd.concat([fds_aft_1mo_body_res, fds_b4_1mo_body_res])

In [124]:
import scipy.stats as sts

##### Score Difference

In [171]:
# Independent two-sample t-test on the 'toxicity' scores: AFTER vs. BEFORE
sts.ttest_ind(fds_aft_1mo_body_res['toxicity'], fds_b4_1mo_body_res['toxicity'])

Ttest_indResult(statistic=-3.7724303017310272, pvalue=0.00016189787809350352)

In [190]:
# Independent two-sample t-test on the 'severe_toxicity' scores: AFTER vs. BEFORE
sts.ttest_ind(fds_aft_1mo_body_res['severe_toxicity'], fds_b4_1mo_body_res['severe_toxicity'])

Ttest_indResult(statistic=-2.223479859960319, pvalue=0.026189020922059277)

In [176]:
# Independent two-sample t-test on the 'obscene' scores: AFTER vs. BEFORE
sts.ttest_ind(fds_aft_1mo_body_res['obscene'], fds_b4_1mo_body_res['obscene'])

Ttest_indResult(statistic=-1.1542896016259598, pvalue=0.24838835056994285)

In [198]:
# Independent two-sample t-test on the 'threat' scores: AFTER vs. BEFORE
sts.ttest_ind(fds_aft_1mo_body_res['threat'], fds_b4_1mo_body_res['threat'])

Ttest_indResult(statistic=1.4622746541176328, pvalue=0.1436738131925582)

In [178]:
# Independent two-sample t-test on the 'insult' scores: AFTER vs. BEFORE
sts.ttest_ind(fds_aft_1mo_body_res['insult'], fds_b4_1mo_body_res['insult'])

Ttest_indResult(statistic=-4.466846087339321, pvalue=7.959826618625523e-06)

In [179]:
# Independent two-sample t-test on the 'identity_attack' scores: AFTER vs. BEFORE
sts.ttest_ind(fds_aft_1mo_body_res['identity_attack'], fds_b4_1mo_body_res['identity_attack'])

Ttest_indResult(statistic=-5.756451450590391, pvalue=8.652672465096693e-09)

##### Perc Flag Difference

In [182]:
# Independent two-sample t-test on the 0/1 'toxicity_flag' column: AFTER vs. BEFORE
sts.ttest_ind(fds_aft_1mo_body_res['toxicity_flag'], fds_b4_1mo_body_res['toxicity_flag'])

Ttest_indResult(statistic=-3.193084948735773, pvalue=0.0014087041239313205)

In [191]:
# Independent two-sample t-test on the 0/1 'severe_toxicity_flag' column: AFTER vs. BEFORE
sts.ttest_ind(fds_aft_1mo_body_res['severe_toxicity_flag'], fds_b4_1mo_body_res['severe_toxicity_flag'])

Ttest_indResult(statistic=-2.065185341334117, pvalue=0.0389118910833169)

In [184]:
# Independent two-sample t-test on the 0/1 'obscene_flag' column: AFTER vs. BEFORE
sts.ttest_ind(fds_aft_1mo_body_res['obscene_flag'], fds_b4_1mo_body_res['obscene_flag'])

Ttest_indResult(statistic=-1.0064083867844758, pvalue=0.3142252454463769)

In [199]:
# Independent two-sample t-test on the 0/1 'threat_flag' column: AFTER vs. BEFORE
sts.ttest_ind(fds_aft_1mo_body_res['threat_flag'], fds_b4_1mo_body_res['threat_flag'])

Ttest_indResult(statistic=-1.997231029203074, pvalue=0.04580685006941343)

In [186]:
# Independent two-sample t-test on the 0/1 'insult_flag' column: AFTER vs. BEFORE
sts.ttest_ind(fds_aft_1mo_body_res['insult_flag'], fds_b4_1mo_body_res['insult_flag'])

Ttest_indResult(statistic=-3.9231298449459744, pvalue=8.75518843472234e-05)

In [187]:
# Independent two-sample t-test on the 0/1 'identity_attack_flag' column: AFTER vs. BEFORE
sts.ttest_ind(fds_aft_1mo_body_res['identity_attack_flag'], fds_b4_1mo_body_res['identity_attack_flag'])

Ttest_indResult(statistic=-2.5454954886911305, pvalue=0.010915994814353359)

### By Day

In [207]:
# Inspect the 'created_utc' column of the AFTER data (only the AFTER frame is shown here)
fds_aft_1mo['created_utc']

0        2019-11-06 14:23:47
1        2019-11-06 14:23:05
2        2019-11-06 14:21:31
3        2019-11-06 14:18:38
4        2019-11-06 14:18:36
                ...         
32368    2019-11-28 16:36:17
32369    2019-11-28 16:36:12
32370    2019-11-28 16:35:31
32371    2019-11-28 16:33:46
32372    2019-11-28 16:32:40
Name: created_utc, Length: 32373, dtype: object

In [174]:
# Show the row at position 2 of the BEFORE results
fds_b4_1mo_body_res.iloc[2]

toxicity                0.78609
severe_toxicity         0.01537
obscene                 0.71144
threat                  0.00280
insult                  0.47489
identity_attack         0.02185
toxicity_flag           1.00000
severe_toxicity_flag    0.00000
obscene_flag            1.00000
threat_flag             0.00000
insult_flag             0.00000
identity_attack_flag    0.00000
post                    0.00000
Name: That's exactly it sister. Its setting the tone, and you have to be real clear with men about that otherwise they dupe you fast, every womans learned that the hard way\r\n\r\nThey know full well why we make them do it, most women are looking to get taken seriously by a man and only sleep with him, however most men are looking to 'pump and dump' so this is just another way of making him put his money where his mouth is, making it more expensive, making him think twice\r\n\r\nPlus when I've been nothin but genuine to a guy and he turns out to be a rotten lying fuckboy that l