In [1]:
from detoxify import Detoxify

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd

In [4]:
import numpy as np

## Tutorial

In [11]:
# each model takes in either a string or a list of strings
results = Detoxify('original').predict('example text')

In [20]:
results = Detoxify('unbiased').predict(['example text 1','example text 2'])

In [14]:
results = Detoxify('multilingual').predict(['example text','exemple de texte','texto de ejemplo','testo di esempio','texto de exemplo','örnek metin','пример текста'])

In [13]:
# to specify the device the model will be allocated on (defaults to cpu), accepts any torch.device input
model = Detoxify('original', device='cuda')

In [3]:
import pandas as pd

In [4]:
results_df = pd.DataFrame(results).round(8)

In [15]:
results_df

Unnamed: 0,toxicity,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit
0,0.000196,0.000193,0.001263,0.000323,0.000883,0.000138,9e-05
1,0.000563,0.00481,0.030099,0.005532,0.026441,0.002134,0.001036
2,0.000628,0.003142,0.022164,0.003467,0.017627,0.001497,0.000616
3,0.000992,0.004267,0.033336,0.005408,0.028724,0.001951,0.000833
4,0.000612,0.001548,0.012159,0.001841,0.009405,0.000858,0.000362
5,0.000591,0.002832,0.021326,0.003134,0.017865,0.001327,0.000616
6,0.000535,0.003826,0.026462,0.00403,0.021902,0.001632,0.000747


In [19]:
#Get Toxicity Mean
results_df['toxicity'].mean()

0.0005879871428571428

In [20]:
#Get all column Mean
results_df.mean()

toxicity           0.000588
severe_toxicity    0.002945
obscene            0.020973
identity_attack    0.003391
insult             0.017550
threat             0.001362
sexual_explicit    0.000614
dtype: float64

## Analyze FDS

## ANALYZE Comments:
- Comments by AutoModerator
- Comments deleted (deleted by bots) and removed (users removed)

In [5]:
#Import BEFORE data
fds_b4_1mo =  pd.read_csv('./data/fds_comments_before_1mo.csv')


In [20]:
#Leave out all 'removed' and 'deleted' comments
fds_b4_1mo_df = fds_b4_1mo[(fds_b4_1mo['body'] != '[deleted]') & (fds_b4_1mo['body'] != '[removed]') & (fds_b4_1mo['author'] != 'AutoModerator')]

#Flatten to list of string
fds_b4_1mo_body = list(fds_b4_1mo_df['body'].values.flatten())

In [7]:
#Import AFTER data
fds_aft_1mo =  pd.read_csv('./data/fds_comments_after_1mo.csv')

In [21]:
#Leave out all 'removed' and 'deleted' comments
fds_aft_1mo_df = fds_aft_1mo[(fds_aft_1mo['body'] != '[deleted]') & (fds_aft_1mo['body'] != '[removed]') & (fds_aft_1mo['author'] !='AutoModerator')]

#Flatten to list of string
fds_aft_1mo_body = list(fds_aft_1mo_df['body'].values.flatten())

## ANALYZE Submission
Concerns:
- Structure of a submission: title, body (`self_text`), images/video link (`domain`)

In [35]:
#Import SUBMISSION BEFORE
fds_b4_1mo_sub = pd.read_csv('./data/fds_subm_before_1mo.csv')

In [13]:
#Number of no content submissions
fds_b4_1mo_sub['selftext'].isnull().sum()

1869

In [36]:
#Percentage of no content submissions
fds_b4_1mo_sub['selftext'].isnull().sum()/len(fds_b4_1mo_sub['selftext'] != '[removed]')*100

46.760070052539405

In [42]:
# Condition: domain = i.reddit.it OR imugur AND selftext = ""
len(fds_b4_1mo_sub[((fds_b4_1mo_sub['domain'] == 'i.redd.it') | (fds_b4_1mo_sub['domain'] == 'i.imgur.com') |  (fds_b4_1mo_sub['domain'] == 'imgur.com'))  & fds_b4_1mo_sub['selftext'].isnull()]) 

1136

In [41]:
# Condition: domain = i.reddit.it OR imugur. So basically they are the same.
len(fds_b4_1mo_sub[(fds_b4_1mo_sub['domain'] == 'i.redd.it') | (fds_b4_1mo_sub['domain'] == 'i.imgur.com') |  (fds_b4_1mo_sub['domain'] == 'imgur.com')]) 

1141

In [33]:
#Number of image submissions
len(fds_b4_1mo_sub[(fds_b4_1mo_sub['domain'] == 'self.FemaleDatingStrategy')]) 

2185

In [34]:
# Save only submissions has written content and not deleted or removed
fds_b4_1mo_sub_test = fds_b4_1mo_sub[(fds_b4_1mo_sub['selftext'].astype(bool)) | (fds_b4_1mo_sub['selftext'] == '[removed]')]

## Set up Model

In [22]:
#Set pre-trained model and run on GPU
originalmodel = Detoxify('unbiased', device='cuda')

### Example

In [23]:
#Run the first batch
res = originalmodel.predict(fds_b4_1mo_body[0:100])

In [24]:
#connect with body text
pd.DataFrame(res,fds_b4_1mo_body[0:100]).round(5)

Unnamed: 0,toxicity,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit
...It's more of a sarcastic nice-nasty way of saying what you did.,0.31566,0.00002,0.00252,0.00210,0.24342,0.00037,0.00075
"Nope. Men are on average more likely to lie to a woman, so therefore you have to do more to convince me you're not wasting my time. I am not a teenager anymore I am mid 20s",0.01017,0.00004,0.00024,0.00699,0.00209,0.00025,0.00104
"That's exactly it sister. Its setting the tone, and you have to be real clear with men about that otherwise they dupe you fast, every womans learned that the hard way\r\n\r\nThey know full well why we make them do it, most women are looking to get taken seriously by a man and only sleep with him, however most men are looking to 'pump and dump' so this is just another way of making him put his money where his mouth is, making it more expensive, making him think twice\r\n\r\nPlus when I've been nothin but genuine to a guy and he turns out to be a rotten lying fuckboy that led me on, I feel better I wasted £300 of his money that weekend ;)",0.65407,0.00356,0.43520,0.04374,0.41763,0.00119,0.04932
"About two-three months. I waited for a few signs. \r\n\r\n- In the beginning I set expectations clear that I wasn’t dating to mess around, I was dating to find marriage. He proceeded in dating me after he knew this. \r\n\r\n- Asked me to be his girlfriend. I said no, but I said it with the caveat that I wasn’t interested in having an LTR; I wanted a husband, remember? He still proceeded to date me. \r\n\r\n- Paid for a WONDERFUL trip for my birthday. Absolutely stunning: he took notes and clearly knew what I liked, what I was into, etc. Planned the whole thing. \r\n\r\nI slept with him after that. Even then I didn’t do a whole lot of pleasing until closer to our engagement. Being a pillow princess is fun.",0.00161,0.00000,0.00023,0.00019,0.00042,0.00004,0.00024
"no he was a white guy, but I get this comment from a few rare insecure white men, cos I'm curvier, as in curvier not delusional and overweight (important to point out nowadays lol). I think they get some complex about it , if black men look at me in public and I also think they're wondering what size I've had or some BS...but anyway it really came off as lowkey racist, which is an instant turn off for me. I know what those undertone comments allude to. Tbh he made quite a few OFF key comments during that date",0.21046,0.00039,0.00276,0.24676,0.02893,0.00116,0.00087
...,...,...,...,...,...,...,...
Thank you!,0.00047,0.00000,0.00003,0.00007,0.00012,0.00002,0.00001
You can't negotiate attraction,0.00063,0.00000,0.00002,0.00014,0.00013,0.00003,0.00002
"\r\n\r\nI said this to another poster and I think it applies to your question: \r\n\r\nI can see your concern- I love transparency and sharing my feelings; you can’t get what you want if you’re too afraid to speak up for it. All that said- I think of this advice when I’ve said my peace, set my expectations and then these dummies are STILL trying to clown me. I’m not in the business of arguing with men who want to be obtuse.",0.45064,0.00004,0.00325,0.00194,0.45550,0.00015,0.00049
"Honestly, this poor woman has no real sense of herself or what it means to be independent and autonomous as a single woman. I really think she jumped from the frying pan into the fire (another relationship) without giving herself time to understand the dynamics of the previous relationship, as in she can't really be in a healthy relationship without working on herself first, and giving herself time to regroup and heal. Not to mention she's really young, and so is the boyfriend. I think they both need to be enjoying a single life but she especially needs to do some self-examination in therapy with an objective person.",0.00361,0.00001,0.00010,0.00185,0.00094,0.00014,0.00038


### Level of toxicity by 1 month before and after introduction of bot

#### Before 1 month

In [25]:
i = 0
n = len(fds_b4_1mo_body)
fds_b4_1mo_body_res = pd.DataFrame()

In [26]:
while i < n:
    res = originalmodel.predict(fds_b4_1mo_body[i:i+100])
    f = pd.DataFrame(res,fds_b4_1mo_body[i:i+100]).round(5)
    fds_b4_1mo_body_res = pd.concat([fds_b4_1mo_body_res,f])
    i = i + 100

In [27]:
#Add flag to each comments
fds_b4_1mo_body_res['toxicity_flag'] = np.where(fds_b4_1mo_body_res['toxicity']>0.5,1,0)
fds_b4_1mo_body_res['severe_toxicity_flag'] = np.where(fds_b4_1mo_body_res['severe_toxicity']>0.01,1,0)
fds_b4_1mo_body_res['obscene_flag'] = np.where(fds_b4_1mo_body_res['obscene']>0.5,1,0)
fds_b4_1mo_body_res['threat_flag'] = np.where(fds_b4_1mo_body_res['threat']>0.0035,1,0)
fds_b4_1mo_body_res['insult_flag'] = np.where(fds_b4_1mo_body_res['insult']>0.5,1,0)
fds_b4_1mo_body_res['identity_attack_flag'] = np.where(fds_b4_1mo_body_res['identity_attack']>0.5,1,0)
print(fds_b4_1mo_body_res.mean())

toxicity                0.239328
severe_toxicity         0.005412
obscene                 0.133117
identity_attack         0.016930
insult                  0.118515
threat                  0.003724
sexual_explicit         0.078196
toxicity_flag           0.234986
severe_toxicity_flag    0.089187
obscene_flag            0.141755
threat_flag             0.073236
insult_flag             0.101393
identity_attack_flag    0.005017
dtype: float64


#### After - 1 month

In [137]:
#Import AFTER data
fds_aft_1mo =  pd.read_csv('./data/fds_comments_after_1mo.csv')

#Leave out all 'removed' and 'deleted' comments
fds_aft_1mo = fds_aft_1mo[(fds_aft_1mo['body'] != '[deleted]') & (fds_aft_1mo['body'] != '[removed]')]

#Flatten to list of string
fds_aft_1mo_body = list(fds_aft_1mo['body'].values.flatten())

In [138]:
i = 0
n = len(fds_aft_1mo_body)
fds_aft_1mo_body_res = pd.DataFrame()
while i < n:
    res = originalmodel.predict(fds_aft_1mo_body[i:i+100])
    f = pd.DataFrame(res,fds_aft_1mo_body[i:i+100]).round(5)
    fds_aft_1mo_body_res = pd.concat([fds_aft_1mo_body_res,f])
    i = i + 100

In [197]:
#Add flag to each comments
fds_aft_1mo_body_res['toxicity_flag'] = np.where(fds_aft_1mo_body_res['toxicity']>0.5,1,0)
fds_aft_1mo_body_res['severe_toxicity_flag'] = np.where(fds_aft_1mo_body_res['severe_toxicity']>0.01,1,0)
fds_aft_1mo_body_res['obscene_flag'] = np.where(fds_aft_1mo_body_res['obscene']>0.5,1,0)
fds_aft_1mo_body_res['threat_flag'] = np.where(fds_aft_1mo_body_res['threat']>0.0035,1,0)
fds_aft_1mo_body_res['insult_flag'] = np.where(fds_aft_1mo_body_res['insult']>0.5,1,0)
fds_aft_1mo_body_res['identity_attack_flag'] = np.where(fds_aft_1mo_body_res['identity_attack']>0.5,1,0)
print(fds_aft_1mo_body_res.mean())

toxicity                0.234422
severe_toxicity         0.010200
obscene                 0.141388
threat                  0.003919
insult                  0.080094
identity_attack         0.009663
toxicity_flag           0.220107
severe_toxicity_flag    0.129891
obscene_flag            0.143533
threat_flag             0.072314
insult_flag             0.061525
identity_attack_flag    0.002384
post                    1.000000
dtype: float64


#### Prelim regression - 1 mo before and after

In [140]:
fds_aft_1mo_body_res['post'] = 1
fds_b4_1mo_body_res['post'] = 0
fds = pd.concat([fds_aft_1mo_body_res, fds_b4_1mo_body_res])

In [124]:
import scipy.stats as sts

##### Score Difference

In [171]:
sts.ttest_ind(fds_aft_1mo_body_res['toxicity'], fds_b4_1mo_body_res['toxicity'])

Ttest_indResult(statistic=-3.7724303017310272, pvalue=0.00016189787809350352)

In [190]:
sts.ttest_ind(fds_aft_1mo_body_res['severe_toxicity'], fds_b4_1mo_body_res['severe_toxicity'])

Ttest_indResult(statistic=-2.223479859960319, pvalue=0.026189020922059277)

In [176]:
sts.ttest_ind(fds_aft_1mo_body_res['obscene'], fds_b4_1mo_body_res['obscene'])

Ttest_indResult(statistic=-1.1542896016259598, pvalue=0.24838835056994285)

In [198]:
sts.ttest_ind(fds_aft_1mo_body_res['threat'], fds_b4_1mo_body_res['threat'])

Ttest_indResult(statistic=1.4622746541176328, pvalue=0.1436738131925582)

In [178]:
sts.ttest_ind(fds_aft_1mo_body_res['insult'], fds_b4_1mo_body_res['insult'])

Ttest_indResult(statistic=-4.466846087339321, pvalue=7.959826618625523e-06)

In [179]:
sts.ttest_ind(fds_aft_1mo_body_res['identity_attack'], fds_b4_1mo_body_res['identity_attack'])

Ttest_indResult(statistic=-5.756451450590391, pvalue=8.652672465096693e-09)

##### Perc Flag Difference

In [182]:
sts.ttest_ind(fds_aft_1mo_body_res['toxicity_flag'], fds_b4_1mo_body_res['toxicity_flag'])

Ttest_indResult(statistic=-3.193084948735773, pvalue=0.0014087041239313205)

In [191]:
sts.ttest_ind(fds_aft_1mo_body_res['severe_toxicity_flag'], fds_b4_1mo_body_res['severe_toxicity_flag'])

Ttest_indResult(statistic=-2.065185341334117, pvalue=0.0389118910833169)

In [184]:
sts.ttest_ind(fds_aft_1mo_body_res['obscene_flag'], fds_b4_1mo_body_res['obscene_flag'])

Ttest_indResult(statistic=-1.0064083867844758, pvalue=0.3142252454463769)

In [199]:
sts.ttest_ind(fds_aft_1mo_body_res['threat_flag'], fds_b4_1mo_body_res['threat_flag'])

Ttest_indResult(statistic=-1.997231029203074, pvalue=0.04580685006941343)

In [186]:
sts.ttest_ind(fds_aft_1mo_body_res['insult_flag'], fds_b4_1mo_body_res['insult_flag'])

Ttest_indResult(statistic=-3.9231298449459744, pvalue=8.75518843472234e-05)

In [187]:
sts.ttest_ind(fds_aft_1mo_body_res['identity_attack_flag'], fds_b4_1mo_body_res['identity_attack_flag'])

Ttest_indResult(statistic=-2.5454954886911305, pvalue=0.010915994814353359)

### By Day

In [223]:
####Extract date from before and after files
fds_aft_1mo_df['Date'] = pd.to_datetime(fds_aft_1mo_df['created_utc'], format='%Y-%m-%d %H:%M:%S')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fds_aft_1mo_df['Date'] = pd.to_datetime(fds_aft_1mo_df['created_utc'], format='%Y-%m-%d %H:%M:%S')


In [234]:
fds_nov = pd.concat([fds_aft_1mo_body_res.reset_index(),fds_aft_1mo_df['Date'].reset_index()], axis = 1)

Unnamed: 0_level_0,Unnamed: 0,score
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-10-28,49512.062914,1.67053
2019-10-29,48592.601346,1.275862
2019-10-30,42138.934242,1.034483
2019-10-31,40132.600719,1.02518
2019-11-01,26584.033586,1.0
2019-11-02,29955.290667,2.314667
2019-11-03,38198.418579,3.325683
2019-11-04,34255.59883,2.051462
2019-11-05,40194.826509,1.029095
2019-11-06,36807.615523,1.079422


In [174]:
fds_b4_1mo_body_res.iloc[2]

toxicity                0.78609
severe_toxicity         0.01537
obscene                 0.71144
threat                  0.00280
insult                  0.47489
identity_attack         0.02185
toxicity_flag           1.00000
severe_toxicity_flag    0.00000
obscene_flag            1.00000
threat_flag             0.00000
insult_flag             0.00000
identity_attack_flag    0.00000
post                    0.00000
Name: That's exactly it sister. Its setting the tone, and you have to be real clear with men about that otherwise they dupe you fast, every womans learned that the hard way\r\n\r\nThey know full well why we make them do it, most women are looking to get taken seriously by a man and only sleep with him, however most men are looking to 'pump and dump' so this is just another way of making him put his money where his mouth is, making it more expensive, making him think twice\r\n\r\nPlus when I've been nothin but genuine to a guy and he turns out to be a rotten lying fuckboy that l