## My method

In [None]:
# Mount Google Drive inside the Colab runtime so the folders below are reachable.
from google.colab import drive
drive.mount('/content/drive/')

# Drive folders for the datasets and for saved model checkpoints.
# model_save_dir is used further down to locate the fine-tuned checkpoint.
dataset_dir = '/content/drive/My Drive/Incivility/Perspective/Perspective_BERT/datasets/'
model_save_dir = '/content/drive/My Drive/Incivility/Perspective/Perspective_BERT/model/'

In [None]:
!pip install transformers==2.6.0

In [None]:
import numpy as np
import torch
import transformers
from tqdm import tqdm, trange
from nltk import word_tokenize, sent_tokenize

from torch.utils import data
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, WeightedRandomSampler
from transformers import BertTokenizer, BertConfig
from torch.utils.tensorboard import SummaryWriter

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from transformers import BertForTokenClassification, AdamW, BertModel, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup

from sklearn.metrics import f1_score, accuracy_score, classification_report

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, StratifiedKFold # if needed later
from torch.nn import CrossEntropyLoss, MSELoss 

import csv

import pandas
import random

In [None]:
# Function to tokenize sentences for BERT
def tokenize_and_preserve(sentence, tokenizer):
    """Tokenize a sentence word by word into a flat list of subword tokens.

    Args:
        sentence: Either an iterable of word strings or a single string.
            A single string is split on whitespace first; without that,
            iterating over it would feed the tokenizer one *character* at a
            time, which is never what a word-level tokenizer call wants.
        tokenizer: Any object exposing ``tokenize(word)`` that returns a list
            of subword tokens (e.g. a BERT WordPiece tokenizer).

    Returns:
        list: The subword tokens of every word, concatenated in input order.
        Empty input yields an empty list.
    """
    if isinstance(sentence, str):
        sentence = sentence.split()

    tokenized_sentence = []
    for word in sentence:
        # A word may be broken into several subwords; keep them all, in order.
        tokenized_sentence.extend(tokenizer.tokenize(word))

    return tokenized_sentence

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
class CivilityRegressor(torch.nn.Module):
  """Thin wrapper around a two-label ``BertForSequenceClassification``.

  Despite the name, the wrapped model is configured as a 2-label classifier
  on top of the *cased* ``bert-base-cased`` checkpoint. Every parameter whose
  name does not contain ``'classifier'`` is frozen, so only the classification
  head stays trainable; the names of the trainable parameters are printed
  while the module is being constructed.
  """

  def __init__(self):
    super().__init__()

    self.bert = BertForSequenceClassification.from_pretrained(
        "bert-base-cased",             # cased vocabulary checkpoint
        num_labels=2,                  # two output labels
        output_attentions=False,       # do not return attention weights
        output_hidden_states=False,    # do not return all hidden states
    )

    # Freeze the encoder; leave (and report) only the classifier head.
    for param_name, parameter in self.bert.named_parameters():
        if 'classifier' in param_name:
            print(param_name)
            continue
        parameter.requires_grad = False

  def forward(self, input_ids, attention_mask, labels):
    """Run the wrapped model and return its ``(loss, logits)`` pair.

    ``labels`` is always forwarded to the wrapped model; the result is
    unpacked into exactly two values, so the wrapped call must return a
    2-tuple.
    """
    outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      labels=labels,
    )
    loss, logits = outputs
    return loss, logits


# Build the wrapper and show the classifier weights as they are right after construction.
model = CivilityRegressor()
print(model.bert.classifier.weight)
# Move the model to the GPU (trailing ';' suppresses the module repr in the cell output).
model.cuda();
# Load the saved state dict from <model_save_dir>/Classification_first/model.pt and apply it.
checkpoint = torch.load('{}model.pt'.format(model_save_dir+'Classification_first/'))
model.load_state_dict(checkpoint)
# Print again so the weights before and after loading the checkpoint can be compared.
print(model.bert.classifier.weight)

In [None]:
print(model.bert.classifier.weight)

In [None]:
# Score every comment of the input TSV with the loaded classifier and append
# the raw logit of label 1 to each row.
model.eval()
input_file_1 = '/content/drive/My Drive/Backup/Research/Incivility/Perspective/Toxicity_Error_Analysis_Train_Set_1_100k.tsv'

# NOTE(review): the input is "Train_Set_1_100k" but the output is named
# "Train_Set_3_80k" -- confirm the file name is intended.
output_file_1 = 'Toxicity_Error_Analysis_BERT_Train_Set_3_80k.tsv'

# newline='' lets the csv module control line endings itself.
with open(input_file_1) as f, open(output_file_1, 'w', newline='') as w:
    header = f.readline()  # skip the header row; it is not copied to the output
    reader = csv.reader(f, delimiter="\t")
    # The output file is a .tsv, so write it tab-delimited like the input.
    writer = csv.writer(w, delimiter="\t")
    for i, row in enumerate(reader):

        if i%100 == 0:
            print(i)
        # Drop empty fields and keep at most the first five columns.
        new_row = list(filter(('').__ne__, row))[:5]
        if len(new_row) != 5:
            print(new_row[0] if new_row else row)
        if len(new_row) < 2:
            # No text column to score; skip instead of raising IndexError.
            continue
        text = new_row[1]

        # Tokenize word by word. Passing the raw string would make the helper
        # iterate over single characters, so split into words first.
        tokenized_text = tokenize_and_preserve(text.split(), tokenizer)
        # Get IDs for inputs and pad/truncate to 256 positions (0 = padding)
        input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(tokenized_text)],
            maxlen=256, dtype="long", value=0.0,
            truncating="post", padding="post")

        # Attention mask: 1.0 for real tokens, 0.0 for padding
        attention_masks = np.array([float(token_id != 0.0) for token_id in input_ids[0]])

        with torch.no_grad():
            # Forward pass. The wrapper requires labels, so a dummy label is
            # supplied; the returned loss is ignored.
            loss, logits = model(torch.tensor(input_ids[0]).view(1, -1).cuda(), attention_mask=torch.tensor(attention_masks).view(1, -1).cuda(), labels=torch.ones((1, 1), dtype=torch.long).cuda())

        # Raw logit of label index 1, moved to the CPU
        scores = logits[:, 1].detach().cpu().numpy()
        new_row.append(scores.item())

        writer.writerow(new_row)

## Other method

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 8.3MB/s 
[?25hCollecting boto3
[?25l  Downloading https://files.pythonhosted.org/packages/f1/99/43e5571005c792284276986eabd956699fac65d283df409b1482ca8722d8/boto3-1.17.67-py2.py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 10.4MB/s 
Collecting jmespath<1.0.0,>=0.7.1
  Downloading https://files.pythonhosted.org/packages/07/cb/5f001272b6faeb23c1c9e0acc04d48eaaf5c862c17709d20e3469c6e0139/jmespath-0.10.0-py2.py3-none-any.whl
Collecting botocore<1.21.0,>=1.20.67
[?25l  Downloading https://files.pythonhosted.org/packages/82/2b/6a23d63e1b9593919fbe622596fe92e02e3abcec7d2f91594443ee6e4ef9/botocore-1.20.67-py2.py3-none-any.whl (7.5MB)
[K     |████████████████████████████████| 7.5MB 8.2MB/s 
[?25hCollect

In [3]:
from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification,BertAdam

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

100%|██████████| 231508/231508 [00:00<00:00, 881181.95B/s]


In [5]:
# Converting the lines to BERT format
# Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming
from tqdm import tqdm, tqdm_notebook
import numpy as np
def convert_lines(example, max_seq_length,tokenizer):
    """Turn texts into fixed-length rows of token ids.

    Each text is tokenized, cut to ``max_seq_length - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]``, converted to ids and right-padded with 0 so that
    every row has exactly ``max_seq_length`` entries.

    Args:
        example: Iterable of text strings.
        max_seq_length: Total row length, including the two special tokens.
        tokenizer: Object with ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        ``np.ndarray`` built from one id row per input text.
    """
    body_len = max_seq_length - 2  # room left once [CLS] and [SEP] are counted
    rows = []
    for text in example:
        pieces = tokenizer.tokenize(text)[:body_len]
        ids = tokenizer.convert_tokens_to_ids(["[CLS]", *pieces, "[SEP]"])
        padding = [0] * (body_len - len(pieces))
        rows.append(ids + padding)
    return np.array(rows)

In [6]:
import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
import csv
# train_dataset = torch.utils.data.TensorDataset(torch.tensor(X,dtype=torch.long), torch.tensor(y,dtype=torch.float))

In [7]:
#Re-load model from file
# Use the GPU when one is attached, otherwise fall back to the CPU so the
# notebook still runs on a CPU-only runtime.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# path = "/content/gdrive/My Drive/Incivility/Perspective/Perspective_BERT/bert_pytorch_0.5_larger.bin"
path = "/content/gdrive/My Drive/Incivility/Perspective/Perspective_BERT/Final/Data/bert_pytorch_0.5_thresh.bin"
# path = "/content/gdrive/My Drive/Incivility/Perspective/Perspective_BERT/Retrained/bert_pytorch_0.5_retrained.bin"
# One output unit per target column (a single 'target' score here).
y_columns=['target']
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=len(y_columns))
# map_location remaps tensors saved from a GPU onto the device chosen above;
# without it, loading a CUDA checkpoint on a CPU-only machine fails.
model.load_state_dict(torch.load(path, map_location=device))
model.to(device)
# Fixed row length (including [CLS]/[SEP]) used when converting text to ids.
MAX_SEQUENCE_LENGTH = 128

100%|██████████| 407873900/407873900 [00:10<00:00, 39354259.42B/s]


In [8]:
# Inference only: turn off gradient tracking for every parameter ...
for param in model.parameters():
    param.requires_grad=False
# ... and switch the model to evaluation mode (this also displays the module repr as the cell output).
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [None]:
# Overpredictions and underpredictions

def make_clean_dataset(dataset):
    """Split records whose comment field swallowed several TSV rows.

    A record is a list ``[id, comment, *remaining]``. When ``comment``
    contains more than three tab characters it is treated as several physical
    rows that were merged into one field: it is split on newlines, each piece
    is split on tabs, the original id is put in front of the first piece and
    the original trailing fields are appended to the last piece. Records with
    three or fewer tabs in the comment are passed through unchanged (the same
    list object is reused).

    If such a comment has no newline at all, the single resulting row gets
    both the id and the trailing fields, instead of being emitted twice.

    Args:
        dataset: Sequence of records (lists of strings) with at least two
            fields each.

    Returns:
        list: The cleaned records, in input order.

    Side effects:
        Prints the record index every 100000 records, and finally the running
        row count next to ``len`` of the result.
    """
    count = 0
    cleaned_dataset = []
    for idx, line in enumerate(dataset):

        if idx % 100000 == 0:
            print(idx)

        comment, remaining = line[1], line[2:]

        if comment.count('\t') <= 3:
            # Ordinary record: keep as is.
            count += 1
            cleaned_dataset.append(line)
            continue

        # Merged record: one output row per physical line of the comment.
        rows = [physical_line.split('\t') for physical_line in comment.split('\n')]
        rows[0].insert(0, line[0])    # the original id belongs to the first row
        rows[-1].extend(remaining)    # the trailing fields belong to the last row
        cleaned_dataset.extend(rows)
        count += len(rows)

    print(count, len(cleaned_dataset))
    return cleaned_dataset

In [None]:
input_file_1 = '/content/gdrive/My Drive/Backup/Research/Incivility/Perspective/Toxicity_Error_Analysis_Train_Set_1_100k.tsv'
output_file_1 = '/content/gdrive/My Drive/Backup/Research/Incivility/Perspective/Toxicity_Error_Analysis_BERT_Train_Set_1_Final_Retrained.tsv'

# Read the TSV, repair merged rows, then append the model score and the
# signed error (label - score) to every row of the output TSV.
dataset = []
# newline='' lets the csv module control line endings itself.
with open(input_file_1) as f, open(output_file_1, 'w', newline='') as w:
    header = f.readline()  # skip the header row; it is not copied to the output
    reader = csv.reader(f, delimiter="\t")
    writer = csv.writer(w, delimiter="\t")
    for i, row in enumerate(reader):

        if i%10000 == 0:
            print(i)

        # Drop empty fields and keep at most the first five columns.
        new_row = list(filter(('').__ne__, row))[:5]
        dataset.append(new_row)

    cleaned_dataset = make_clean_dataset(dataset)

    # Pure inference: make sure no autograd state is recorded.
    with torch.no_grad():
        for i, row in enumerate(cleaned_dataset):
            if i%10000 == 0:
                print(i)

            text = row[1]
            label = float(row[2])

            # Named token_ids rather than `input`, which would rebind the
            # builtin input() for the rest of the notebook session.
            token_ids = torch.tensor(convert_lines([text], MAX_SEQUENCE_LENGTH, tokenizer), dtype=torch.long)

            # Padding positions have id 0, so (token_ids > 0) is the attention mask.
            pred = model(token_ids.to(device), attention_mask=(token_ids>0).to(device), labels=None)
            score = torch.sigmoid(pred).item()

            error = label - score

            row.append(score)
            row.append(error)
            writer.writerow(row)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
0
100000
101596 101596
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000


In [None]:
input_file_2 = '/content/gdrive/My Drive/Backup/Research/Incivility/Perspective/Toxicity_Error_Analysis_Train_Set_2_100k.tsv'
output_file_2 = '/content/gdrive/My Drive/Backup/Research/Incivility/Perspective/Toxicity_Error_Analysis_BERT_Train_Set_2_Final_Retrained.tsv'

# Read the TSV, repair merged rows, then append the model score and the
# signed error (label - score) to every row of the output TSV.
dataset = []
# newline='' lets the csv module control line endings itself.
with open(input_file_2) as f, open(output_file_2, 'w', newline='') as w:
    header = f.readline()  # skip the header row; it is not copied to the output
    reader = csv.reader(f, delimiter="\t")
    writer = csv.writer(w, delimiter="\t")
    for i, row in enumerate(reader):

        if i%10000 == 0:
            print(i)

        # Drop empty fields and keep at most the first five columns.
        new_row = list(filter(('').__ne__, row))[:5]
        dataset.append(new_row)

    cleaned_dataset = make_clean_dataset(dataset)

    # Pure inference: make sure no autograd state is recorded.
    with torch.no_grad():
        for i, row in enumerate(cleaned_dataset):
            if i%10000 == 0:
                print(i)

            text = row[1]
            label = float(row[2])

            # Named token_ids rather than `input`, which would rebind the
            # builtin input() for the rest of the notebook session.
            token_ids = torch.tensor(convert_lines([text], MAX_SEQUENCE_LENGTH, tokenizer), dtype=torch.long)

            # Padding positions have id 0, so (token_ids > 0) is the attention mask.
            pred = model(token_ids.to(device), attention_mask=(token_ids>0).to(device), labels=None)
            score = torch.sigmoid(pred).item()

            error = label - score

            row.append(score)
            row.append(error)
            writer.writerow(row)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
0
95510 95510
0
10000
20000
30000
40000
50000
60000
70000
80000
90000


In [None]:
input_file_3 = '/content/gdrive/My Drive/Backup/Research/Incivility/Perspective/Toxicity_Error_Analysis_Train_Set_3_100k.tsv'
output_file_3 = '/content/gdrive/My Drive/Backup/Research/Incivility/Perspective/Toxicity_Error_Analysis_BERT_Train_Set_3_Final_Retrained.tsv'

# Read the TSV, repair merged rows, then append the model score and the
# signed error (label - score) to every row of the output TSV.
dataset = []
# newline='' lets the csv module control line endings itself.
with open(input_file_3) as f, open(output_file_3, 'w', newline='') as w:
    header = f.readline()  # skip the header row; it is not copied to the output
    reader = csv.reader(f, delimiter="\t")
    writer = csv.writer(w, delimiter="\t")
    for i, row in enumerate(reader):

        if i%10000 == 0:
            print(i)

        # Drop empty fields and keep at most the first five columns.
        new_row = list(filter(('').__ne__, row))[:5]
        dataset.append(new_row)

    cleaned_dataset = make_clean_dataset(dataset)

    # Pure inference: make sure no autograd state is recorded.
    with torch.no_grad():
        for i, row in enumerate(cleaned_dataset):
            if i%10000 == 0:
                print(i)

            text = row[1]
            label = float(row[2])

            # Named token_ids rather than `input`, which would rebind the
            # builtin input() for the rest of the notebook session.
            token_ids = torch.tensor(convert_lines([text], MAX_SEQUENCE_LENGTH, tokenizer), dtype=torch.long)

            # Padding positions have id 0, so (token_ids > 0) is the attention mask.
            pred = model(token_ids.to(device), attention_mask=(token_ids>0).to(device), labels=None)
            score = torch.sigmoid(pred).item()

            error = label - score

            row.append(score)
            row.append(error)
            writer.writerow(row)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
0
100000
103700 103700
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000


## Template Analysis on BERT error triggers

In [9]:
def get_score(text):
    """Return the model's sigmoid-squashed score for one text.

    The text is converted to a single row of ``MAX_SEQUENCE_LENGTH`` token ids
    with ``convert_lines``; positions with id > 0 form the attention mask. The
    model output is passed through a sigmoid and returned as a Python float.

    Relies on the notebook-level ``model``, ``tokenizer``, ``device`` and
    ``MAX_SEQUENCE_LENGTH``. Note that a later cell in this notebook defines
    another function with the same name.
    """
    token_ids = torch.tensor(
        convert_lines([text], MAX_SEQUENCE_LENGTH, tokenizer),
        dtype=torch.long,
    )
    mask = token_ids > 0
    logit = model(token_ids.to(device), attention_mask=mask.to(device), labels=None)
    return torch.sigmoid(logit).item()

In [10]:
import csv
# SCORES MIGHT SLIGHTLY DIFFER WHEN RUNNING AFTER FREEZE. SAME WITH ABOVE SCORES AND REGRESSION ON 300k. Ignore the "_Retrained" scores, "_fixed"
# templates = ['I am talking about _____', 'She did not say _____', 'We support _____']
templates = ['We wrote _____ on the page.','They whispered _____ to themselves. ','I erased the word _____ from the page.','You didn’t write _____.','Did he say _____ ?']

# trigger_words = []
# with open('/content/gdrive/My Drive/Incivility/Perspective/Perspective_Error_Analysis/error_trigger_densities/all_errors/Regression_Words_BERT.csv') as f:
#     header = f.readline()
#     reader = csv.reader(f)
#     trigger_words = [line[0] for line in reader]

# overprediction = trigger_words[-500:]

# overprediction = ['woman', 'black', 'feminist', 'asian', 'blacks', 'males', 'migrants', 'female', 'christian', 'gays', 'homosexual', 'babies', 'african', 'dictator', 'mexicans', 'immigrants', 'girl', 'catholics', 'jewish', 'women', 'muslim', 'islamic', 'gay', 'muslims', 'jews']

# overprediction = ['dictators', 'refugee', 'feminist', 'communist', 'catholic', 'migrants', 'homosexual', 'babies', 'mexicans', 'girl', 'islamic', 'woman', 'female', 'african', 'muslims', 'muslim', 'jews', 'blacks', 'black', 'jewish', 'gay', 'gays', 'women', 'dictator', 'catholics', 'police']

overprediction = ['americans', 'baby', 'barack', 'boy', 'british', 'capitalist', 'catholic', 'child', 'chinese', 'communist', 'cop', 'cops', 'democrat', 'democrats', 'dictators', 'english', 'europeans', 'german', 'guy', 'hillary', 'irish', 'joseph', 'leftist', 'leftists', 'liberal', 'liberals', 'male', 'man', 'mexican', 'mom', 'mother', 'obama', 'police', 'potus', 'priests', 'progressives', 'putin', 'refugee', 'republicans', 'russians', 'tribal', 'trudeau']

# Score every trigger word inside every template and write one CSV row per
# word: the word followed by one score per template (same order as `templates`).
# The columns are derived from `templates`, so the list can grow or shrink
# without touching this loop. utf-8 is set explicitly because the templates
# contain a non-ASCII apostrophe; newline='' is what the csv module expects.
with open('/content/gdrive/My Drive/Incivility/Perspective/Perspective_Error_Analysis/error_trigger_densities/all_errors/Templates_BERT_persons_leftover.csv', 'w', newline='', encoding='utf-8') as w:
    writer = csv.writer(w)
    writer.writerow(['error trigger'] + list(templates))
    for i, word in enumerate(overprediction):
        # '_____' is the slot each template reserves for the word.
        row = [word] + [get_score(template.replace('_____', word)) for template in templates]
        writer.writerow(row)
        print(i+1, word)

1 americans
2 baby
3 barack
4 boy
5 british
6 capitalist
7 catholic
8 child
9 chinese
10 communist
11 cop
12 cops
13 democrat
14 democrats
15 dictators
16 english
17 europeans
18 german
19 guy
20 hillary
21 irish
22 joseph
23 leftist
24 leftists
25 liberal
26 liberals
27 male
28 man
29 mexican
30 mom
31 mother
32 obama
33 police
34 potus
35 priests
36 progressives
37 putin
38 refugee
39 republicans
40 russians
41 tribal
42 trudeau


## News Shows data

In [None]:
import csv

# Load every data row of the snippet CSV into memory. The header read here is
# only consumed to skip the first line; `header` is reassigned below.
with open('/content/gdrive/My Drive/Incivility/Perspective/Perspective_BERT/snippet_data.csv') as f:
    header = f.readline().split(',')
    reader = csv.reader(f)
    data = [line for line in reader]

# Write the rows back out with the score from get_score(text) appended as the last column.
with open('/content/gdrive/My Drive/Incivility/Perspective/Perspective_BERT/snippet_data_BERT.csv', 'w') as w:
    writer = csv.writer(w)
    # Fixed output header; presumably the input has the first four of these columns -- TODO confirm.
    header = ['Clip', 'Text', 'Human score', 'Perspective score', 'BERT-toxicity']
    writer.writerow(header)
    
    for i, line in enumerate(data):
        # Column 1 is the text that gets scored.
        score = get_score(line[1])
        # Appends in place, so the rows held in `data` are extended as well.
        line.append(score) 
        writer.writerow(line)
        print(i+1)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219


## Statistical significance for news data

In [None]:
'''
Script to compute statistical significance tests on the snippet level data (human and Perspective rated)
'''
import csv
from scipy.stats import ttest_ind, mannwhitneyu
import numpy as np
import itertools 

# Per-show score rows; each row is [column 2, column 3, column 4] of the CSV,
# reported below as human text / perspective / bert scores.
scores = {'FOX': [], 'MSNBC': [], 'PBS': []}

with open('/content/gdrive/My Drive/Incivility/Perspective/Perspective_BERT/snippet_data_BERT.csv') as f:
    header = f.readline()  # skip the header row
    reader = csv.reader(f)
    for row in reader:
        # Kept under its own name so the rows being iterated are never rebound.
        row_scores = np.array([float(row[2]), float(row[3]), float(row[4])])

        # A row is assigned to every show whose name occurs in column 0.
        for show in scores:
            if show in row[0]:
                scores[show].append(row_scores)

# Stack the rows of each show into one 2-D array (rows x 3 metrics).
for show in scores:
    scores[show] = np.array(scores[show])

pair_list = list(itertools.combinations(list(scores.keys()), 2))

# (printed label, column index in the per-show arrays)
metrics = [('human text scores', 0), ('perspective scores', 1), ('bert scores', 2)]

for show1, show2 in pair_list:
    print(show1, show2)

    for label, column in metrics:
        print('xxxxxxx\n')
        print(label)

        # NOTE(review): `alternative` is left at SciPy's default, which
        # differs between SciPy versions -- confirm which one the recorded
        # p-values correspond to before comparing them across runs.
        _, p = mannwhitneyu(scores[show1][:, column], scores[show2][:, column])
        print('Mann Whitney U-test pvalue: '+str(p))

    print('-----------------------------\n')

FOX MSNBC
xxxxxxx

human text scores
Mann Whitney U-test pvalue: 2.7249663862860052e-11
xxxxxxx

perspective scores
Mann Whitney U-test pvalue: 2.818086444959442e-10
xxxxxxx

bert scores
Mann Whitney U-test pvalue: 6.421075694326938e-09
-----------------------------

FOX PBS
xxxxxxx

human text scores
Mann Whitney U-test pvalue: 1.9903528258122023e-22
xxxxxxx

perspective scores
Mann Whitney U-test pvalue: 4.5386855861941757e-10
xxxxxxx

bert scores
Mann Whitney U-test pvalue: 8.935122696519494e-15
-----------------------------

MSNBC PBS
xxxxxxx

human text scores
Mann Whitney U-test pvalue: 0.00040087598191538166
xxxxxxx

perspective scores
Mann Whitney U-test pvalue: 0.2514202218568159
xxxxxxx

bert scores
Mann Whitney U-test pvalue: 0.0536937210300465
-----------------------------



## News Shows Transcript Data Analysis

In [None]:
import csv
import nltk
nltk.download('punkt')

# Score every transcript segment with get_score and write
# [text, score, row[2], row[3], row[4]] rows to the output CSV.
# Segments longer than 256 NLTK word tokens are scored in sentence-aligned chunks.
# NOTE(review): get_score as defined in the BERT section converts text with
# MAX_SEQUENCE_LENGTH (128) ids, so presumably only the beginning of each
# ~256-word chunk is actually scored -- confirm this is intended.
segments_file = '/content/gdrive/My Drive/Backup/Research/Incivility/Annotations_Feb_March/Scores/transcript_level_analysis/segments_and_shows/all_segments_feb.csv'
output_file = '/content/gdrive/My Drive/Incivility/Perspective/Perspective_BERT/transcript_data_BERT.csv'
with open(segments_file) as f, open(output_file, 'w') as w:
    header = f.readline()  # skip the header row; no header is written to the output
    reader = csv.reader(f)
    writer = csv.writer(w)
    for row in reader:
        text = row[0]
        # Columns 2-4 are copied unchanged onto every output row of this segment.
        info = [row[2], row[3], row[4]]
        words = nltk.word_tokenize(text)
        if len(words) > 256:
            sets = []   # sentences of the chunk being built
            c = 0       # running word count of the current chunk
            sentences = nltk.sent_tokenize(text)
            for sentence in sentences:
                c += len(nltk.word_tokenize(sentence))
                sets.append(sentence)
                if c > 256:
                    # The chunk just went over the limit: score and emit it,
                    # including the sentence that pushed it over.
                    t = ' '.join(sets)
                    s = get_score(t)
                    new_row = [t, s]
                    new_row.extend(info)
                    writer.writerow(new_row)
                    # The next chunk starts with that same sentence, so
                    # consecutive chunks overlap by one sentence.
                    sets = [sentence]
                    c = len(nltk.word_tokenize(sentence))
                    
            # Whatever is left after the loop is always emitted, even when it
            # holds only the overlap sentence of the previous chunk.
            t = ' '.join(sets)
            s = get_score(t)
            new_row = [t, s]
            new_row.extend(info)
            writer.writerow(new_row)    

        else:
            # Short segment: score it in one piece.
            s = get_score(text)
            new_row = [text, s]
            new_row.extend(info)
            writer.writerow(new_row)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
'''
Script to compute statistical significance tests on the transcript level data (BERT scores only)
'''
import csv
from scipy.stats import ttest_ind, mannwhitneyu
import numpy as np
import itertools 

# Per-show score rows; each row holds the single value from column 2 of the CSV.
scores = {'FOX': [], 'MSNBC': [], 'PBS': []}

with open('/content/gdrive/My Drive/Incivility/Perspective/Perspective_BERT/transcript_data_BERT_counts.csv') as f:
    header = f.readline()  # skip the header row
    reader = csv.reader(f)
    for row in reader:
        # Kept under its own name so the rows being iterated are never rebound.
        row_scores = np.array([float(row[2])])

        # A row is assigned to every show whose name occurs in column 1.
        for show in scores:
            if show in row[1]:
                scores[show].append(row_scores)

# Stack the rows of each show into one 2-D array (rows x 1).
for show in scores:
    scores[show] = np.array(scores[show])

pair_list = list(itertools.combinations(list(scores.keys()), 2))

for show1, show2 in pair_list:
    print(show1, show2)

    print('xxxxxxx\n')

    print('bert scores')

    # NOTE(review): `alternative` is left at SciPy's default, which differs
    # between SciPy versions -- confirm which one the recorded p-values
    # correspond to before comparing them across runs.
    _, p = mannwhitneyu(scores[show1][:, 0], scores[show2][:, 0])
    print('Mann Whitney U-test pvalue: '+str(p))


    print('-----------------------------\n')


FOX MSNBC
xxxxxxx

bert scores
Mann Whitney U-test pvalue: 1.903007442283471e-05
-----------------------------

FOX PBS
xxxxxxx

bert scores
Mann Whitney U-test pvalue: 0.00018924520925346386
-----------------------------

MSNBC PBS
xxxxxxx

bert scores
Mann Whitney U-test pvalue: 0.14067705592605478
-----------------------------



## EACL Templates Analysis

In [None]:
import csv

templates = ['We wrote _____ on the page.','They whispered _____ to themselves. ','I erased the word _____ from the page.','You didn’t write _____.','Did he say _____ ?']

# Read the trigger words: first column of every non-empty data row.
trigger_words = []
with open('/content/gdrive/My Drive/Backup/Research/Incivility/Annotations_Feb_March/Scores/transcript_level_analysis/words/words_analysis_clean.csv', encoding = "ISO-8859-1") as f:
    header = f.readline()  # skip the header row
    reader = csv.reader(f)
    # csv.reader yields [] for blank lines; skip them instead of raising IndexError.
    trigger_words = [line[0] for line in reader if line]

# Score every trigger word inside every template and write one CSV row per
# word: the word followed by one score per template (same order as `templates`).
# The columns are derived from `templates`, so the list can grow or shrink
# without touching this loop. utf-8 is set explicitly because the templates and
# the words contain non-ASCII characters; newline='' is what the csv module expects.
with open('/content/gdrive/My Drive/Incivility/Perspective/Perspective_Error_Analysis/error_trigger_densities/all_errors/Templates_BERT_EACL.csv', 'w', newline='', encoding='utf-8') as w:
    writer = csv.writer(w)
    writer.writerow(['error trigger'] + list(templates))
    for i, word in enumerate(trigger_words):
        # '_____' is the slot each template reserves for the word.
        row = [word] + [get_score(template.replace('_____', word)) for template in templates]
        writer.writerow(row)
        print(i+1, word)

1 0
2 1.375
3 3
4 4
5 5.7
6 10
7 11
8 17
9 18
10 19
11 20
12 24
13 26
14 37
15 50
16 55
17 70
18 73
19 90
20 157
21 200
22 224
23 427
24 500
25 1000
26 1973
27 1984
28 2004
29 2011
30 2012
31 2014
32 2018
33 2019
34 2020
35 5400
36 16501
37 25954
38 180000
39 -
40 --
41 ,
42 :
43 !
44 ?
45 ...
46 '
47 Ò
48 (
49 )
50 [
51 ]
52 &
53 ``
54 $
55 10:00:00
56 115-pound
57 13th
58 1700s
59 1800s
60 1860s
61 1970s
62 25th
63 39-year-old
64 5-foot-tall
65 6:30
66 64-year-old
67 A
68 a
69 a.m.
70 ability
71 able
72 abortion
73 About
74 about
75 above
76 Absolutely
77 absolutely
78 abuse
79 abused
80 abusively
81 abysmal
82 Academy
83 accept
84 accepted
85 access
86 According
87 according
88 accosted
89 account
90 accounts
91 accuracy
92 accused
93 accusing
94 achieve
95 acknowledge
96 acknowledged
97 Acosta
98 acquaintances
99 across
100 Act
101 acting
102 action
103 active
104 activity
105 Actor
106 actor
107 actually
108 added
109 addition
110 address
111 ADL
112 administration
113 admire
114 

## Adversarial (pt.2)

In [None]:
import json
import requests
import time
def get_score(text):
    """Return the Perspective API TOXICITY summary score for ``text``.

    Sends one request to the Comment Analyzer ``comments:analyze`` endpoint
    asking for the TOXICITY attribute of an English comment and returns
    ``attributeScores.TOXICITY.summaryScore.value`` from the JSON reply.

    The API key is read from the ``PERSPECTIVE_API_KEY`` environment variable
    so that no credential is stored in the notebook.

    Note: this definition replaces the BERT-based ``get_score`` defined
    earlier in this notebook once the cell has been run.

    Args:
        text: The comment text to score.

    Returns:
        The summary score value taken from the API response.

    Raises:
        RuntimeError: If ``PERSPECTIVE_API_KEY`` is not set.
        requests.HTTPError: If the API answers with an error status.
    """
    import os  # local import keeps this cell self-contained

    api_key = os.environ.get('PERSPECTIVE_API_KEY')
    if not api_key:
        raise RuntimeError(
            'Set the PERSPECTIVE_API_KEY environment variable to a Perspective API key.')

    url = ('https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze' +
                '?key=' + api_key)
    data_dict = {
        'comment': {'text': text},
        'languages': ['en'],
        'requestedAttributes': {'TOXICITY': {}}
    }
    response = requests.post(url=url, data=json.dumps(data_dict))
    # Fail with the HTTP error itself rather than a KeyError on the error payload.
    response.raise_for_status()
    response_dict = json.loads(response.content.decode('utf-8'))
    return response_dict['attributeScores']['TOXICITY']['summaryScore']['value']

In [None]:
# Adversarial 1 - examples from article and bio - shared by Ani
import csv
# For every example write [row[0], text, Perspective score, BERT score].
# newline='' lets the csv module control line endings itself.
with open('adversarial_5.csv') as f, open('adversarial_5_bert.csv', 'w', newline='') as w:
    header = f.readline()  # skip the header row; no header is written to the output
    reader = csv.reader(f)
    writer = csv.writer(w)
    for row in reader:
        text = row[1]

        # BERT score, computed inline because get_score now refers to the
        # Perspective API helper. Named token_ids rather than `input`, which
        # would rebind the builtin input() for the rest of the session.
        token_ids = torch.tensor(convert_lines([text], MAX_SEQUENCE_LENGTH, tokenizer), dtype=torch.long)
        with torch.no_grad():  # pure inference
            pred = model(token_ids.to(device), attention_mask=(token_ids>0).to(device), labels=None)
        score = torch.sigmoid(pred).item()

        perspective = get_score(text)
        writer.writerow([row[0], text, perspective, score])
        # Pause between API requests.
        time.sleep(2)
        time.sleep(2)