# Set Up

In [3]:
### basic imports, read in raw data file
import csv, statistics, random, os
import nltk
from nltk import agreement
from tqdm.notebook import tqdm
data = []

with open(os.path.abspath('../data/cold-all-1.0.tsv'),encoding='utf8') as infile:
    csvreader = csv.reader(infile, delimiter="\t")
    for row in csvreader:
        data.append(row)

In [4]:
### check the data/header line
print(data[:2])
print(len(data))

[['COLDID', 'Annotator', 'OriginalID', 'Text', 'Q1', 'Q2', 'Q3', 'Q4'], ['1', 'A', 'D-5', 'You’re black? You don’t sound like you’re black', 'Y', 'N', 'Y', 'N']]
7774


## Define Classes and Functions

In [13]:
class Tweet: 
    def __init__(self, lines):     
        self.lines = lines
        self.id = ''
        self.dataset = ''
        self.annos = []
        self.text = ''
        self.off = []
        self.slur = []
        self.adjNom = []
        self.dist = []
        self.offMaj = ''
        self.slurMaj = ''
        self.adjNomMaj = ''
        self.distMaj = ''
        self.cat = ''
        self.process()

    def process(self):
        #print(self.lines)
        for line in self.lines:
            if self.id != '':
                if line[2] != self.id:
                    print("Problem: ids don't match:", line[2], self.id)
                else: pass
            else: 
                self.id = line[2]
#REMOVE THIS CHECK BECAUSE SELF.ID INCLUDES DATASET
#             if self.dataset != '':
#                 if line[1] != self.dataset:
#                     print("Problem: datasets don't match:", line[1], self.dataset)
#                 else: pass
#             else: 
#                 self.dataset = line[1]
            #if self.text != '':  ### removed this check because it catches only
                                  ### trivial differences in the texts (punctuation, etc.)
            #    if line[3] != self.text:
            #        print("Problem: texts don't match:", line[3], self.text)
            #    else: pass
            #else: 
            self.text = line[3]
            self.annos.append(line[1])# Changed 2 to 1 because that is where the annotators are listed
            self.off.append(line[4])
            self.slur.append(line[5])
            self.adjNom.append(line[6])
            self.dist.append(line[7])

        self.annoSet = set(self.annos)
        if len(self.annoSet) != len(self.annos):
            print("Duplicate annotations:", self.id)
           # input("got it? ")
        else:
            print("No duplicates:", self.id)




    def getMajVotes(self):
        try:
            self.offMaj = statistics.mode(self.off)
        except:
            #self.offMaj = random.choice(self.off)
            print("no majority", self.id)
            self.offMaj = max(self.off) # if there's no majority, choose 'Y'
            #countTies += 1

        try:
            self.slurMaj = statistics.mode(self.slur)
        except:
            #self.slurMaj = random.choice(self.slur)
            print("no majority", self.id)
            self.slurMaj = max(self.slur)
            #countTies += 1

        try:
            self.adjNomMaj = statistics.mode(self.adjNom)
        except:
            #self.adjNomMaj = random.choice(self.adjNom)
            print("no majority", self.id)
            self.adjNomMaj = max(self.adjNom)
            #countTies += 1

        try:
            self.distMaj = statistics.mode(self.dist)
        except:
            #self.distMaj = random.choice(self.dist)
            print("no majority", self.id)
            self.distMaj = max(self.dist)
            #countTies += 1
    
    def sort(self):
        if self.offMaj == 'Y':
            if self.slurMaj == 'Y':
                #cats['offSlur'].append(self.id)
                self.cat = 'offSlur'
                return
            elif self.adjNomMaj == 'Y':
                if self.distMaj == 'Y':
                    #cats['offBoth'].append(self.id)
                    self.cat = 'offBoth'
                    return
                else:
                    #cats['offNom'].append(self.id)
                    self.cat = 'offNom'
                    return
            elif self.distMaj == 'Y':
                #cats['offDist'].append(self.id)
                self.cat = 'offDist'
                return
            else:
                #cats['offOther'].append(self.id)
                self.cat = 'offOther'
                return
        else:
            if self.slurMaj == 'Y':
                #cats['reclaimed'].append(self.id)
                self.cat = 'reclaimed'
                return
            elif self.adjNomMaj == 'Y':
                if self.distMaj == 'Y':
                    #cats['nonBoth'].append(self.id)
                    self.cat = 'nonBoth'
                    return
                else:
                    #cats['nonNom'].append(self.id)
                    self.cat = 'nonNom'
                    return
            elif self.distMaj == 'Y':
                #cats['nonDist'].append(self.id)
                self.cat = 'nonDist'
                return
            else:
                #cats['nonNone'].append(self.id)
                self.cat = 'nonNone'
                return



In [10]:
def fixID(line):
    #print(line)
    currID = line[2]
    currDataSet = line[1]
    if currID.startswith('D') or currID.startswith('M') or currID.startswith('C') or currID.startswith('WH'):
        pass
        #print("surprise!", currID)
        #input("okay? ")
    elif currDataSet:
        newID = currDataSet+'-'+currID
        line[2] = newID
    else:
        print("error:", currDataSet, currID)
        print(line)
        input("? ")
    return line

In [11]:
def selectThree(group):
    newGroup = []
    groupCopy = group
    print("starting selection on", group[0][2])
    #for line in group:
     #   anno = line[0]
      #  if anno == 'A' or anno == 'E' or anno == 'F':
            #print("selected:", anno)
      #      newGroup.append(line)
       #     groupCopy.remove(line)

        #else: pass
    while len(newGroup) < 3:
        selected = random.choice(groupCopy)
        #print("selected (rand):", selected[0])
        newGroup.append(selected)
        groupCopy.remove(selected)
    #input("selection finished - look okay? ")
    if len(newGroup) == 3:
        return newGroup
    else:
        print("problem:", len(newGroup))

        



# Process Tweets

In [16]:
cats = {'offSlur': [], 'offNom': [], 'offDist': [], 'offBoth': [], 'offOther': [], 'reclaimed': [], 'nonNom': [], 'nonDist': [], 'nonBoth': [], 'nonNone': []}
ties = []

check_annos = {1:[], 2:[], 3:[], 4:[], 5:[], 6:[], 7:[], 8:[],9:[],10:[],11:[],12:[]} #expand this because duplicates are still in
tweets3 = []
tweets2or1 = []
tweets4orMore = []
prevID = ''
thisID = ''
group = []
print(len(data))
for line in tqdm(data[1:]):
    if len(line) != 8: #changed to 8 because there's 8 feautres instead of 9 
        print(line)
        input("okay? ")
    line = fixID(line)
    #print(line)
    #print(prevID, thisID)
    #print(len(group))
    #input("okay? ")
    thisID = line[2]
    if thisID == prevID:
        group.append(line)
    elif prevID == '':
        group.append(line)
    elif thisID == 'ID':
        pass
    else:
        num = len(group)
        if num == 3:       ### process tweets with 3 annos
            check_annos[3].append(prevID)
            tweets3.append(Tweet(group))
        elif num == 2 or num == 1:  ### process tweets with 1 or 2 annos (will be removed)
            check_annos[num].append(prevID)
            tweets2or1.append(Tweet(group))
        else:         #### process tweets with more than 3 annos (3 annos will be selected, added to set of tweets with 3 annos)
            newGroup = selectThree(group)
            check_annos[num].append(prevID)
            tweets3.append(Tweet(newGroup))
            tweets4orMore.append(Tweet(group))
            newGroup = []
        group = [line]
    prevID = thisID

7774


  0%|          | 0/7773 [00:00<?, ?it/s]

No duplicates: D-5
No duplicates: D-6
starting selection on M-6
No duplicates: M-6
No duplicates: M-6
No duplicates: D-13
No duplicates: M-12a
No duplicates: M-12b
No duplicates: M-12c
No duplicates: D-13
starting selection on M-13a
No duplicates: M-13a
No duplicates: M-13a
No duplicates: M-13c
No duplicates: M-14a
No duplicates: M-14b
No duplicates: D-15
No duplicates: M-15a
starting selection on M-15b
No duplicates: M-15b
No duplicates: M-15b
No duplicates: M-15c
No duplicates: M-16a
No duplicates: M-16b
starting selection on M-17a
No duplicates: M-17a
No duplicates: M-17a
No duplicates: M-17b
No duplicates: M-17c
No duplicates: M-17d
No duplicates: M-18a
No duplicates: M-18b
No duplicates: M-19a
starting selection on M-19b
No duplicates: M-19b
No duplicates: M-19b
No duplicates: M-20a
No duplicates: M-20b
No duplicates: M-21a
No duplicates: M-21b
No duplicates: M-22
No duplicates: M-23
No duplicates: M-24
No duplicates: M-25a
No duplicates: M-25b
No duplicates: M-26a
No duplicates: 

No duplicates: D-859
No duplicates: D-860
No duplicates: D-864
No duplicates: D-867
No duplicates: D-868
No duplicates: D-869
No duplicates: D-872
No duplicates: D-875
No duplicates: D-925
No duplicates: D-929
No duplicates: D-931
No duplicates: D-933
No duplicates: D-935
No duplicates: D-938
No duplicates: D-945
No duplicates: D-953
No duplicates: D-956
starting selection on D-958
No duplicates: D-958
No duplicates: D-958
No duplicates: D-980
No duplicates: D-988
No duplicates: M-995
No duplicates: M-997
No duplicates: D-998
No duplicates: M-998
No duplicates: D-999
No duplicates: D-1000
starting selection on M-1000
No duplicates: M-1000
No duplicates: M-1000
starting selection on M-1001
No duplicates: M-1001
Duplicate annotations: M-1001
starting selection on M-1002
No duplicates: M-1002
No duplicates: M-1002
starting selection on M-1003
Duplicate annotations: M-1003
No duplicates: M-1003
starting selection on M-1004
No duplicates: M-1004
No duplicates: M-1004
starting selection on M

KeyError: 12

In [None]:
for x in check_annos.keys():
    print(x, len(check_annos[x]))


1 2
2 127
3 1830
4 49
5 91
6 65
7 0


In [None]:
print(len(tweets3), len(tweets2or1), len(tweets4orMore))

2035 129 205


In [None]:
for t in tweets3:
    t.getMajVotes()
    t.sort()
    cats[t.cat].append(t.id)

In [None]:
for c in cats.keys():
    print(c, len(cats[c]))

offSlur 626
offNom 202
offDist 20
offBoth 33
offOther 78
reclaimed 399
nonNom 143
nonDist 8
nonBoth 0
nonNone 526


# Create Files

## Create majority vote file

In [None]:
outfile = "allAnnos_majVote_3per.csv" ### new version, selecting Y when tied
fields = ['ID','DataSet','Text','Off','Slur','Nom','Dist','Cat']
rows = []

for t in tweets3:
    num = len(t.annos)
    thisRow = [t.id, t.dataset, t.text, t.offMaj, t.slurMaj, t.adjNomMaj, t.distMaj, t.cat]

    for x in thisRow[3:7]:
        if x != 'Y' and x != 'N':
            print(x)
            print(thisRow)
            input("okay?")
        else: 
            pass
            
    rows.append(thisRow)

In [None]:
print(len(rows))
print(rows[:5])

2035
[['D-5', 'D', 'You’re black? You don’t sound like you’re black', 'Y', 'N', 'N', 'N', 'offOther'], ['D-6', 'D', 'you’re pretty for a black girl', 'Y', 'N', 'N', 'Y', 'offDist'], ['M-6', 'M', 'eating like A Poor is chic & hilarious bc we are Not Poor & smugly show it', 'Y', 'N', 'Y', 'Y', 'offBoth'], ['C-11', 'C', "I don't think of you as a Mexican", 'Y', 'N', 'N', 'Y', 'offDist'], ['D-13', 'D', '" bad bitches is the only thing that i like "', 'N', 'Y', 'Y', 'N', 'reclaimed']]


In [None]:
### create file: majority vote annotations for *all* tweets
with open(outfile, 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)

## Create agreement files

In [None]:
from google.colab import drive
drive.mount('/content/drive')

KeyboardInterrupt: ignored

In [None]:
root = "./output"


file3rate = root+"/allAnnos_3ratings_3per.csv"
rows3 = []

fields_base = ["ID", "DataSet", "Text"]
#fields1 = fields_base + ["Off1", "Slur1", "Nom1", "Dist1"]
#fields2 = fields_base + ["Off1", "Off2", "Slur1", "Slur2", "Nom1", "Nom2", "Dist1", "Dist2"]
fields3 = fields_base + ["Off1", "Off2", "Off3", "Slur1", "Slur2", "Slur3","Nom1", "Nom2","Nom3", "Dist1", "Dist2","Dist3"]
#fields4 = fields_base + ["Off1", "Off2", "Off3", "Off4","Slur1", "Slur2", "Slur3","Slur4","Nom1", "Nom2","Nom3", "Nom4","Dist1", "Dist2","Dist3","Dist4"]
#fields5 = fields_base + ["Off1", "Off2", "Off3", "Off4","Off5","Slur1", "Slur2", "Slur3","Slur4","Slur5","Nom1", "Nom2","Nom3", "Nom4","Nom5","Dist1", "Dist2","Dist3","Dist4","Dist5"]
#fields6 = fields_base + ["Off1", "Off2", "Off3", "Off4","Off5","Off6","Slur1", "Slur2", "Slur3","Slur4","Slur5","Slur6","Nom1", "Nom2","Nom3", "Nom4","Nom5","Nom6","Dist1", "Dist2","Dist3","Dist4","Dist5","Dist6"]


In [None]:
for t in tweets3:
    ratings = len(t.annos)
    if ratings == 3:
        this = [t.id, t.dataset, t.text] + [o for o in t.off] + [s for s in t.slur] + [a for a in t.adjNom] + [d for d in t.dist]
        rows3.append(this)
    else:
        print("error")

In [None]:
with open(file3rate,'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields1)
    csvwriter.writerows(rows1)




FileNotFoundError: ignored

In [None]:
with open(file2rate,'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields2)
    csvwriter.writerows(rows2)



NameError: ignored

In [None]:
with open(file3rate,'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields3)
    csvwriter.writerows(rows3)

FileNotFoundError: ignored

In [None]:
with open(file4rate,'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields4)
    csvwriter.writerows(rows4)

NameError: ignored

In [None]:
with open(file5rate,'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields5)
    csvwriter.writerows(rows5)

NameError: ignored

In [None]:
with open(file6rate,'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields6)
    csvwriter.writerows(rows6)

NameError: ignored

### Sort tweets into categories based on annotations

In [None]:
gold = []
with open("allAnnos_majVote_3per.csv") as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        gold.append(row)

FileNotFoundError: ignored

In [None]:
def getCat(line):
    cat = ''
    offMaj = line[3]
    slurMaj = line[4]
    adjNomMaj = line[5]
    distMaj = line[6]

    if offMaj == 'Y':
        if slurMaj == 'Y':
            cat = 'offSlur'
        elif adjNomMaj == 'Y':
            if distMaj == 'Y':
                cat = 'offBoth'
            else:
                cat = 'offNom'
        elif distMaj == 'Y':
            cat = 'offDist'
        else:
            cat = 'offOther'
    
    else:
        if slurMaj == 'Y':
            cat = 'reclaimed'
        elif adjNomMaj == 'Y':
            if distMaj == 'Y':
                cat = 'nonBoth'
            else:
                cat = 'nonNom'
        elif distMaj == 'Y':
            cat = 'nonDist'
        else:
            cat = 'nonNone'
    line.append(cat)
    return(line)



In [None]:
#newgold = []
#for line in gold[1:]:
#    newline = getCat(line)
#    newgold.append(newline)

newgold = [getCat(line) for line in gold[1:]]

In [None]:
print(len(newgold))

2035


In [None]:
fields = ['ID','DataSet','Text','Off','Slur','Nom','Dist','Cat']
with open('allAnnos_majVote_forJordan_withCats.csv','w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(newgold)

### A new approach to agreement

In [None]:
##  this = [t.id, t.dataset, t.text] + [o for o in t.off] + [s for s in t.slur] + [a for a in t.adjNom] + [d for d in t.dist]

for r in [(rows3,3)]:
    raters = r[1]
    allOff = []
    allSlur = []
    allNom = []
    allDist = []
    for line in r[0]:
        id = line[0]
        allOff += [[str(i), id, line[3+i]] for i in range(raters)]
        allSlur += [[str(i), id, line[4+i]] for i in range(raters)]
        allNom += [[str(i), id, line[5+i]] for i in range(raters)]
        allDist += [[str(i), id, line[6+i]] for i in range(raters)]
    offTask = agreement.AnnotationTask(data=allOff)
    slurTask = agreement.AnnotationTask(data=allSlur)
    nomTask = agreement.AnnotationTask(data=allNom)
    distTask = agreement.AnnotationTask(data=allDist)

    print()
    print("Agreement for tweets with "+str(raters)+" ratings (num="+str(len(r[0]))+"):")
    print("\tFleiss for Off: "+str(offTask.multi_kappa())+"\tAlpha for Off: "+str(offTask.alpha()))
    print("\tFleiss for Slur: "+str(slurTask.multi_kappa())+"\tAlpha for Slur: "+str(slurTask.alpha()))
    print("\tFleiss for AdjNom: "+str(nomTask.multi_kappa())+"\tAlpha for AdjNom: "+str(nomTask.alpha()))
    print("\tFleiss for Dist: "+str(distTask.multi_kappa())+"\tAlpha for Dist: "+str(distTask.alpha()))
    print()
    input("Ready for the next set? ")

#for t in tweets:
 #   ratings = len(t.annos)
 #   if ratings == 1:
 #       pass
 #   elif ratings == 2:
 #       allOff2 = []
 #   offData = [[str(i), t.id, t.off[i]] for i in range(ratings)]   
 #   slurData = [[str(i), t.id, t.slur[i]] for i in range(ratings)]
 #   nomData = [[str(i), t.id, t.adjNom[i]] for i in range(ratings)]
 #   distData = [[t.annos[i], t.id, t.dist[i]] for i in range(ratings)]
 #   if ratings == 3:
 #       allOffData3 += offData
 #       allSlurData3 += slurData
 #       allNomData3 += nomData
 #       allDistData3 += distData
 #   else: pass

#offTask = agreement.AnnotationTask(data=allOffData)
#print("fleiss "+ str(offTask.multi_kappa()))




Agreement for tweets with 3 ratings (num=2035):
	Fleiss for Off: 0.612176808641872	Alpha for Off: 0.6115894747296602
	Fleiss for Slur: 0.3804445354146634	Alpha for Slur: 0.38006931940131605
	Fleiss for AdjNom: 0.4332441798279971	Alpha for AdjNom: 0.4330331965873494
	Fleiss for Dist: 0.7530730090496497	Alpha for Dist: 0.7529749285237823



KeyboardInterrupt: ignored

### Analysis of classification results

In [None]:
deiol = []
results = []

with open('deiol_allAnnos_majVote_3per.csv') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        deiol.append(row)

with open('results_majVote_dec28_withCats.csv') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        results.append(row)

print(len(deiol))
print(len(results))

2036
2159


In [None]:
idsD = []
idsR = []

for x in deiol[1:]:
    idsD.append(x[0])
for x in results[1:]:
    idsR.append(x[0])

print(len(idsD), len(idsR))

rNotD = [x for x in idsR if x not in idsD]
dNotR = [x for x in idsD if x not in idsR]

print(len(rNotD), len(dNotR))

2035 2158
131 19


In [None]:
wanted = set(idsD) & set(idsR)
print(len(wanted))

2016


In [None]:
wantedL = list(wanted)
print(len(wantedL))

2016


In [None]:
dresults = []
ids = []
for r in results:
    id = r[0]
    if id in wantedL and id not in ids:
        dresults.append(r)
        ids.append(id)
print(len(dresults))

2016


In [None]:
with open('results_deiol_majVote.csv','w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(deiol[0])
    csvwriter.writerows(dresults)

In [None]:
import pandas as pd
df = pd.read_csv("test_results.tsv", sep="\t", header=None)
tf = pd.read_csv("pejorative.csv")

In [None]:
olid_2020_df = pd.concat([tf["ID"], df.idxmax(axis=1)], axis=1)
olid_2020_df[0] = olid_2020_df[0].apply(lambda x: "OFF" if bool(x) else "NOT")

for i in range(len(dresults)):
  ind = dresults[i][0]
  try:
    olid_2020_result = olid_2020_df[olid_2020_df["ID"] == ind][0].values[0]
    dresults[i].append(olid_2020_result)
  except:
    olid_2020_result = olid_2020_df[olid_2020_df["ID"] == ind[:-1]][0].values[0]
    dresults[i].append(olid_2020_result)
    pass

### And now some analysis!!!

In [None]:
### our main file is dresults

hasoc = []
hasoc_binary = []
olid = []
jigsaw = []
olid_2020 = []
### build lists of tuples, where each consists of (category, modelResult)

In [None]:
# print(dresults[0])
# print(len(dresults))
ind

'D-6561'

In [None]:
for line in dresults:
    cat = line[7]
    hasoc.append((cat, line[9]))
    hasoc_binary.append((cat, line[10]))
    olid.append((cat, line[11]))
    jigsaw.append((cat, line[12]))
    olid_2020.append((cat, line[13]))



In [None]:
olid[0:10]

[('offOther', 'HOF'),
 ('offDist', 'NOT'),
 ('offNom', 'NOT'),
 ('offDist', 'NOT'),
 ('reclaimed', 'HOF'),
 ('offDist', 'NOT'),
 ('nonNom', 'NOT'),
 ('nonNom', 'NOT'),
 ('nonNom', 'NOT'),
 ('offNom', 'HOF')]

In [None]:
olid_2020[:10]

[('offOther', 'OFF'),
 ('offDist', 'OFF'),
 ('offNom', 'OFF'),
 ('offDist', 'NOT'),
 ('reclaimed', 'OFF'),
 ('offDist', 'OFF'),
 ('nonNom', 'NOT'),
 ('nonNom', 'NOT'),
 ('nonNom', 'NOT'),
 ('offNom', 'OFF')]

In [None]:
olid[26]
print(len(hasoc), len(hasoc_binary), len(olid), len(jigsaw))

2016 2016 2016 2016


In [None]:
def getResultsBinary(pairs, dicto):
    counts = 0
    for p in pairs:
        cat = p[0]
        label = p[1]
        if label == 'NOT':
            dicto[cat][1] += 1
            counts += 1
        elif label == 'HOF' or label == 'OFF' or label == 'TOX':
            dicto[cat][0] += 1
            counts += 1
        else:
            print("label error:", label)
            input("okay? ")
    print(counts)
    print(dicto)

In [None]:
def getResultsHasoc(pairs, dicto):
    counts = 0
    for p in pairs:
        cat = p[0]
        label = p[1]
        #print(cat, label)
        if label == 'HATE':
            dicto[cat][0] += 1
            counts += 1
        elif label ==  'OFFN' or label == 'OFF':
            dicto[cat][1] += 1
            counts += 1
        elif label == 'PRFN':
            dicto[cat][2] += 1
            counts += 1
        elif label == 'NOT':
            dicto[cat][3] += 1
            counts += 1
        else:
            print("label error:", label)
            input("okay? ")
    print(counts)
    print(dicto)

In [None]:
### offSlur, offNom, offDist, offBoth, offNone: hasoc-HATE/OFFN/PRFN, hasoc_binary-HOF, olid-HOF, jigsaw-TOX
### reclaimed, nonNom, nonDist, nonBoth, nonNone: hasoc-NOT, hasoc_binary-NOT, olid-NOT, jigsaw-NOT
### binary dictionaries - [OFF, NOT]
### 4part dictionary - [HATE, OFFN, PRFN, NOT]

hasocDict = {}
hasoc_binaryDict = {}
olidDict = {}
jigsawDict = {}
olid_2020_Dict = {}

for k in cats.keys():
    hasocDict[k] = [0,0,0,0]
    hasoc_binaryDict[k] = [0,0]
    olidDict[k] = [0,0]
    jigsawDict[k] = [0,0]
    olid_2020_Dict[k] = [0,0]

In [None]:
hasocDict

{'nonBoth': [0, 0, 0, 0],
 'nonDist': [1, 0, 0, 5],
 'nonNom': [9, 8, 10, 114],
 'nonNone': [20, 18, 24, 463],
 'offBoth': [5, 4, 3, 19],
 'offDist': [6, 5, 2, 6],
 'offNom': [27, 19, 25, 130],
 'offOther': [12, 23, 12, 34],
 'offSlur': [30, 129, 287, 174],
 'reclaimed': [3, 14, 253, 122]}

In [None]:
print("running full hasoc")
getResultsHasoc(hasoc, hasocDict)
print("running hasoc binary")
getResultsBinary(hasoc_binary, hasoc_binaryDict)
print("running olid")
getResultsBinary(olid, olidDict)
print("running jigsaw")
getResultsBinary(jigsaw, jigsawDict)
print("running olid_2020")
getResultsBinary(olid, olid_2020_Dict)

running full hasoc
2016
{'offSlur': [30, 129, 287, 174], 'offNom': [27, 19, 25, 130], 'offDist': [6, 5, 2, 6], 'offBoth': [5, 4, 3, 19], 'offOther': [12, 23, 12, 34], 'reclaimed': [3, 14, 253, 122], 'nonNom': [9, 8, 10, 114], 'nonDist': [1, 0, 0, 5], 'nonBoth': [0, 0, 0, 0], 'nonNone': [20, 18, 24, 463]}
running hasoc binary
2016
{'offSlur': [480, 140], 'offNom': [81, 120], 'offDist': [13, 6], 'offBoth': [14, 17], 'offOther': [60, 21], 'reclaimed': [308, 84], 'nonNom': [48, 93], 'nonDist': [2, 4], 'nonBoth': [0, 0], 'nonNone': [117, 408]}
running olid
2016
{'offSlur': [529, 91], 'offNom': [128, 73], 'offDist': [13, 6], 'offBoth': [23, 8], 'offOther': [64, 17], 'reclaimed': [310, 82], 'nonNom': [61, 80], 'nonDist': [4, 2], 'nonBoth': [0, 0], 'nonNone': [142, 383]}
running jigsaw
2016
{'offSlur': [547, 73], 'offNom': [80, 121], 'offDist': [12, 7], 'offBoth': [7, 24], 'offOther': [62, 19], 'reclaimed': [337, 55], 'nonNom': [40, 101], 'nonDist': [3, 3], 'nonBoth': [0, 0], 'nonNone': [104, 

In [None]:
def printPerc(dicto):
    for k in dicto.keys():
        print(k)
        total = sum(dicto[k])
        if total == 0:
            print("no instances")
            print()
            continue
        for n in dicto[k]:
            print(n/total, sep="\t")
        print()

In [None]:
printPerc(hasocDict)

offSlur
0.04838709677419355
0.20806451612903226
0.4629032258064516
0.2806451612903226

offNom
0.13432835820895522
0.0945273631840796
0.12437810945273632
0.6467661691542289

offDist
0.3157894736842105
0.2631578947368421
0.10526315789473684
0.3157894736842105

offBoth
0.16129032258064516
0.12903225806451613
0.0967741935483871
0.6129032258064516

offOther
0.14814814814814814
0.2839506172839506
0.14814814814814814
0.41975308641975306

reclaimed
0.007653061224489796
0.03571428571428571
0.6454081632653061
0.3112244897959184

nonNom
0.06382978723404255
0.05673758865248227
0.07092198581560284
0.8085106382978723

nonDist
0.16666666666666666
0.0
0.0
0.8333333333333334

nonBoth
no instances

nonNone
0.0380952380952381
0.03428571428571429
0.045714285714285714
0.8819047619047619



In [None]:
printPerc(hasoc_binaryDict)

offSlur
0.7741935483870968
0.22580645161290322

offNom
0.40298507462686567
0.5970149253731343

offDist
0.6842105263157895
0.3157894736842105

offBoth
0.45161290322580644
0.5483870967741935

offOther
0.7407407407407407
0.25925925925925924

reclaimed
0.7857142857142857
0.21428571428571427

nonNom
0.3404255319148936
0.6595744680851063

nonDist
0.3333333333333333
0.6666666666666666

nonBoth
no instances

nonNone
0.22285714285714286
0.7771428571428571



In [None]:
printPerc(olidDict)

offSlur
0.853225806451613
0.14677419354838708

offNom
0.6368159203980099
0.36318407960199006

offDist
0.6842105263157895
0.3157894736842105

offBoth
0.7419354838709677
0.25806451612903225

offOther
0.7901234567901234
0.20987654320987653

reclaimed
0.7908163265306123
0.20918367346938777

nonNom
0.4326241134751773
0.5673758865248227

nonDist
0.6666666666666666
0.3333333333333333

nonBoth
no instances

nonNone
0.2704761904761905
0.7295238095238096



In [None]:
printPerc(jigsawDict)

offSlur
0.882258064516129
0.11774193548387096

offNom
0.39800995024875624
0.6019900497512438

offDist
0.631578947368421
0.3684210526315789

offBoth
0.22580645161290322
0.7741935483870968

offOther
0.7654320987654321
0.2345679012345679

reclaimed
0.8596938775510204
0.14030612244897958

nonNom
0.28368794326241137
0.7163120567375887

nonDist
0.5
0.5

nonBoth
no instances

nonNone
0.1980952380952381
0.8019047619047619



In [None]:
printPerc(olid_2020_Dict)

offSlur
0.8709677419354839
0.12903225806451613

offNom
0.5920398009950248
0.4079601990049751

offDist
0.7894736842105263
0.21052631578947367

offBoth
0.7096774193548387
0.2903225806451613

offOther
0.8024691358024691
0.19753086419753085

reclaimed
0.826530612244898
0.17346938775510204

nonNom
0.3546099290780142
0.6453900709219859

nonDist
0.6666666666666666
0.3333333333333333

nonBoth
no instances

nonNone
0.2819047619047619
0.7180952380952381

