# Label Combining Notebook
This notebook is used to aggregate labels we have acquired, creating `csv` files that will later be used for training.

In [1]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import FileLink, FileLinks

## 1. Combining Member Votes

In this section, we create a CSV of the labels assigned by our team members. We first combine each member's labels into a single aggregated DataFrame called `combined`.

In [19]:
# files should include only first names of our team members
files = ['alan', 'wish', 'nidhi']
files = [f + '_labels.csv' for f in files]

# Read every member's table (assuming each entry is correctly written) and
# stack them in a single call. `DataFrame.append` was deprecated in pandas 1.4
# and removed in pandas 2.0; `pd.concat` is the supported way to combine frames.
# Row indices are left exactly as read from each file (no `ignore_index`), so
# index values repeat across members.
label_columns = ['name', 'user', 'time', 'label']
combined = pd.concat([pd.read_csv(file, names=label_columns) for file in files])
combined

Unnamed: 0,name,user,time,label
0,alan,jize12341234,2020-10-23 09:59:15.983797,bot
1,alan,jdkxmfe4nlnf,2020-10-23 09:59:19.198504,bot
2,alan,xvtlutpplkjc,2020-10-23 09:59:55.232620,bot
3,alan,lpqy1gmkj1gm,2020-10-23 09:59:58.821355,bot
4,alan,msxkq3syrpvm,2020-10-23 10:00:01.195864,bot
...,...,...,...,...
220,nidhi,4kacdb4cbsfx,2020-10-31 17:25:10.744070,bot
221,nidhi,beauterrence,2020-10-31 17:25:14.568657,bot
222,nidhi,bostonshrimp,2020-10-31 17:25:18.726917,human
223,nidhi,jaylannathan,2020-10-31 17:25:22.324606,bot


### Testing for Majority Votes 


In [20]:
from combiner import majority_vote

# Sanity-check majority_vote on a handful of hand-written ballots before
# trusting it on the real data: (votes cast, expected outcome).
vote_cases = [
    (['bot', 'human'], 'tied'),
    (['human', 'human'], 'human'),
    (['bot', 'bot', 'human'], 'bot'),
    (['human', 'bot', 'human'], 'human'),
    (['human'], 'human'),
    (['bot'], 'bot'),
]
for votes, expected in vote_cases:
    assert expected == majority_vote(votes)

# Collapse every user's labels into one verdict via majority_vote.
csv_name = 'team_labels.csv'
votes_by_user = combined[['user', 'label']].groupby('user').agg(majority_vote)

# Copy the user index into a `name` column and put it first.
final = votes_by_user.assign(name=votes_by_user.index)[['name', 'label']]

# Tied users carry no usable label, so they are left out of the final csv.
final = final.loc[final['label'] != 'tied']
final.to_csv(csv_name, index=False, header=False)
FileLink(csv_name)

## 2. Converting Researched Labels & Filtered Labels
In this section, we generate research labels by dropping the entries that do not pertain to our data set.

In [26]:
from combiner import convert_label

# these are the research labels we acquired from previous datasets
# NOTE(review): unpickling can execute arbitrary code, so groundTruth.pkl must
# come from a trusted source.
# Open through a context manager so the file handle is closed once loading is done.
with open("groundTruth.pkl", "rb") as pickle_file:
    research_labels = pickle.load(pickle_file)

# Map each raw research label through convert_label, then keep only the
# account name and the converted label.
research_labels['reliable'] = research_labels['label'].apply(convert_label)
research_labels = research_labels[['name', 'reliable']]
research_labels

Unnamed: 0,name,reliable
0,linhuoyaneos,bot
1,g44dambxhage,bot
2,gm2dkmjtguge,bot
3,haydsnbugage,bot
4,gnb4pxctxnes,human
...,...,...
293765,pbufsvyruirs,human
293766,haytaobwgmge,bot
293767,qu2hp3mx4mte,bot
293768,gm3tenbwgqge,bot


In [30]:
# Load the temporal table. Because `names` is supplied, pandas treats the
# first line of the file as data rather than as a header row.
temporal_columns = ['user', 'time', 'weekday', 'hour']
temporal = pd.read_csv('temporal-only.csv', names=temporal_columns)
temporal.head()

array(['nmslnmslnmsl', 'justiceariel', 'edgarwinston', ...,
       'endlessdivdn', 'endlessoptex', 'innocentuser'], dtype=object)

In [33]:
# Keep only the research labels whose account name appears in `all_labels`,
# ordered by label.
# NOTE(review): `all_labels` is not defined in any cell visible here; it must
# already exist in the kernel for this cell to run — confirm it survives
# Restart & Run All.
researches = research_labels[research_labels['name'].isin(all_labels)].sort_values('reliable')

# Account names from the shared `obvious_bots` list; each one is labelled 'bot'.
obvious_bots = pd.read_csv('https://raw.githubusercontent.com/griffinbaum22/bot_detection/main/obvious_bots'
                          ).obvious_bots.values
obvious_bot_rows = pd.DataFrame({'name': obvious_bots, 'reliable': 'bot'})

# Stack both sources in one concat instead of appending row by row:
# `DataFrame.append` was removed in pandas 2.0 and re-copied the frame on every
# iteration. `ignore_index=True` renumbers the rows 0..n-1.
reliables = pd.concat([researches, obvious_bot_rows], ignore_index=True)
reliables

Unnamed: 0,name,reliable
0,gu3dimzvguge,bot
1,g42tgmbshege,bot
2,jixiango3133,bot
3,guydknzsgege,bot
4,gyztqnjwg4ge,bot
...,...,...
1178,afd4ctrpcer1,bot
1179,beirwpuugtkm,bot
1180,ycdzhtpheges,bot
1181,4lp3gnxrbfyc,bot


In [34]:
# Write the reliable labels as a bare two-column csv (no index, no header row)
# and show a download link for it.
reliable_csv = "reliable-labels.csv"
reliables.to_csv(reliable_csv, index=False, header=False)
FileLink(reliable_csv)