In [20]:
import pandas as pd
import numpy as np

## generate labels from tweet data

### full labels:
* 0: attended no conferences
* 1: attended anthrocon
* 2: attended comiccon2017
* 3: attended icann2016
* 4: attended anthrocon and comiccon2017
* 5: attended anthrocon and icann2016
* 6: attended comiccon2017 and icann2016
* 7: attended anthrocon, comiccon2017, and icann2016

### binary labels:
* 0: did not attend comiccon2017
* 1: attended comiccon2017

In [21]:
# load the three per-event tweet tables (one CSV per conference)
path = "data/tweets/"

tweets1, tweets2, tweets3 = (
    pd.read_csv(path + csv_name, delimiter=',')
    for csv_name in (
        "anthrocon_tweets.csv",
        "comiccon2017_tweets.csv",
        "icann2016_tweets.csv",
    )
)


In [22]:
# preview the first rows of each event's tweet table
display(tweets1.head(), tweets2.head(), tweets3.head())

Unnamed: 0,event,tweet_id,node_id,tweet,attendance_label
0,anthrocon,1017097806786580480,601,anthrocon AC2018,1
1,anthrocon,1015974586138222594,621,“Furries descend upon Pittsburgh for Anthrocon...,1
2,anthrocon,1039835765566566400,768,Things are complicated right now. I'm not sure...,1
3,anthrocon,1016118419912036353,702,AC_MMXVIII pic.twitter.com/6zRIR13ko5,1
4,anthrocon,1016531045506781184,157,Anthrocon Anthrocon2018 AC2018 pic.twitter.com...,1


Unnamed: 0,event,tweet_id,node_id,tweet,attendance_label
0,comiccon2017,891241828477001728,2192,"Woohoo, going to comiccon2017 in London later ...",1
1,comiccon2017,924482142146572293,883,With catwoman catwomen lacomiccon2017 lacomicc...,1
2,comiccon2017,888840194308362240,1123,Why I'm not at comiccon2017 ryangosling pic.tw...,1
3,comiccon2017,889340577367678980,132,Which one of these things doesn't belong? Blac...,1
4,comiccon2017,888801595990736896,1959,Where's Wally?! KeiynanLonsdale comiccon2017 p...,1


Unnamed: 0,event,tweet_id,node_id,tweet,attendance_label
0,icann572016,989698109449146370,210,.ICANN in a deadlock with GDPR over the WHOIS ...,1
1,icann572016,990890732151025664,105,RT DrivingTheDay : ICANN Webinar Tuesday MAY 1...,1
2,icann572016,991400783508733952,630,"UNCSTD WSIS session, njhickson of ICANN : ther...",1
3,icann572016,992730328119865345,25,"UNCSTD WSIS session, njhickson of ICANN : the ...",1
4,icann572016,993078692829126656,25,"UNCSTD WSIS session, Canada: WGEC improved mut...",1


In [23]:
# summary statistics of the numeric columns of each event's tweet table
display(tweets1.describe(), tweets2.describe(), tweets3.describe())

Unnamed: 0,tweet_id,node_id,attendance_label
count,3842.0,3842.0,3842.0
mean,1.02567e+18,594.36127,0.686101
std,1.517368e+16,292.637332,0.464136
min,1.014305e+18,0.0,0.0
25%,1.015596e+18,327.0,0.0
50%,1.016819e+18,643.0,1.0
75%,1.032741e+18,790.0,1.0
max,1.068394e+18,1059.0,1.0


Unnamed: 0,tweet_id,node_id,attendance_label
count,4381.0,4381.0,4381.0
mean,8.952836e+17,1237.376855,0.757818
std,1.346167e+16,717.575998,0.428453
min,8.888001e+17,0.0,0.0
25%,8.89e+17,593.0,1.0
50%,8.895079e+17,1252.0,1.0
75%,8.929518e+17,1858.0,1.0
max,1.056351e+18,2494.0,1.0


Unnamed: 0,tweet_id,node_id,attendance_label
count,2550.0,2550.0,2550.0
mean,1.031794e+18,520.085882,0.301176
std,2.278375e+16,295.359141,0.458859
min,9.896981e+17,0.0,0.0
25%,1.011585e+18,248.0,0.0
50%,1.034929e+18,540.5,0.0
75%,1.05235e+18,781.0,1.0
max,1.068184e+18,1015.0,1.0


In [24]:
def get_attendees(tweets):
    """Return the node ids of the tweets labelled as attendance.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Must contain the columns ``attendance_label`` and ``node_id``.

    Returns
    -------
    pandas.Series
        The ``node_id`` values of every row whose ``attendance_label`` equals 1,
        with the original index preserved. A node id appears once per matching
        row, so the result may contain duplicates.
    """
    is_attendee = tweets['attendance_label'] == 1
    # Select rows and column in a single .loc call instead of chaining
    # .loc[...][...], which materialises an intermediate DataFrame first.
    return tweets.loc[is_attendee, 'node_id']


In [25]:
# attendee node ids per event, as NumPy arrays (duplicates are kept)
nodes1, nodes2, nodes3 = (
    get_attendees(event_tweets).to_numpy()
    for event_tweets in (tweets1, tweets2, tweets3)
)


In [26]:
# pool the attendee ids of all three events to find the id range to label
all_nodes = np.concatenate((nodes1, nodes2, nodes3))

node_min, node_max = all_nodes.min(), all_nodes.max()
print("node_min: ", node_min)
print("node_max: ", node_max)

node_min:  0
node_max:  2494


### full labels
* 0: attended no conferences
* 1: attended anthrocon
* 2: attended comiccon2017
* 3: attended icann2016
* 4: attended anthrocon and comiccon2017
* 5: attended anthrocon and icann2016
* 6: attended comiccon2017 and icann2016
* 7: attended anthrocon, comiccon2017, and icann2016

In [27]:
# generate full labels

# Label code for every (anthrocon, comiccon2017, icann2016) attendance
# combination; keys are (in nodes1, in nodes2, in nodes3).
full_label_codes = {
    (False, False, False): 0,
    (True, False, False): 1,
    (False, True, False): 2,
    (False, False, True): 3,
    (True, True, False): 4,
    (True, False, True): 5,
    (False, True, True): 6,
    (True, True, True): 7,
}

# Build each attendee set once: `i in ndarray` rescans the whole array on every
# lookup, whereas set membership is a hash probe.
attendee_sets = [set(nodes.tolist()) for nodes in (nodes1, nodes2, nodes3)]

# One [node_id, label] row for every id in the inclusive range
# node_min..node_max, including ids that attended nothing (label 0).
nodes_full = np.array([
    [i, full_label_codes[tuple(i in attendees for attendees in attendee_sets)]]
    for i in range(node_min, node_max + 1)
])

In [28]:
# full labels: (distinct label values, number of nodes carrying each)
np.unique(nodes_full[:, 1], return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7]),
 array([ 451,   68, 1494,   50,  172,   24,  149,   87]))

In [29]:
# save full labels as tab-separated "node_id<TAB>label" integer rows
np.savetxt(
    'twitter_full.labels',
    nodes_full,
    delimiter='\t',
    fmt=('%i', '%i'),
)


### binary labels:
* 0: did not attend comiccon2017
* 1: attended comiccon2017

In [30]:
# generate binary labels

# The binary label depends only on comiccon2017 attendance (nodes2); anthrocon
# and icann2016 membership never change it, so they are not consulted.
# A set is built once because `i in ndarray` rescans the array on every lookup.
comiccon_attendees = set(nodes2.tolist())

# One [node_id, label] row for every id in the inclusive range
# node_min..node_max: 1 if the node attended comiccon2017, else 0.
nodes_binary = np.array([
    [i, int(i in comiccon_attendees)]
    for i in range(node_min, node_max + 1)
]).astype(int)
nodes_binary[:,1]

array([1, 1, 1, ..., 1, 1, 1])

In [31]:
# binary labels: (distinct label values, number of nodes carrying each)
np.unique(nodes_binary[:, 1], return_counts=True)

(array([0, 1]), array([ 593, 1902]))

In [32]:
# save binary labels as tab-separated "node_id<TAB>label" integer rows
np.savetxt(
    'twitter_binary.labels',
    nodes_binary,
    delimiter='\t',
    fmt=('%i', '%i'),
)
