**As a pre-step, we converted the original PDF-formatted dataset into an Excel workbook (`Original_Dataset/Vaccination_data.xlsx`), which the cells below read.**

In [1]:
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices raised by the libraries used in later cells.
warnings.filterwarnings('ignore')

In [5]:
import pickle

import numpy as np
import pandas as pd
import networkx as nx

# Data Preprocessing

## Node

In [6]:
# Row offsets 1 through 4 (not referenced by any cell visible in this notebook).
node_skiprows = [*range(1, 5)]

In [8]:
# Load the node table: read the workbook, then keep only its first 1326 rows.
node_df = pd.read_excel('Original_Dataset/Vaccination_data.xlsx')
node_df = node_df.iloc[:1326]
node_df.head();

In [9]:
# Drop the extra 'Unnamed: N' columns from the node table.
# `drop(columns=...)` removes them in a single call; passing the axis positionally
# (`drop(label, 1)`) is rejected by pandas 2.x, where `axis` is keyword-only.
# A missing column still raises KeyError, exactly as before.
node_col_drop = ['Unnamed: 1', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 10', 'Unnamed: 11']
node_df = node_df.drop(columns=node_col_drop)
node_df.head()

Unnamed: 0,selector,polarity,x,y,banner_cdc,fan_count,degree,in_degree
0,a_001,anti_vaccines,1567.709717,-24218.75,False,94.0,0.0,0.0
1,a_00100,anti_vaccines,-1835.667725,25781.25,,0.0,0.0,0.0
2,a_001001,anti_vaccines,160.108688,-749.809448,False,22175.0,3.0,3.0
3,a_001002,anti_vaccines,-137.362717,-676.494263,True,423.0,7.0,6.0
4,a_001003,anti_vaccines,181.973236,-107.275665,True,289.0,8.0,8.0


## Edge

In [11]:
# Load the edge table: the header is taken from row 2654 of the same workbook,
# and only the first 7675 rows read after it are kept.
edge_df = pd.read_excel('Original_Dataset/Vaccination_data.xlsx', header=[2654])
edge_df = edge_df.iloc[:7675]
edge_df;

In [12]:
# Drop the extra 'Unnamed: N' columns from the edge table.
# `drop(columns=...)` removes them in a single call; passing the axis positionally
# (`drop(label, 1)`) is rejected by pandas 2.x, where `axis` is keyword-only.
edge_col_drop = ['Unnamed: 1', 'Unnamed: 4', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14']
edge_df = edge_df.drop(columns=edge_col_drop)
# Drop extra rows: discard any row whose `source` cell is the literal text 'source'
# (i.e. a copy of the column header sitting among the data rows).
edge_df = edge_df[edge_df.source != 'source']
edge_df.head()

Unnamed: 0,source,target,polarity,T1,capture_date
0,p_00667,n_00381,pro_vaccines,False,2019-10-05 00:00:00
1,a_00502,a_001280,an#_vaccines,True,2019-10-05 00:00:00
2,p_00914,p_001274,pro_vaccines,True,2019-10-05 00:00:00
3,p_00914,p_001233,pro_vaccines,True,2019-10-05 00:00:00
4,p_00914,p_001142,pro_vaccines,True,2019-10-05 00:00:00


In [13]:
# Two snapshots of the edge table:
#   edge_df1 (T1) - only the rows whose T1 value equals the string 'TRUE'
#   edge_df2 (T2) - every row (same object as edge_df, not a copy)
edge_df1 = edge_df.loc[edge_df["T1"] == 'TRUE']
edge_df2 = edge_df

## Fan size

In [15]:
# Load the fan-size table: the header is taken from row 10330 of the same workbook,
# and only the first 1359 rows read after it are kept.
size_df = pd.read_excel('Original_Dataset/Vaccination_data.xlsx', header=[10330])
size_df = size_df.iloc[:1359]
size_df;

In [16]:
# Drop the extra 'Unnamed: N' columns from the fan-size table.
# `drop(columns=...)` removes them in a single call; passing the axis positionally
# (`drop(label, 1)`) is rejected by pandas 2.x, where `axis` is keyword-only.
size_col_drop = ['Unnamed: 1', 'Unnamed: 3', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14']
size_df = size_df.drop(columns=size_col_drop)
# Drop extra rows: discard any row whose `selector` cell is the literal text 'selector'
# (i.e. a copy of the column header sitting among the data rows).
size_df = size_df[size_df.selector != 'selector']
size_df.head()

Unnamed: 0,selector,key,value_t1,value_t2
0,a_00203,fan_count,260,261
1,a_00816,fan_count,3423,3507
2,p_00667,fan_count,1075,1400
3,p_00141,fan_count,720,734
4,a_00267,fan_count,116,117


In [17]:
# Collect each node's value_t1 / value_t2 from the size table, in node_df row order.
# The size table is scanned once into a selector -> (value_t1, value_t2) lookup
# instead of re-filtering the whole frame twice for every node. `setdefault`
# keeps the first row seen for a selector, matching a `.values[0]` pick.
size_lookup = {}
for selector, value_t1, value_t2 in zip(size_df.selector.values, size_df.value_t1.values, size_df.value_t2.values):
    size_lookup.setdefault(selector, (value_t1, value_t2))

# A node selector that is absent from the size table raises KeyError here.
fan_size_t1 = [size_lookup[selector][0] for selector in node_df.selector.values]
fan_size_t2 = [size_lookup[selector][1] for selector in node_df.selector.values]

In [18]:
# Attach the T1 / T2 fan sizes to the node table at column positions 8 and 9.
# `insert` modifies node_df in place and raises if the column already exists,
# so this cell cannot be re-run without rebuilding node_df first.
node_df.insert(loc=8, column="fan_size_t1", value=fan_size_t1)
node_df.insert(loc=9, column="fan_size_t2", value=fan_size_t2)

In [19]:
# Preview the first five rows of the node table, now including the fan-size columns.
node_df.head(n=5)

Unnamed: 0,selector,polarity,x,y,banner_cdc,fan_count,degree,in_degree,fan_size_t1,fan_size_t2
0,a_001,anti_vaccines,1567.709717,-24218.75,False,94.0,0.0,0.0,78,94
1,a_00100,anti_vaccines,-1835.667725,25781.25,,0.0,0.0,0.0,5082,0
2,a_001001,anti_vaccines,160.108688,-749.809448,False,22175.0,3.0,3.0,20773,22175
3,a_001002,anti_vaccines,-137.362717,-676.494263,True,423.0,7.0,6.0,409,423
4,a_001003,anti_vaccines,181.973236,-107.275665,True,289.0,8.0,8.0,294,289


# Create Graph

In [20]:
# One empty directed graph per snapshot: G1 for T1, G2 for T2.
G1 = nx.DiGraph()
G2 = nx.DiGraph()

In [22]:
# Add nodes
# Pull each attribute column out as a flat Python list. Calling `.values.tolist()`
# on a single column is already flat, so the quadratic `sum(list_of_lists, [])`
# flattening is not needed.
node_name = node_df["selector"].values.tolist()
node_polarity = node_df["polarity"].values.tolist()

# Coordinates as (x, y) tuples, one per node.
node_pos = [tuple(xy) for xy in node_df[["x", "y"]].values.tolist()]

fan_size_t1 = node_df["fan_size_t1"].values.tolist()
fan_size_t2 = node_df["fan_size_t2"].values.tolist()

# Both graphs get the same nodes, polarity and position; only fan_size differs
# (fan_size_t1 in G1, fan_size_t2 in G2).
for name, polarity, pos, size_t1, size_t2 in zip(node_name, node_polarity, node_pos, fan_size_t1, fan_size_t2):
    G1.add_node(name, polarity=polarity, pos=pos, fan_size=size_t1)
    G2.add_node(name, polarity=polarity, pos=pos, fan_size=size_t2)

In [23]:
# Add edges
# Edge lists as [source, target] pairs: edge1 feeds G1, edge2 feeds G2.
edge1 = edge_df1[["source", "target"]].values.tolist()
edge2 = edge_df2[["source", "target"]].values.tolist()

# Rows whose T1 value equals the string 'FALSE', kept as a frame and as a pair list.
edgeincrease_df = edge_df[edge_df.T1 == 'FALSE']
edge_increase = edgeincrease_df[["source", "target"]].values.tolist()

# Edges are added without attributes.
for source, target in edge1:
    G1.add_edge(source, target)

for source, target in edge2:
    G2.add_edge(source, target)

## Save Graph

In [25]:
# Persist both graphs with the standard-library pickle module.
# `nx.write_gpickle` was removed in NetworkX 3.0; it was a thin wrapper around
# `pickle.dump`, so the files written here keep the same format and can be
# read back with `pickle.load`.
for graph, path in ((G1, "Reorganized_Dataset/G1.gpickle"), (G2, "Reorganized_Dataset/G2.gpickle")):
    with open(path, "wb") as fh:
        pickle.dump(graph, fh, pickle.HIGHEST_PROTOCOL)

In [26]:
# Summary of G1 (type, node count, edge count). `nx.info` was removed in
# NetworkX 3.0; `str(G)` produces the same summary string.
str(G1)

'DiGraph with 1326 nodes and 5163 edges'

In [27]:
# Summary of G2 (type, node count, edge count). `nx.info` was removed in
# NetworkX 3.0; `str(G)` produces the same summary string.
str(G2)

'DiGraph with 1326 nodes and 7484 edges'

In [28]:
# Sanity check: attributes stored on node 'a_001' in G1 (fan_size comes from fan_size_t1).
G1.nodes['a_001']

{'polarity': 'anti_vaccines', 'pos': (1567.709717, -24218.75), 'fan_size': 78}

In [29]:
# Sanity check: attributes stored on node 'a_001' in G2 (fan_size comes from fan_size_t2).
G2.nodes['a_001']

{'polarity': 'anti_vaccines', 'pos': (1567.709717, -24218.75), 'fan_size': 94}

In [30]:
# Sanity check: this edge exists in G1. Edges were added without attributes,
# so the lookup returns an empty dict.
G1.edges[('a_001002', 'a_001251')]

{}

In [31]:
# Sanity check: this edge exists in G2. Edges were added without attributes,
# so the lookup returns an empty dict.
G2.edges[('a_001002', 'a_001251')]

{}