In [1]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

import warnings
warnings.filterwarnings('ignore')

# Track data loading
import psutil
import time

# Save&Load results
import pickle

# 1. Import Dataset

In [2]:
file_path = 'data/covid19_infodemic_RT_en.dat'

# Count the lines of the raw file up front; the total is used as the basis for
# `max_rows` when the columns are loaded with np.loadtxt.
# The context manager guarantees the file handle is closed once counting ends
# (a bare `open(...)` inside the generator would leave it open).
with open(file_path) as raw_file:
    num_rows = sum(1 for _line in raw_file)

In [3]:
# Display the number of lines counted in the raw data file.
num_rows 

114161884

- **data_str**: fromUserID, toUserID, newsCategory

In [4]:
# Load the three string-valued columns (fromUserID, toUserID, newsCategory)
# and report how long the load took and how much RAM is in use afterwards.
st = time.time()

# NOTE(review): max_rows stops one row short of the counted line total --
# presumably to leave out a trailing line; confirm against the raw file.
data_str = np.loadtxt(
    file_path,
    delimiter = '|',
    usecols = (3, 8, 13),
    dtype = 'str',
    max_rows = num_rows - 1,
)

et = time.time()
elapsed_time = et - st
print('Execution time:', elapsed_time/60, 'mins')

# Take one memory snapshot and read its named fields.
memory = psutil.virtual_memory()
# Percentage of virtual memory in use
print('RAM memory % used:', memory.percent)
# Virtual memory in use, in GB
print('RAM Used (GB):', memory.used/1000000000)

Execution time: 2.469197690486908 mins
RAM memory % used: 18.5
RAM Used (GB): 71.717261312


- **data_int**: tweetID, tweetTimestamp, fromUserIsBot, fromUserIsVerified, fromUserFollowers, fromUserFriends, toUserIsBot, toUserIsVerified, toUserFollowers, toUserFriends

In [5]:
# Load the ten integer-valued columns as int64 and report how long the load
# took and how much RAM is in use afterwards.
st = time.time()

# NOTE(review): max_rows stops one row short of the counted line total --
# presumably to leave out a trailing line; confirm against the raw file.
data_int = np.loadtxt(
    file_path,
    delimiter = "|",
    usecols = (0, 1, 4, 5, 6, 7, 9, 10, 11, 12),
    dtype = np.int64,
    max_rows = num_rows - 1,
)

et = time.time()
elapsed_time = et - st
print('Execution time:', elapsed_time/60, 'mins')

# Take one memory snapshot and read its named fields.
memory = psutil.virtual_memory()
# Percentage of virtual memory in use
print('RAM memory % used:', memory.percent)
# Virtual memory in use, in GB
print('RAM Used (GB):', memory.used/1000000000)

Execution time: 1.5251804312070212 mins
RAM memory % used: 20.7
RAM Used (GB): 80.679346176


- **data cleaning**

In [6]:
# Restore the previously saved row indices to keep; the graph-building loops
# iterate over them to select rows of data_str / data_int.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('Results/index_retain.pickle', 'rb') as fh:
    index_retain = pickle.load(fh)

- **ID mapping**

In [7]:
# Restore the saved user-ID decoding table; it is inverted into
# `user_dict_encode` right after loading.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('mappingID/user_dict_decode.pickle', 'rb') as fh:
    user_dict_decode = pickle.load(fh)

In [8]:
# Invert the decoding table: each value of `user_dict_decode` becomes a key
# that maps back to its original key.
user_dict_encode = {raw_id: code for code, raw_id in user_dict_decode.items()}

In [9]:
# Restore the saved news-category decoding table; it is inverted into
# `newsCateogry_dict_encode` right after loading.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('mappingID/newsCateogry_dict_decode.pickle', 'rb') as fh:
    newsCateogry_dict_decode = pickle.load(fh)

In [10]:
# Invert the decoding table: each value of `newsCateogry_dict_decode` becomes
# a key that maps back to its original key.
newsCateogry_dict_encode = {
    label: code for code, label in newsCateogry_dict_decode.items()
}

# 3. Construct Network (Unattributed)

**Weighted Directed Edges**: Each edge's weight is the number of retained tweets from the source user to the target user

- Construct Network

In [16]:
# Start from an empty directed graph with one node per encoded user ID
# (0 .. number of users - 1); edges are added in the next cell.
num_users = len(user_dict_encode)
G_unattributed = nx.DiGraph()
G_unattributed.add_nodes_from(np.arange(num_users))

In [17]:
# Build the weighted directed edges of the unattributed graph: every retained
# row contributes 1 to the weight of the edge from_user -> to_user.
st = time.time()

# Take the column views once instead of re-slicing data_str on every iteration.
from_ids = data_str[:, 0]
to_ids = data_str[:, 1]

for i in index_retain:
    from_user = user_dict_encode[from_ids[i]]
    to_user = user_dict_encode[to_ids[i]]

    # Add edge: bump the weight of an existing edge, otherwise create it.
    if G_unattributed.has_edge(from_user, to_user):
        G_unattributed.edges[from_user, to_user]["weight"] += 1
    else:
        G_unattributed.add_edge(from_user, to_user, weight = 1)

et = time.time()
elapsed_time = et - st

print('Execution time:', elapsed_time/60, 'mins')
# Take one memory snapshot and read its named fields.
memory = psutil.virtual_memory()
# Percentage of virtual memory in use
print('RAM memory % used:', memory.percent)
# Virtual memory in use, in GB
print('RAM Used (GB):', memory.used/1000000000)

Execution time: 14.819688963890076 mins
RAM memory % used: 30.9
RAM Used (GB): 121.70164224


- Network Demo

In [19]:
# Inspect node 2's attribute dict (nodes of this graph were added without attributes).
G_unattributed.nodes[2]

{}

In [20]:
# Inspect the attributes stored on the directed edge 2 -> 6716547.
G_unattributed.edges[(2, 6716547)]

{'weight': 4}

In [21]:
# Inspect the attributes stored on the directed edge 1 -> 4794247.
G_unattributed.edges[(1, 4794247)]

{'weight': 1}

- Save network

In [22]:
# Persist the unattributed graph as a pickle file.
# nx.write_gpickle was removed in NetworkX 3.0; dumping with the standard
# `pickle` module writes the same kind of file and works on every NetworkX
# version. Read it back with pickle.load.
# NOTE(review): "origial" in the file name looks like a typo for "original";
# it is kept unchanged so anything that already reads this path keeps working.
with open("data/graphs/G_origial_unattributed.gpickle", "wb") as graph_file:
    pickle.dump(G_unattributed, graph_file, pickle.HIGHEST_PROTOCOL)

# 4. Construct Network (Attributed)

**Nodes**: Each node is attributed with its tweetTimestamps and the corresponding follower count, friend count, bot-detection flag and verification status recorded at each timestamp

**Weighted Directed Edges**: Each edge's weight is the number of retained tweets from the source user to the target user; each edge is also attributed with the tweetTimestamps and newsCategory of those tweets

- Construct Network

In [11]:
def initialise_network_node(G, attr):
    """Give every node of ``G`` a fresh, empty list under the attribute ``attr``.

    The nodes are taken from the graph itself rather than from a global ID
    table, so the function works for any graph it is handed.

    Parameters
    ----------
    G : graph
        Graph whose nodes are initialised in place. ``G.nodes`` must be
        iterable and ``G.nodes[n]`` must return the node's mutable attribute
        dict (as NetworkX graphs do).
    attr : str
        Name of the node attribute to create (or reset).

    Returns
    -------
    None
        ``G`` is modified in place.
    """
    for node in G.nodes:
        # A new list per node: sharing one list object would make every
        # node's history alias the same data.
        G.nodes[node][attr] = []

In [12]:
# Create the attributed graph: one node per encoded user ID, each starting with
# an empty history list for every tracked node attribute.
# (networkx is already imported as `nx` in the notebook's first cell, so it is
# not re-imported here.)
G_attributed = nx.DiGraph()
G_attributed.add_nodes_from(np.arange(0, len(user_dict_encode), 1))

for node_attr in ("timeStamps", "followers", "friends", "bot", "verified"):
    initialise_network_node(G_attributed, node_attr)

In [14]:
# Populate the attributed graph from every retained row:
#   * both endpoint nodes get the tweet timestamp and their own follower count,
#     friend count, bot flag and verified flag appended to their history lists;
#   * the edge from_user -> to_user gets its weight incremented and the
#     timestamp / news category appended.
st = time.time()

# Take the string column views once instead of re-slicing on every iteration.
from_ids = data_str[:, 0]
to_ids = data_str[:, 1]
category_labels = data_str[:, -1]

for i in index_retain:
    from_user = user_dict_encode[from_ids[i]]
    to_user = user_dict_encode[to_ids[i]]

    # Read the integer row once. Column layout of data_int:
    #   1: timestamp | 2-5: from-user bot, verified, followers, friends
    #                | 6-9: to-user   bot, verified, followers, friends
    row = data_int[i]
    timestamp = row[1]

    # Add node attributes (from-user first, then to-user).
    endpoint_records = (
        (from_user, row[4], row[5], row[2], row[3]),
        (to_user, row[8], row[9], row[6], row[7]),
    )
    for user, follower_num, friend_num, is_bot, is_verified in endpoint_records:
        node_data = G_attributed.nodes[user]
        node_data["timeStamps"].append(timestamp)
        node_data["followers"].append(follower_num)
        node_data["friends"].append(friend_num)
        node_data["bot"].append(is_bot)
        node_data["verified"].append(is_verified)

    # Add edge: extend an existing edge, otherwise create it.
    newsCategory = newsCateogry_dict_encode[category_labels[i]]

    if G_attributed.has_edge(from_user, to_user):
        edge_data = G_attributed.edges[from_user, to_user]
        edge_data["weight"] += 1
        edge_data["timeStamps"].append(timestamp)
        edge_data["newsCategories"].append(newsCategory)
    else:
        G_attributed.add_edge(from_user, to_user, weight = 1, timeStamps = [timestamp], newsCategories = [newsCategory])

et = time.time()
elapsed_time = et - st

print('Execution time:', elapsed_time/60, 'mins')
# Take one memory snapshot and read its named fields.
memory = psutil.virtual_memory()
# Percentage of virtual memory in use
print('RAM memory % used:', memory.percent)
# Virtual memory in use, in GB
print('RAM Used (GB):', memory.used/1000000000)

Execution time: 46.210902547836305 mins
RAM memory % used: 46.3
RAM Used (GB): 183.816839168


- Graph Demo

In [15]:
# Inspect the attribute lists accumulated for node 2 (the output can be very long).
G_attributed.nodes[2]

{'timeStamps': [1648134989,
  1655339450,
  1655745435,
  1656693022,
  1657327488,
  1657641083,
  1657798795,
  1657801155,
  1658655141,
  1658798492,
  1659101815,
  1659137207,
  1660350179,
  1660611873,
  1660868454,
  1660868729,
  1661303199,
  1661864309,
  1661998205,
  1662170215,
  1663244690,
  1663254077,
  1663254547,
  1663255679,
  1663477716,
  1663477740,
  1663714563,
  1663896188,
  1664802550,
  1664805614,
  1664892010,
  1664894490,
  1664900594,
  1664904542,
  1664904658,
  1664975172,
  1664980615,
  1665016018,
  1665538674,
  1665944392,
  1666035337,
  1666097379,
  1667329353,
  1667427114,
  1667752492,
  1667962436,
  1668809635,
  1668985555,
  1669597004,
  1669766556,
  1669766698,
  1670176980,
  1670264155,
  1670264298,
  1670343738,
  1670435811,
  1670476924,
  1671331098,
  1671331757,
  1671664362,
  1671909976,
  1672501059,
  1672667372,
  1672775025,
  1672876871,
  1672932547,
  1673132543,
  1673218520,
  1673298762,
  1673378071,
  1673

In [16]:
# Inspect the weight, timestamps and news categories stored on edge 2 -> 6716547.
G_attributed.edges[(2, 6716547)]

{'weight': 4,
 'timeStamps': [1674921343, 1675131132, 1675178972, 1675442728],
 'newsCategories': [4, 4, 4, 4]}

In [17]:
# Inspect the weight, timestamps and news categories stored on edge 1 -> 4794247.
G_attributed.edges[(1, 4794247)]

{'weight': 1, 'timeStamps': [1615181439], 'newsCategories': [6]}

- Save Graph

In [None]:
# Persist the attributed graph as a pickle file and report the time taken and
# the RAM in use afterwards.
st = time.time()

# nx.write_gpickle was removed in NetworkX 3.0; dumping with the standard
# `pickle` module writes the same kind of file and works on every NetworkX
# version. Read it back with pickle.load.
# NOTE(review): "origial" in the file name looks like a typo for "original";
# it is kept unchanged so anything that already reads this path keeps working.
with open("data/graphs/G_origial.gpickle", "wb") as graph_file:
    pickle.dump(G_attributed, graph_file, pickle.HIGHEST_PROTOCOL)

et = time.time()
elapsed_time = et - st

print('Execution time:', elapsed_time/60, 'mins')
# Take one memory snapshot and read its named fields.
memory = psutil.virtual_memory()
# Percentage of virtual memory in use
print('RAM memory % used:', memory.percent)
# Virtual memory in use, in GB
print('RAM Used (GB):', memory.used/1000000000)