In [1]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

import warnings
warnings.filterwarnings('ignore')

# Track data loading
import psutil
import time

# Save&Load results
import pickle

In [2]:
def load_graph(address, verbose = True):
    """Load a pickled graph object from disk.

    Parameters
    ----------
    address : str or path-like
        Path of the pickle file to read.
    verbose : bool, optional
        When true (the default), print how long the load took (in minutes)
        and the RAM usage reported by ``psutil.virtual_memory()`` afterwards.

    Returns
    -------
    object
        Whatever object the pickle file contains.

    Notes
    -----
    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the
    file. Only call this function on files that come from a trusted source.
    """
    # perf_counter() is monotonic, so a system clock adjustment during a
    # multi-minute load cannot distort the reported duration.
    started = time.perf_counter()

    with open(address, 'rb') as f:
        G = pickle.load(f)

    elapsed_time = time.perf_counter() - started

    if verbose:
        print('Execution time:', elapsed_time/60, 'mins')
        # Take a single snapshot so both figures describe the same instant,
        # and read the fields by name instead of by tuple position.
        memory = psutil.virtual_memory()
        print('RAM memory % used:', memory.percent)
        # Reported in decimal gigabytes (bytes / 1e9).
        print('RAM Used (GB):', memory.used/1000000000)

    return G

In [3]:
def print_graph_info(G):
    """Print a four-line summary of graph ``G``.

    The lines report, in order: the Python type of ``G``, its node count,
    its edge count, and ``G.size(weight="weight")``.
    """
    # Each value is computed lazily, right before its line is printed.
    summary = (
        ("Graph Type: ", lambda: type(G)),
        ("Node Num: ", lambda: G.number_of_nodes()),
        ("Edge Num: ", lambda: G.number_of_edges()),
        ("Weight Sum: ", lambda: G.size(weight="weight")),
    )
    for label, compute in summary:
        print(label, compute())
    print("Weight Sum: ", G.size(weight="weight"))

# 1. Original Graph

From the entire dataset, we construct a directed, weighted network with node and edge attributes.

In [4]:
G_original = load_graph("Graphs/G_original.gpickle")

Execution time: 18.905131928126018 mins
RAM memory % used: 61.4
RAM Used (GB): 245.243461632


In [5]:
print_graph_info(G_original)

Graph Type:  <class 'networkx.classes.digraph.DiGraph'>
Node Num:  13948159
Edge Num:  72331737
Weight Sum:  113685100.0


In [6]:
G_original.nodes[2]  # example

{'timeStamps': array([1648134989, 1655339450, 1655745435, 1656693022, 1657327488,
        1657641083, 1657798795, 1657801155, 1658655141, 1658798492,
        1659101815, 1659137207, 1660350179, 1660611873, 1660868454,
        1660868729, 1661303199, 1661864309, 1661998205, 1662170215,
        1663244690, 1663254077, 1663254547, 1663255679, 1663477716,
        1663477740, 1663714563, 1663896188, 1664802550, 1664805614,
        1664892010, 1664894490, 1664900594, 1664904542, 1664904658,
        1664975172, 1664980615, 1665016018, 1665538674, 1665944392,
        1666035337, 1666097379, 1667329353, 1667427114, 1667752492,
        1667962436, 1668809635, 1668985555, 1669597004, 1669766556,
        1669766698, 1670176980, 1670264155, 1670264298, 1670343738,
        1670435811, 1670476924, 1671331098, 1671331757, 1671664362,
        1671909976, 1672501059, 1672667372, 1672775025, 1672876871,
        1672932547, 1673132543, 1673218520, 1673298762, 1673378071,
        1673472524, 1673640577, 16

In [7]:
G_original.edges[(2, 1480708)]  # example

{'weight': 5,
 'timeStamps': array([1663896188, 1664904542, 1664904658, 1670264155, 1670264298]),
 'newsCategories': array([4, 3, 8, 8, 4])}

Since loading the attributed original network takes time, we also provide an unattributed version for faster loading when attributes are not needed. 

In [8]:
G_original_unattributed = load_graph("Graphs/G_original_unattributed.gpickle")

Execution time: 2.355726182460785 mins
RAM memory % used: 14.5
RAM Used (GB): 55.797424128


In [16]:
print_graph_info(G_original_unattributed)

Graph Type:  <class 'networkx.classes.digraph.DiGraph'>
Node Num:  13948159
Edge Num:  72331737
Weight Sum:  113685100.0


In [10]:
G_original_unattributed.nodes[2] # example

{}

In [13]:
G_original_unattributed.edges[(2, 1480708)] # example

{'weight': 5}

# 2. Filtered Graph

Here we filter the original network to retain only its statistically significant edges.

In [17]:
G_filtered = load_graph("Graphs/G_filtered.gpickle")

Execution time: 1.415244960784912 mins
RAM memory % used: 17.9
RAM Used (GB): 69.35392256


In [18]:
print_graph_info(G_filtered)

Graph Type:  <class 'networkx.classes.digraph.DiGraph'>
Node Num:  767553
Edge Num:  2165564
Weight Sum:  29991328.0


In [19]:
G_filtered.nodes[2] # example

{'timeStamps': array([1648134989, 1655339450, 1655745435, 1656693022, 1657327488,
        1657641083, 1657798795, 1657801155, 1658655141, 1658798492,
        1659101815, 1659137207, 1660350179, 1660611873, 1660868454,
        1660868729, 1661303199, 1661864309, 1661998205, 1662170215,
        1663244690, 1663254077, 1663254547, 1663255679, 1663477716,
        1663477740, 1663714563, 1663896188, 1664802550, 1664805614,
        1664892010, 1664894490, 1664900594, 1664904542, 1664904658,
        1664975172, 1664980615, 1665016018, 1665538674, 1665944392,
        1666035337, 1666097379, 1667329353, 1667427114, 1667752492,
        1667962436, 1668809635, 1668985555, 1669597004, 1669766556,
        1669766698, 1670176980, 1670264155, 1670264298, 1670343738,
        1670435811, 1670476924, 1671331098, 1671331757, 1671664362,
        1671909976, 1672501059, 1672667372, 1672775025, 1672876871,
        1672932547, 1673132543, 1673218520, 1673298762, 1673378071,
        1673472524, 1673640577, 16

In [20]:
G_filtered.edges[(2, 1480708)] # example

{'weight': 5,
 'timeStamps': array([1663896188, 1664904542, 1664904658, 1670264155, 1670264298]),
 'newsCategories': array([4, 3, 8, 8, 4]),
 'newsCategoriesGeneralised': array([2, 0, 0, 0, 2])}

# 3. Temporal Subgraphs

We also provide the following functions to extract subgraphs that contain only the retweets made during a specific time period.

In [21]:
from datetime import datetime

def cal_timestamps(date, start_date='2020-03-17', start_timestamp=1584482894):
    """Convert a calendar date into a Unix timestamp relative to a reference.

    The result is ``start_timestamp`` shifted by the whole number of days
    between ``start_date`` and ``date``.

    Parameters
    ----------
    date : str
        Target date formatted as ``'YYYY-MM-DD'``.
    start_date : str, optional
        Reference date formatted as ``'YYYY-MM-DD'``.
    start_timestamp : int, optional
        Unix timestamp (seconds) paired with ``start_date``.

    Returns
    -------
    int
        ``start_timestamp + days_between * 86400``. Dates earlier than
        ``start_date`` yield a value smaller than ``start_timestamp``.

    Raises
    ------
    ValueError
        If ``date`` or ``start_date`` does not match ``'%Y-%m-%d'``.

    Notes
    -----
    The default ``start_timestamp`` 1584482894 is 2020-03-17 22:08:14 UTC,
    so with the defaults every returned timestamp falls at 22:08:14 UTC on
    the requested date, not at midnight.
    NOTE(review): presumably this is the timestamp of the first record in
    the dataset -- confirm before relying on day-aligned windows.
    """
    seconds_per_day = 24 * 60 * 60

    reference_day = datetime.strptime(start_date, '%Y-%m-%d')
    target_day = datetime.strptime(date, '%Y-%m-%d')

    # Both values are parsed from pure dates, so the difference is always a
    # whole number of days (negative when `date` precedes `start_date`).
    days_elapsed = (target_day - reference_day).days

    return start_timestamp + days_elapsed * seconds_per_day

In [25]:
def _slice_event_arrays(attrs, names, mask):
    """Return ``{name: attrs[name][mask]}`` for each of ``names`` present in ``attrs``."""
    return {name: attrs[name][mask] for name in names if name in attrs}


def get_temporal_subgraph(G, latest_start, earliest_end, verbose = True):
    """Build a directed subgraph restricted to a closed time window.

    A node or edge of ``G`` is kept when at least one entry of its
    ``timeStamps`` attribute lies in ``[latest_start, earliest_end]`` (both
    bounds inclusive). Its array attributes are cut down to the entries
    inside the window, and an edge's ``weight`` becomes the number of
    timestamps inside the window.

    Parameters
    ----------
    G : networkx graph
        Source graph. Nodes and edges without a ``timeStamps`` attribute are
        ignored. Assumes every listed attribute is an array aligned
        element-wise with ``timeStamps`` -- TODO confirm against the code
        that builds the graphs.
    latest_start, earliest_end : number
        Window bounds, compared directly against the ``timeStamps`` values.
    verbose : bool, optional
        When true (the default), print the run time in minutes and the RAM
        usage reported by ``psutil.virtual_memory()``.

    Returns
    -------
    networkx.DiGraph
        The windowed subgraph. Node attributes copied when present:
        ``timeStamps``, ``followers``, ``friends``, ``bot``, ``verified``.
        Edge attributes: ``weight`` plus, when present, ``timeStamps``,
        ``newsCategories``, ``newsCategoriesGeneralised``.

    Notes
    -----
    Attributes missing on the source graph are skipped rather than raising
    ``KeyError``, so the function also works on graphs whose edges carry no
    ``newsCategoriesGeneralised``.
    """
    node_fields = ("timeStamps", "followers", "friends", "bot", "verified")
    edge_fields = ("timeStamps", "newsCategories", "newsCategoriesGeneralised")

    # perf_counter() is monotonic, unlike the wall clock.
    started = time.perf_counter()

    subG = nx.DiGraph()

    # Iterate the attribute dicts directly: this avoids materialising a full
    # {node: timeStamps} copy and repeated G.nodes[k] lookups on large graphs.
    for node, attrs in G.nodes(data=True):
        stamps = attrs.get("timeStamps")
        if stamps is None:
            continue
        in_window = np.logical_and(stamps >= latest_start, stamps <= earliest_end)
        if in_window.any():
            subG.add_node(node, **_slice_event_arrays(attrs, node_fields, in_window))

    for source, target, attrs in G.edges(data=True):
        stamps = attrs.get("timeStamps")
        if stamps is None:
            continue
        in_window = np.logical_and(stamps >= latest_start, stamps <= earliest_end)
        if in_window.any():
            # add_edge() creates any endpoint that was not added by the node
            # pass above; such a node ends up in subG without attributes.
            subG.add_edge(source, target, weight = in_window.sum(),
                          **_slice_event_arrays(attrs, edge_fields, in_window))

    elapsed_time = time.perf_counter() - started

    if verbose:
        print('Execution time:', elapsed_time/60, 'mins')
        # One snapshot so both figures describe the same instant; fields are
        # read by name instead of by tuple position.
        memory = psutil.virtual_memory()
        print('RAM memory % used:', memory.percent)
        # Reported in decimal gigabytes (bytes / 1e9).
        print('RAM Used (GB):', memory.used/1000000000)

    return subG

- Example 1 (also stored as "G_filtered_sub1.gpickle")

In [23]:
latest_start = cal_timestamps("2020-11-17")
earliest_end = cal_timestamps("2021-04-17")

In [26]:
G_filtered_sub1 = get_temporal_subgraph(G_filtered, latest_start, earliest_end)

Execution time: 0.9564726670583089 mins
RAM memory % used: 18.7
RAM Used (GB): 72.53110784


In [27]:
print_graph_info(G_filtered_sub1)

Graph Type:  <class 'networkx.classes.digraph.DiGraph'>
Node Num:  589056
Edge Num:  1181826
Weight Sum:  5625119.0


In [31]:
G_filtered_sub1.nodes[26] # example

{'timeStamps': array([1605672432, 1606289507, 1606521767, 1607032140, 1607063024,
        1607670568, 1607836752, 1607839129, 1607895469, 1607905374,
        1607906033, 1607973754, 1608190572, 1608231168, 1608355516,
        1608877286, 1610940635, 1612057318, 1613366818, 1613790610,
        1615276042, 1616051274, 1616535967]),
 'followers': array([2930, 2945, 2948, 2957, 2957, 2977, 2976, 2976, 2977, 2977, 2977,
        2978, 2988, 2988, 2991, 2986, 3003, 3016, 3042, 3041, 3065, 3073,
        3081]),
 'friends': array([5002, 5000, 4999, 4995, 4995, 4999, 5000, 5000, 4999, 4999, 4999,
        4996, 5001, 5001, 5000, 4989, 4999, 4996, 4997, 4999, 4993, 4990,
        4984]),
 'bot': array([0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0]),
 'verified': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0])}

In [34]:
G_filtered_sub1.edges[(26, 10628236)] # example

{'weight': 2,
 'timeStamps': array([1605672432, 1612057318]),
 'newsCategories': array([9, 3]),
 'newsCategoriesGeneralised': array([2, 0])}

- Example 2 (also stored as "G_filtered_sub2.gpickle")

In [35]:
latest_start = cal_timestamps("2021-11-17")
earliest_end = cal_timestamps("2022-04-17")

In [36]:
G_filtered_sub2 = get_temporal_subgraph(G_filtered, latest_start, earliest_end)

Execution time: 0.8051692962646484 mins
RAM memory % used: 21.5
RAM Used (GB): 83.987980288


In [37]:
print_graph_info(G_filtered_sub2)

Graph Type:  <class 'networkx.classes.digraph.DiGraph'>
Node Num:  437991
Edge Num:  729156
Weight Sum:  2983210.0


In [38]:
G_filtered_sub2.nodes[26] # example

{'timeStamps': array([1642537314]),
 'followers': array([3196]),
 'friends': array([4978]),
 'bot': array([0]),
 'verified': array([0])}

In [41]:
G_filtered_sub2.edges[(148, 1166275)] # example

{'weight': 2,
 'timeStamps': array([1640647341, 1641074689]),
 'newsCategories': array([4, 4]),
 'newsCategoriesGeneralised': array([2, 2])}