# Bursty Train Analysis

In [1]:
import os
import pandas as pd
import joblib
import numpy as np
from collections import Counter


#-----------------------------IMPORTANT-------------------------------------------
### VERY IMPORTANT: Changing the path from library to library2 change the folder of original results
import sys
sys.path.insert(0, '../Script/library3/')

from utilities import *
from randomization import *
from burst_func import *
import networkx as nx
from tqdm import tqdm

In [122]:
def burst_train(inter_ev, dt):
    """
    Calculate burst train statistics from a list of inter-event times.

    Args:
        inter_ev (list): List of inter-event times.
        dt (float): Time interval for burst detection.

    Returns:
        tuple: A tuple containing burst counts and burst distribution.
    """
    ev_distr = np.zeros(len(inter_ev))
    i = 0

    cnt = Counter()
    d = 0
    c = 1
    ev_distr[0] = d
    
    while i < len(inter_ev):
        if inter_ev[i] <= dt:
            ev_distr[i + 1] = d
            c += 1
            i += 1
        else:
            cnt.update(Counter([c]))
            d += 1
            if i < len(inter_ev) - 1:
                ev_distr[i + 1] = d
            i += 1
            c = 1
            continue
    
    # Assert that the burst counts match the burst distribution sum
    assert (sum([k * v for k, v in cnt.items()]) == np.sum([k * v for k, v in Counter(Counter(ev_distr).values()).items()]))
    
    return cnt, ev_distr

def get_burst_train(df, dt):
    """
    Calculate burst train statistics for a DataFrame of events.

    Args:
        df (DataFrame): DataFrame containing event data.
        dt (float): Time interval for burst detection.

    Returns:
        tuple: A tuple containing burst counts and the modified DataFrame with burst information.
    """
    df_rest = df.copy()
    int_ev_time = [x for x in (df_rest.timestamp.shift(-1) - df_rest.timestamp).values if ((x != np.nan))]
    b_train = burst_train(int_ev_time, dt)[0]
    bursts = burst_train(int_ev_time, dt)[1]
    df['burst'] = np.array(bursts)
    
    return Counter(b_train), df


### Compute Trains of egonetwork activity time series for Randomized reference models

In [118]:
def compute_bursty_trains(df, line_graph, dt_list, graph_list, seed, title1, include_single_link_burst=False, reproduce_paper=False):
    """
    Calculate bursty trains for a given DataFrame and save the results.

    Args:
        df (DataFrame): Input DataFrame containing temporal network data.
        line_graph (Graph): Graph representing the network.
        dt_list (list): List of time intervals to consider.
        graph_list (list): List of graphs to use in computations (e.g., ['G'], ['G1', 'G2', 'G3']).
        seed (int): Random seed for shuffling data.
        title1 (str): Title for the results.
        include_single_link_burst (bool): Include burst counts for single links.
        reproduce_paper (bool): Reproduce the paper's results.

    Returns:
        dict: A dictionary containing the computed bursty trains.
    """
    # Check assertions based on the input parameters
    assert graph_list
    if reproduce_paper:
        assert (graph_list == ['G']) or (graph_list == ['G1', 'G2', 'G3'])
        assert include_single_link_burst

    # Initialize dictionaries to store the results
    overall_dictionary = dict()
    df_dictionary = dict()

    # Loop through the graph list and initialize sub-dictionaries
    for k in graph_list:
        overall_dictionary[k] = dict()
        if include_single_link_burst:
            overall_dictionary[k]['burst_cnt_link'] = dict()
        overall_dictionary[k]['burst_cnt_ego'] = dict()
        overall_dictionary[k]['n_link_cnt'] = dict()

        # Create a copy of the DataFrame for each graph type
        if k == 'G':
            df_dictionary[k] = df.copy()
        elif k == 'G1':
            df_dictionary[k] = permute_timestamps(df, seed)  # Timestamp reshuffle
        elif k == 'G2':
            df_dictionary[k] = shuffle_df(df, seed)  # Time series reshuffle
        elif k == 'G3':
            df_dictionary[k] = random_df_same_weight(df, seed)  # Time series reshuffle with same n contacts

    # Loop through the specified time intervals
    for dt in dt_list:
        dt_dictionary = dict()

        # Loop through the graph list for each time interval
        for k in graph_list:
            dt_dictionary[k] = dict()
            if include_single_link_burst:
                dt_dictionary[k]['burst_cnt_link'] = Counter()
            dt_dictionary[k]['burst_cnt_ego'] = Counter()
            dt_dictionary[k]['n_link_cnt'] = Counter()

        # Iterate through unique nodes in the network
        for nodes in tqdm(df.nodes.unique()):
            neigh_set = (set(list(nx.neighbors(line_graph, nodes))) | set([nodes]))

            # Loop through the graph list for each node
            for k in graph_list:
                df_graph = df_dictionary[k]

                if include_single_link_burst:
                    df_link = df_graph[df_graph.nodes == nodes]
                    brst_cnt_link, _ = get_burst_train(df_link, dt)
                    dt_dictionary[k]['burst_cnt_link'].update(brst_cnt_link)

                df_ego = df_graph[df_graph.nodes.isin(neigh_set)]
                brst_cnt_ego, df_ego = get_burst_train(df_ego, dt)
                dt_dictionary[k]['burst_cnt_ego'].update(brst_cnt_ego)

                cnt_n_links_per_train = Counter(
                    df_ego.groupby('burst')['nodes'].apply(lambda x: (float(len(set(x))), float(len(list(x))))).values)
                dt_dictionary[k]['n_link_cnt'].update(cnt_n_links_per_train)

        # Store the results in the overall dictionary
        for k in graph_list:
            if include_single_link_burst:
                overall_dictionary[k]['burst_cnt_link'][dt] = dt_dictionary[k]['burst_cnt_link']
            overall_dictionary[k]['burst_cnt_ego'][dt] = dt_dictionary[k]['burst_cnt_ego']
            overall_dictionary[k]['n_link_cnt'][dt] = dt_dictionary[k]['n_link_cnt']

        # If reproducing the paper, save the results to files
        if reproduce_paper:
            if graph_list == ['G']:
                file_2_save = [overall_dictionary[k]['burst_cnt_link'][dt], overall_dictionary[k]['burst_cnt_ego'][dt],
                               Counter(), Counter(), Counter(), Counter()]
                file_2_save_link = [overall_dictionary[k]['n_link_cnt'][dt], Counter(), Counter(), Counter()]

                try:
                    os.mkdir('../Results/Bursty_trains/' + title1 + '/')
                except:
                    print('../Results/Bursty_trains/' + title1 + '/')

                title_burst_cnt = '../Results/Bursty_trains/' + title1 + '/' + title1 + '_' + str(dt)
                assert file_2_save[:2] == joblib.load(title_burst_cnt + '.joblib')[:2]

                title_link_cnt = '../Results/Bursty_trains/' + title1 + '/' + title1 + '_n_links_' + str(dt)
                assert file_2_save_link[0] == joblib.load(title_link_cnt + '.joblib')[0]

            elif graph_list == ['G1', 'G2', 'G3']:
                # The order of the list is weird, but this is the same as the code used for computing results in the paper.
                # It's present only in the option to reproduce the paper.
                file_2_save = [overall_dictionary['G2']['burst_cnt_ego'][dt], overall_dictionary['G1']['burst_cnt_link'][dt],
                               overall_dictionary['G1']['burst_cnt_ego'][dt], overall_dictionary['G3']['burst_cnt_ego'][dt]]
                file_2_save_link = [overall_dictionary['G2']['n_link_cnt'][dt], overall_dictionary['G1']['n_link_cnt'][dt],
                                    overall_dictionary['G3']['n_link_cnt'][dt]]

                try:
                    os.mkdir('../Results/Bursty_trains/' + title1 + 'rand/')
                except:
                    print('../Results/Bursty_trains/' + title1 + 'rand/')

                title_burst_cnt = '../Results/Bursty_trains/' + title1 + 'rand/' + title1 + '_' + str(dt) + '_' + str(seed)
                assert file_2_save == joblib.load(title_burst_cnt + '.joblib')

                title_link_cnt = '../Results/Bursty_trains/' + title1 + 'rand/' + title1 + '_n_links_' + str(dt) + '_' + str(seed)
                assert file_2_save_link == joblib.load(title_link_cnt + '.joblib')

    return overall_dictionary


### Compute Trains for time series of activity of top 10% most active links for Data

In [119]:

def compute_bursty_trains_most_active(df,line_graph,dt_list,title1,most_active = 0.1): 
    for dt in dt_list:

        cnt_tot = {}
        link_cnt_tot = {}
        most_active_links = df.groupby(df.nodes).size().sort_values(ascending = False)[:int(float(len(g.nodes))*most_active)].index
        link_weights = df.groupby(df.nodes).size()
        ### include all links with weight w>=w_min
        w_min = link_weights.sort_values(ascending = False)[:int(float(len(g.nodes))*0.1)].iloc[-1]
        most_active_links = node_rank[node_rank>=minimum_value].index


        cnt_n = Counter()
        cnt1_n_links = Counter()



        k = 0
        for nodes in tqdm(most_active_links):


            df_rest_most_active_links = df[df.nodes == nodes]   


            brst_cnt_most_active_links,df_rest_most_active_links = get_burst_train(df_rest_most_active_links,dt)






            cnt_n.update(brst_cnt_neigh)               ### data



        cnt_tot[title1] = cnt_n
        try:os.mkdir('../Results/Bursty_trains_most_active/')
        except: print 'folder already existing'
        #assert cnt_tot == joblib.load('../Results/Bursty_trains_most_active/'+title1+'_'+str(dt)+'.joblib')       
        joblib.dump(cnt_tot,'../Results/Bursty_trains_most_active/'+title1+'_'+str(dt)+'.joblib')       


### Example

In [120]:
seed = 1
dt_list = [60]
print seed


title1 = 'tij_lnVS'
df = get_df(title1)
g = get_linegraph(title1)
    

1


In [121]:
cc = compute_bursty_trains(df,g,dt_list,['G'],seed,title1,include_single_link_burst = True, reproduce_paper = True)

dd = compute_bursty_trains(df,g,dt_list,['G1','G2','G3'],seed,title1,include_single_link_burst = True, reproduce_paper = True)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████| 755/755 [01:33<00:00,  8.06it/s]


../Results/Bursty_trains/tij_lnVS/


  0%|          | 0/755 [00:00<?, ?it/s]

True


100%|██████████| 755/755 [05:17<00:00,  2.38it/s]

../Results/Bursty_trains/tij_lnVSrand/



