# Bursty Train Analysis

In [1]:
import os
import pandas as pd
import joblib
import numpy as np
from collections import Counter


#-----------------------------IMPORTANT-------------------------------------------
### VERY IMPORTANT: Changing the path from library to library2 change the folder of original results
import sys
sys.path.insert(0, '../Script/library3/')

from utilities import *
from randomization import *
from burst_func import *
import networkx as nx
from tqdm import tqdm

In [24]:
def burst_train(inter_ev,dt):
    ev_distr = np.zeros(len(inter_ev))
    i = 0
    
    cnt = Counter()
    d = 0
    c = 1
    ev_distr[0] = d
    while i < len(inter_ev):
        
        
        if inter_ev[i]<= dt:
            
            ev_distr[i+1] = d
            c+=1
            
            i +=1
    
        else:
            
            cnt.update(Counter([c]))
            
            d +=1
            
            if i<len(inter_ev)-1:ev_distr[i+1] = d
            
            i+=1
            c = 1
            
            continue
    assert (sum([k*v for k,v in cnt.items()]) == np.sum([k*v for k,v in Counter(Counter(ev_distr).values()).items()]))
    return cnt,ev_distr


def get_burst_train(df,dt):
    
    df_rest = df.copy()
    int_ev_time = [x for x in (df_rest.timestamp.shift(-1) - df_rest.timestamp).values if ((x!= np.nan))]
    b_train = burst_train(int_ev_time,dt)[0]
    bursts = burst_train(int_ev_time,dt)[1]
    df['burst'] = np.array(bursts)
    return Counter(b_train),df

### Compute Trains of egonetwork activity time series for Randomized reference models

In [94]:
def compute_bursty_trains(df,line_graph,dt_list,graph_list,seed,title1,include_single_link_burst = False, reproduce_paper = False):
    assert graph_list
    if reproduce_paper:
        assert (graph_list == ['G']) or (graph_list == ['G1','G2','G3'])
        assert include_single_link_burst
    overall_dictionary = dict()
    df_dictionary = dict()
    for k in graph_list:
        overall_dictionary[k] = dict()
        if include_single_link_burst: overall_dictionary[k]['burst_cnt_link'] = dict()
        overall_dictionary[k]['burst_cnt_ego'] = dict()
        overall_dictionary[k]['n_link_cnt'] = dict()
        if k == 'G': df_dictionary[k] = df.copy()
        elif k == 'G1': df_dictionary[k] = permute_timestamps(df,seed) ### timestamp reshuffle
        elif k == 'G2': df_dictionary[k] = shuffle_df(df,seed) ### timeseries reshuffle
        elif k == 'G3': df_dictionary[k] = random_df_same_weight(df,seed)  ### timeseries reshuffle with same n contacts
    
    for dt in dt_list:
        dt_dictionary = dict()
        for k in graph_list:
            dt_dictionary[k] = dict()
            if include_single_link_burst: dt_dictionary[k]['burst_cnt_link'] = Counter()
            dt_dictionary[k]['burst_cnt_ego'] = Counter()
            dt_dictionary[k]['n_link_cnt'] = Counter()
            
        

        for nodes in tqdm(df.nodes.unique()):

                neigh_set = (set(list(nx.neighbors(line_graph,nodes)))|set([nodes]))
                for k in graph_list:
                    df_graph = df_dictionary[k]
                    
                    
                    if include_single_link_burst: 
                        df_link = df_graph[df_graph.nodes == nodes]
                        brst_cnt_link,_ = get_burst_train(df_link,dt) 
                        dt_dictionary[k]['burst_cnt_link'].update(brst_cnt_link)
                        
                    df_ego = df_graph[df_graph.nodes.isin(neigh_set)]                    
                    brst_cnt_ego,df_ego = get_burst_train(df_ego,dt) 
                    dt_dictionary[k]['burst_cnt_ego'].update(brst_cnt_ego)
                    
                    cnt_n_links_per_train = Counter(df_ego.groupby('burst')['nodes'].apply(lambda x:(float(len(set(x))),float(len(list(x))))).values)
                    dt_dictionary[k]['n_link_cnt'].update(cnt_n_links_per_train)

        for k in graph_list:
            if include_single_link_burst: overall_dictionary[k]['burst_cnt_link'][dt] = dt_dictionary[k]['burst_cnt_link']
            overall_dictionary[k]['burst_cnt_ego'][dt] = dt_dictionary[k]['burst_cnt_ego']
            overall_dictionary[k]['n_link_cnt'][dt] = dt_dictionary[k]['n_link_cnt']
            
        if reproduce_paper:
            if graph_list == ['G']:
                file_2_save = [overall_dictionary[k]['burst_cnt_link'][dt],overall_dictionary[k]['burst_cnt_ego'][dt],Counter(),Counter(),Counter(),Counter()]
                file_2_save_link = [overall_dictionary[k]['n_link_cnt'][dt],Counter(),Counter(),Counter()]
                
                
                try: os.mkdir('../Results/Bursty_trains/'+title1+'/')
                except: print '../Results/Bursty_trains/'+title1+'/'

                title_burst_cnt = '../Results/Bursty_trains/'+title1+'/'+title1+'_'+str(dt)
                assert file_2_save == joblib.load(a+'.joblib')
    #         joblib.dump(file_2_save,a+'.joblib')

                title_link_cnt = '../Results/Bursty_trains/'+title1+'/'+title1+'_n_links_'+str(dt)
                assert file_2_save_link == joblib.load(b+'.joblib')
    #         joblib.dump(file_2_save_link,b+'.joblib')
            elif graph_list == ['G1','G2','G3']:
                #### the order of the list is weird, but this is the same of code used for computing results in the paper, it's present only in the option reproduce paper
                ### order: timeseries reshuffle, single, timestamp reshuffle, timeseries reshuffle with same n contacts
                file_2_save = [overall_dictionary['G2']['burst_cnt_ego'][dt],overall_dictionary['G1']['burst_cnt_link'][dt],overall_dictionary['G1']['burst_cnt_ego'][dt],overall_dictionary['G3']['burst_cnt_ego'][dt]]
                ### oreder: timeseries reshuffle, timestamp reshuffle, timeseries reshuffle with same n contacts
                file_2_save_link = [overall_dictionary['G2']['burst_cnt_ego'][dt],overall_dictionary['G1']['burst_cnt_ego'][dt],overall_dictionary['G3']['burst_cnt_ego'][dt]]
            
                try: os.mkdir('../Results/Bursty_trains/'+title1+'rand/')
                except: print '../Results/Bursty_trains/'+title1+'rand/'

                title_burst_cnt = '../Results/Bursty_trains/'+title1+'rand/'+title1+'_'+str(dt)+'_'+str(seed)
                assert file_2_save == joblib.load(title_burst_cnt+'.joblib')
    #             joblib.dump(file_2_save,title_burst_cnt+'.joblib')

                title_link_cnt = '../Results/Bursty_trains/'+title1+'rand/'+title1+'_n_links_'+str(dt)+'_'+str(seed)
                assert file_2_save_link == joblib.load(title_link_cnt+'.joblib')
    #             joblib.dump(file_2_save_link,title_link_cnt+'.joblib')

    return overall_dictionary
            
        
       
        

In [61]:
def compute_bursty_trains_rand(df,line_graph,dt_list,seed,title1):
    
    new_burst1 = {}
    neigh_new_burst1 = {}

    rand_orig_neigh_burst1 = {}
    rand_new_neigh_burst1 = {}



    n_link_rand_orig_neigh_burst1 = {}
    n_link_neigh_new_burst1 = {}
    n_link_rand_new_neigh_burst1 = {}

    df_new = permute_timestamps(df,seed)    ### timestamp reshuffle
    df1 = shuffle_df(df,seed)               ### timeseries reshuffle
    df1_n = random_df_same_weight(df,seed)  ### timeseries reshuffle with same n contacts
    for dt in dt_list:

        cnt1_s = Counter()

        cnt1_n = Counter()
        cnt_rn = Counter()
        cnt1_rn = Counter()


        cnt_rn_links = Counter()
        cnt1_n_links = Counter()
        cnt1_rn_links = Counter()

        for nodes in tqdm(df_new.nodes.unique()):

                neigh_set = (set(list(nx.neighbors(line_graph,nodes)))|set([nodes]))

                df_rest_new = df_new[df_new.nodes == nodes]

                df_rest_new_neigh = df_new[df_new.nodes.isin(neigh_set)] ### timestamp reshuffle
                df_rest_neigh_r = df1[df1.nodes.isin(neigh_set)]   ### timeseries reshuffle
                df_rest_new_neigh_r = df1_n[df1_n.nodes.isin(neigh_set)] ### timeseries reshuffle with same n contacts



                brst_cnt_new_neigh,df_rest_new_neigh = get_burst_train(df_rest_new_neigh,dt) ### timestamp reshuffle
                brst_cnt_neigh_r,df_rest_neigh_r = get_burst_train(df_rest_neigh_r,dt) ### timeseries reshuffle
                brst_cnt_new_neigh_r,df_rest_new_neigh_r = get_burst_train(df_rest_new_neigh_r,dt) ### timeseries reshuffle with same n contacts





                cnt1_s.update(get_burst_train(df_rest_new,dt=dt)[0])
                cnt1_n.update(brst_cnt_new_neigh)    ### timestamp reshuffle
                cnt_rn.update(brst_cnt_neigh_r)       ### timeseries reshuffle
                cnt1_rn.update(brst_cnt_new_neigh_r)   ### timeseries reshuffle with same n contacts






                cnt_rn_links.update((Counter(df_rest_neigh_r.groupby('burst')['nodes'].apply(lambda x:(float(len(set(x))),float(len(list(x))))).values)))
                cnt1_n_links.update((Counter(df_rest_new_neigh.groupby('burst')['nodes'].apply(lambda x:(float(len(set(x))),float(len(list(x))))).values)))
                cnt1_rn_links.update((Counter(df_rest_new_neigh_r.groupby('burst')['nodes'].apply(lambda x:(float(len(set(x))),float(len(list(x))))).values)))




        new_burst1[title1+str(dt)] = cnt1_s
        neigh_new_burst1[title1+str(dt)] = cnt1_n      ### timestamp reshuffle
        rand_orig_neigh_burst1[title1+str(dt)] = cnt_rn  ### timeseries reshuffle
        rand_new_neigh_burst1[title1+str(dt)] = cnt1_rn   ### timeseries reshuffle with same n contacts



        n_link_rand_orig_neigh_burst1[title1+str(dt)] = cnt_rn_links            ### timeseries reshuffle
        n_link_neigh_new_burst1[title1+str(dt)] = cnt1_n_links                  ### timestamp reshuffle
        n_link_rand_new_neigh_burst1[title1+str(dt)] = cnt1_rn_links            ### timeseries reshuffle with same n contacts

        ### oreder: timeseries reshuffle, single, timestamp reshuffle, timeseries reshuffle with same n contacts
        file_2_save = map(lambda x: x[title1+str(dt)],[rand_orig_neigh_burst1,new_burst1,neigh_new_burst1,rand_new_neigh_burst1])
        ### oreder: timeseries reshuffle, timestamp reshuffle, timeseries reshuffle with same n contacts
        file_2_save_link = map(lambda x: x[title1+str(dt)],[n_link_rand_orig_neigh_burst1,n_link_neigh_new_burst1,n_link_rand_new_neigh_burst1])

        try: os.mkdir('../Results/Bursty_trains/'+title1+'rand/')
        except: print '../Results/Bursty_trains/'+title1+'rand/'
       
        a = '../Results/Bursty_trains/'+title1+'rand/'+title1+'_'+str(dt)+'_'+str(seed)
        assert file_2_save == joblib.load(a+'.joblib')
        #joblib.dump(file_2_save,a+'.joblib')

        b = '../Results/Bursty_trains/'+title1+'rand/'+title1+'_n_links_'+str(dt)+'_'+str(seed)
        assert file_2_save_link == joblib.load(b+'.joblib')
        return file_2_save, file_2_save_link
        #joblib.dump(file_2_save_link,b+'.joblib')

### Compute Trains of egonetwork activity timer series for Data

In [8]:
def compute_bursty_trains_data(df,line_graph,dt_list,title1, save = False):
    burst_cnt_G_link_tot = {}
    brst_cnt_G_ego_tot = {}

    n_links_cnt_G_ego_tot = {}
    
    #    if title1 !='tij_lnVS':continue
    for dt in dt_list:
        burst_cnt_G_link_dt = Counter()
        brst_cnt_G_ego_dt = Counter()

        n_links_cnt_G_ego_dt = Counter()
        
        k = 0
        for nodes in tqdm(df.nodes.unique()):

            neigh_set = (set(list(nx.neighbors(g,nodes)))|set([nodes]))
            df_G_link = df[df.nodes == nodes]
            df_G_ego = df[df.nodes.isin(neigh_set)]                    ### data



            brst_cnt_G_ego,df_G_ego = get_burst_train(df_G_ego,dt)
            
            burst_cnt_G_link_dt.update(get_burst_train(df_G_link,dt)[0])
            brst_cnt_G_ego_dt.update(brst_cnt_G_ego)               ### data
            n_links_cnt_G_ego_dt.update((Counter(df_G_ego.groupby('burst')['nodes'].apply(lambda x:(float(len(set(x))),float(len(list(x))))).values)))

            k += 1

        burst_cnt_G_link_tot[title1+str(dt)] = burst_cnt_G_link_dt
        brst_cnt_G_ego_tot[title1+str(dt)] = brst_cnt_G_ego_dt         ### data
        n_links_cnt_G_ego_tot[title1+str(dt)] = n_links_cnt_G_ego_dt
        

        file_2_save = map(lambda x: x[title1+str(dt)],[burst_cnt_G_link_tot,brst_cnt_G_ego_tot])
        file_2_save_link = map(lambda x: x[title1+str(dt)],[n_links_cnt_G_ego_tot])


        try: os.mkdir('../Results/Bursty_trains/'+title1+'/')
        except: print '../Results/Bursty_trains/'+title1+'/'
       
        title_burst_cnt = '../Results/Bursty_trains/'+title1+'/'+title1+'_'+str(dt)
        assert file_2_save == joblib.load(a+'.joblib')
#         joblib.dump(file_2_save,a+'.joblib')

        title_link_cnt = '../Results/Bursty_trains/'+title1+'/'+title1+'_n_links_'+str(dt)
        assert file_2_save_link == joblib.load(b+'.joblib')
#         joblib.dump(file_2_save_link,b+'.joblib')

    return burst_cnt_G_link_tot,brst_cnt_G_ego_tot,n_links_cnt_G_ego_tot




### Compute Trains for time series of activity of top 10% most active links for Data

In [5]:

def compute_bursty_trains_most_active(df,line_graph,dt_list,title1,most_active = 0.1): 
    for dt in dt_list:

        cnt_tot = {}
        link_cnt_tot = {}
        most_active_links = df.groupby(df.nodes).size().sort_values(ascending = False)[:int(float(len(g.nodes))*most_active)].index
        link_weights = df.groupby(df.nodes).size()
        ### include all links with weight w>=w_min
        w_min = link_weights.sort_values(ascending = False)[:int(float(len(g.nodes))*0.1)].iloc[-1]
        most_active_links = node_rank[node_rank>=minimum_value].index


        cnt_n = Counter()
        cnt1_n_links = Counter()



        k = 0
        for nodes in tqdm(most_active_links):


            df_rest_most_active_links = df[df.nodes == nodes]   


            brst_cnt_most_active_links,df_rest_most_active_links = get_burst_train(df_rest_most_active_links,dt)






            cnt_n.update(brst_cnt_neigh)               ### data



        cnt_tot[title1] = cnt_n
        try:os.mkdir('../Results/Bursty_trains_most_active/')
        except: print 'folder already existing'
        #assert cnt_tot == joblib.load('../Results/Bursty_trains_most_active/'+title1+'_'+str(dt)+'.joblib')       
        joblib.dump(cnt_tot,'../Results/Bursty_trains_most_active/'+title1+'_'+str(dt)+'.joblib')       


### Example

In [62]:
seed = 1
dt_list = [60]
print seed


title1 = 'tij_lnVS'
df = get_df(title1)
g = get_linegraph(title1)
    

1


In [None]:
cc = compute_bursty_trains(df,g,dt_list,['G1','G2','G3'],seed,title1,include_single_link_burst = True, reproduce_paper = False)





  0%|          | 0/755 [00:00<?, ?it/s]

True


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  4%|▎         | 28/755 [00:13<05:11,  2.34it/s]

In [36]:
overall_dixtionary = _

In [82]:
overall_dictionary = cc

In [83]:
file_2_save = [overall_dictionary['G2']['burst_cnt_ego'][dt],overall_dictionary['G1']['burst_cnt_link'][dt],overall_dictionary['G1']['burst_cnt_ego'][dt],overall_dictionary['G3']['burst_cnt_ego'][dt]]

In [54]:
len(file_2_save)

4

In [58]:
file_2_save[3]

Counter({1: 67754,
         2: 23578,
         3: 10548,
         4: 6001,
         5: 4288,
         6: 2683,
         7: 1959,
         8: 1352,
         9: 843,
         10: 812,
         11: 1106,
         12: 743,
         13: 445,
         14: 398,
         15: 290,
         16: 470,
         17: 271,
         18: 358,
         19: 190,
         20: 231,
         21: 82,
         22: 142,
         23: 196,
         24: 76,
         25: 195,
         26: 189,
         27: 34,
         28: 71,
         29: 108,
         30: 115,
         31: 63,
         32: 2,
         33: 127,
         34: 7,
         35: 34,
         36: 51,
         37: 10,
         38: 6,
         39: 10,
         40: 6,
         41: 7,
         42: 26,
         43: 6,
         44: 55,
         45: 7,
         46: 25,
         47: 5,
         48: 5,
         49: 3,
         50: 34,
         51: 16,
         52: 8,
         53: 6,
         55: 1,
         56: 2,
         57: 13,
         58: 8,
         59: 2,


In [44]:
a = joblib.load('../Results/Bursty_trains/tij_lnVSrand/tij_lnVS_60_0.joblib')

In [75]:
bb[0] == aa

False

In [74]:
a

[Counter({1: 69187,
          2: 22705,
          3: 10673,
          4: 6082,
          5: 4485,
          6: 2728,
          7: 2171,
          8: 1488,
          9: 1026,
          10: 856,
          11: 1160,
          12: 746,
          13: 489,
          14: 480,
          15: 349,
          16: 418,
          17: 245,
          18: 409,
          19: 177,
          20: 279,
          21: 126,
          22: 152,
          23: 183,
          24: 57,
          25: 112,
          26: 193,
          27: 58,
          28: 71,
          29: 61,
          30: 128,
          31: 55,
          32: 36,
          33: 124,
          34: 54,
          35: 63,
          36: 23,
          37: 7,
          38: 3,
          39: 6,
          40: 4,
          41: 2,
          42: 60,
          43: 2,
          44: 60,
          45: 8,
          46: 3,
          47: 4,
          48: 1,
          49: 13,
          50: 29,
          51: 6,
          52: 16,
          53: 3,
          54: 23,
         

In [63]:
aa = compute_bursty_trains_rand(df,g,dt_list,seed,title1)



  0%|          | 0/755 [00:00<?, ?it/s]

True


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████| 755/755 [04:20<00:00,  2.90it/s]

../Results/Bursty_trains/tij_lnVSrand/





In [80]:
cc = compute_bursty_trains(df,g,[60],['G1','G2','G3'],1,title1,include_single_link_burst = True, reproduce_paper = False)






  0%|          | 0/755 [00:00<?, ?it/s]

True


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████| 755/755 [05:01<00:00,  2.50it/s]


In [81]:
cc

{'G1': {'burst_cnt_ego': {60: Counter({1: 243320,
            2: 40972,
            3: 9754,
            4: 3116,
            5: 1170,
            6: 422,
            7: 182,
            8: 116,
            9: 65,
            10: 24,
            11: 24,
            12: 14,
            13: 8,
            14: 4,
            15: 4,
            16: 3,
            17: 2})},
  'burst_cnt_link': {60: Counter({1: 9039, 2: 296, 3: 45, 4: 12, 5: 1, 8: 1})},
  'n_link_cnt': {60: Counter({(1.0, 1.0): 243320,
            (1.0, 2.0): 6464,
            (1.0, 3.0): 746,
            (1.0, 4.0): 200,
            (1.0, 5.0): 30,
            (1.0, 8.0): 6,
            (2.0, 2.0): 34508,
            (2.0, 3.0): 3020,
            (2.0, 4.0): 559,
            (2.0, 5.0): 169,
            (2.0, 6.0): 21,
            (2.0, 7.0): 1,
            (2.0, 9.0): 7,
            (3.0, 3.0): 5988,
            (3.0, 4.0): 1141,
            (3.0, 5.0): 338,
            (3.0, 6.0): 125,
            (3.0, 7.0): 27,
        

In [90]:
bb = joblib.load('../Results/Bursty_trains/tij_lnVSrand/tij_lnVS_60_1.joblib')

In [72]:
aa[3]

IndexError: tuple index out of range

In [91]:
aa[0] == bb

True

In [92]:
aa[0] == file_2_save

True

In [88]:
aa[3]

IndexError: tuple index out of range