In [4]:
# %load ../scripts1/get_graphs_decay.py
import pickle
import itertools
import math
import random
import pandas as pd
import numpy as np
import networkx as nx

from datetime import date

def get_edge_slice(data,f_train_e=0.7,seed=30):
    """Split the rows of ``data`` into two frames by edge identity.

    A fraction ``f_train_e`` of the unique values in ``data.Edge`` is drawn at
    random; rows whose edge was drawn go to the first frame and all remaining
    rows go to the second, so no edge appears in both.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Edge`` and ``Time``.
    f_train_e : float
        Fraction of unique edges assigned to the first frame (truncated to an
        integer count).
    seed : int
        Seed passed to ``random.seed``. Note that this reseeds the *global*
        ``random`` generator as a side effect.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The selected-edge rows and the remaining rows, both with the ``Edge``
        and ``Time`` columns dropped.
    """
    df = data
    edges = list(df.Edge.unique())
    random.seed(seed)
    edge_train = random.sample(edges,int(f_train_e*len(edges)))
    # Set membership keeps the complement computation linear in the number of
    # edges instead of scanning the sampled list once per edge.
    train_lookup = set(edge_train)
    edge_test = [e for e in edges if e not in train_lookup]
    dropped = ['Edge','Time']
    df_se = df.loc[df['Edge'].isin(edge_train)].drop(columns=dropped)
    df_de = df.loc[df['Edge'].isin(edge_test)].drop(columns=dropped)
    return(df_se,df_de)

def get_time_slice(data,time_start,time_end,droptime=True):
    """Return the rows of ``data`` whose ``Year`` lies in ``[time_start, time_end]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Year`` column comparable with the two bounds.
    time_start, time_end
        Inclusive lower and upper bounds.
    droptime : bool
        When true (default) the ``Year`` column is removed from the result.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.
    """
    # Build a single mask aligned with `data`. Chaining two boolean selections
    # applies a mask built on the full frame to an already-filtered frame,
    # which pandas has to realign and which breaks on a non-unique index.
    in_window = (data.Year >= time_start) & (data.Year <= time_end)
    selected = data[in_window]
    if droptime:
        return selected.drop(columns=['Year'])
    return selected

def df_to_XY(df):
    """Split a feature table into a feature matrix and a label array.

    The bookkeeping columns ``Year`` and ``Edge`` are removed when present
    (each independently of the other); every remaining column except
    ``Label`` becomes a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``Label`` column.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with one column per feature, and ``y`` as a two-dimensional
        array with a single ``Label`` column.
    """
    # errors='ignore' lets either column be absent. Previously `Edge` was only
    # dropped when `Year` was also present, so it could end up inside X.
    df = df.drop(columns=['Year', 'Edge'], errors='ignore')
    is_label = df.columns == 'Label'
    X = df.loc[:, ~is_label].to_numpy()
    y = df.loc[:, is_label].to_numpy()
    return(X,y)


def local_path(G,nodelist,epsilon = 0.01):
    """Return the local-path index matrix ``A^2 + epsilon * A^3`` for ``G``.

    ``A`` is the unweighted adjacency matrix of ``G`` with rows and columns
    ordered as in ``nodelist``; the powers are matrix powers, so entry
    ``[i, j]`` combines the number of length-2 and length-3 walks between
    ``nodelist[i]`` and ``nodelist[j]``.

    Parameters
    ----------
    G : networkx graph
    nodelist : list
        Node ordering used for the rows/columns of the result.
    epsilon : float
        Weight applied to the length-3 term.

    Returns
    -------
    Dense 2-D matrix/array (whatever dense type ``.todense()`` yields in the
    installed NetworkX/SciPy), indexable as ``result[i, j]``.
    """
    A = nx.adjacency_matrix(G,nodelist=nodelist,weight=None).todense()
    # Use `@` rather than `**`: on a plain ndarray `**` is element-wise, which
    # would silently turn A**2 into A for a 0/1 matrix. `@` is a matrix
    # product for both np.matrix and ndarray.
    A2 = A @ A
    return(A2 + epsilon*(A2 @ A))

def my_devide(a,b):
    """Divide ``a`` by ``b``, returning 0 when either operand equals zero."""
    if a == 0 or b == 0:
        return 0
    return a/b

def get_vector(G,edges):
    """Compute neighbourhood-based similarity features for node pairs in ``G``.

    For every pair ``(u, v)`` in ``edges`` an 11-element list is produced, in
    this order: number of common neighbours; that count divided by
    ``sqrt(k_u*k_v)``, by the size of the neighbourhood union, by
    ``(k_u+k_v)/2``, by ``min(k_u,k_v)``, by ``max(k_u,k_v)`` and by
    ``k_u*k_v``; the degree product ``k_u*k_v``; the sum of ``1/log(k_z)`` and
    the sum of ``1/k_z`` over common neighbours ``z``; and the local-path
    entry for the pair. Ratios use ``my_devide`` and are 0 when either
    numerator or denominator is 0.

    Parameters
    ----------
    G : networkx graph
    edges : iterable of 2-tuples
        Node pairs to describe; both nodes must be in ``G``.

    Returns
    -------
    (list, list)
        ``X`` holds one feature list per pair. ``y`` is always an empty list;
        it is returned only to keep the two-value interface.
    """
    X,y = [],[]
    Ki = G.degree()
    nodelist = list(G.nodes())
    # Position lookup replaces nodelist.index(...), which is a linear scan
    # repeated twice per edge.
    node_pos = {node: pos for pos, node in enumerate(nodelist)}
    LPI = local_path(G,nodelist)
    NBs = {u: set(G[u]) for u in nodelist}
    for u,v in edges:
        ku, kv = Ki[u], Ki[v]
        union_size = len(NBs[u]|NBs[v])
        uv_intersection = list(nx.common_neighbors(G,u,v))
        n_common = len(uv_intersection)
        x = [
            n_common,
            my_devide(n_common,np.sqrt(ku*kv)),
            my_devide(n_common,union_size),
            my_devide(2*n_common,(ku+kv)),
            my_devide(n_common,min(ku,kv)),
            my_devide(n_common,max(ku,kv)),
            my_devide(n_common,(ku*kv)),
            ku*kv,
        ]
        if n_common == 0:
            x.extend([0,0])
        else:
            # A common neighbour of two distinct nodes has degree >= 2, so
            # log(k_z) is strictly positive here.
            x.append(sum([1/math.log(Ki[z]) for z in uv_intersection]))
            x.append(sum(1/Ki[z] for z in uv_intersection))
        x.append(LPI[node_pos[u],node_pos[v]])
        X.append(x)
    return(X,y)


def new_get_test(G,N=10000):
    """Sample candidate non-edges of ``G`` and compute their feature vectors.

    ``N`` node pairs are drawn with replacement from all 2-combinations of
    ``G``'s nodes; duplicates and pairs already listed by ``G.edges()`` are
    discarded, so fewer than ``N`` pairs may be returned.

    Returns
    -------
    (list, list, list)
        The surviving pairs, their feature vectors from ``get_vector``, and
        the (empty) label list that ``get_vector`` returns.
    """
    all_pairs = list(itertools.combinations(list(G.nodes()),2))
    drawn = set(random.choices(all_pairs,k=N))
    edges = list(drawn.difference(set(G.edges())))
    features, labels = get_vector(G,edges)
    return(edges,features,labels)

def new_get_test_null(G,N=10000):
    """Sample candidate non-edges of ``G`` without computing any features.

    ``N`` node pairs are drawn with replacement from all 2-combinations of
    ``G``'s nodes; duplicates and pairs already listed by ``G.edges()`` are
    discarded, so fewer than ``N`` pairs may be returned.

    Returns
    -------
    list of 2-tuples
    """
    all_pairs = list(itertools.combinations(list(G.nodes()),2))
    drawn = set(random.choices(all_pairs,k=N))
    edges = list(drawn.difference(set(G.edges())))
    return(edges)

# Load the removal table, turn +/-inf into NaN, and drop rows that are
# entirely NaN.
df_removal = pd.read_pickle('../results/us_air_data_binary_removal.pkl')
df_removal = df_removal.replace([np.inf, -np.inf], np.nan)
df_removal = df_removal.dropna(how='all')
# Keep only rows dated 2004-01-01 or later.
df_removal = df_removal.loc[df_removal.year >= date(2004,1,1)]

# Map raw column names to display names: strip the first '_index', turn up to
# three underscores into spaces, then title-case.
rename_dict = {}
for column in df_removal.columns:
    # str.replace is a no-op when '_index' is absent, so no branch is needed.
    column_v = column.replace('_index','',1)
    rename_dict[column] = column_v.replace('_',' ',3).title()

# Abbreviations for the feature columns (not applied below; see the
# commented-out rename at the end of this block).
rename_again = {
    'Common Neighbor': 'CN',
    'Salton': 'SA',
    'Jaccard': 'JA',
    'Sorensen': 'SO',
    'Hub Promoted': 'HPI',
    'Hub Depressed': 'HDI',
    'Leicht Holme Newman': 'LHNI',
    'Preferential Attachment': 'PA',
    'Adamic Adar': 'AA',
    'Resource Allocation': 'RA',
    'Local Path': 'LP',
}

df_removal = df_removal.rename(columns=rename_dict)
# Correct a misspelled column name carried over from the stored table.
df_removal = df_removal.rename(columns={'Prederential Attachment':'Preferential Attachment'})

# Feature columns (same names, same order as the keys of rename_again)
# followed by the bookkeeping columns.
columns = list(rename_again) + ['Year','Edge','Time','Label']

df_removal = df_removal[columns]#.rename(columns=rename_again)

# Load the edge-addition table and keep rows dated 2004-01-01 or later.
df_add = pd.read_pickle('../results/us_air_data_binary_adding.pkl')
df_add = df_add.loc[df_add.Year >= date(2004,1,1)]

# Build one graph per (year, month) slice of the air-traffic edge list.
df_air = pd.read_pickle('../data/networks/US_air_1990_2018.pkl')
# The node set is collected before self-loops are filtered out, so every
# source/target that appears anywhere in the table is included.
nodes = set(df_air.source.unique()).union(df_air.target.unique())
df_air = df_air.loc[df_air.source != df_air.target]
year = list(df_air.index.get_level_values(0).unique())
month = list(df_air.index.get_level_values(1).unique())
graphs_air, date_air = [], []
# Every (year, month) combination is looked up, so each one must exist in the
# index. The loop variables y, m, df and G stay defined at module level after
# the loop finishes.
for y, m in itertools.product(year, month):
    df = df_air.loc[y,m]
    date_air.append(date(y,m,1))
    G = nx.from_pandas_edgelist(df, edge_attr=True)
    # Add the full node set so all snapshots share the same nodes.
    G.add_nodes_from(nodes)
    graphs_air.append(G)


# Discard every snapshot before January 2004.
n = date_air.index(date(2004,1,1))
graphs_air, date_air = graphs_air[n:], date_air[n:]

del df_air

# Load the two fitted classifiers. `with` guarantees the file handles are
# closed once unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code; only load model files
# that come from a trusted source.
with open("../results/binary_removal_model.pickle.dat", "rb") as model_file:
    removal_model = pickle.load(model_file)
with open("../results/binary_adding_model.pickle.dat", "rb") as model_file:
    add_model = pickle.load(model_file)

# Per-`Year` sum of the `Label` column in each table (one-column frames
# indexed by `Year`).
number_add = df_add.groupby(by="Year")[['Label']].sum()
number_remove = df_removal.groupby(by="Year")[['Label']].sum()

def get_graphs(remove_rate=1.02,add_rate=1.0):
    """Evolve the first snapshot step by step with the models and with a null model.

    Both simulations start from a copy of ``graphs_air[0]`` and run for
    ``len(graphs_air) - 1`` steps. At every step a fixed number of edges is
    removed and a fixed number is added; a copy of each graph is stored after
    the step.

    * Model-driven graph: existing edges are scored by ``removal_model`` and
      sampled non-edges by ``add_model``; edges are picked by score.
    * Null graph: edges to remove are drawn at random from the existing edges
      and edges to add are drawn at random from sampled non-edges.

    Parameters
    ----------
    remove_rate, add_rate : float
        Multipliers applied to the mean per-``Year`` label sum in
        ``number_add``. Both budgets are scaled from ``number_add``, so the
        two rates are directly comparable.

    Returns
    -------
    (list, list)
        Model-driven snapshots and null-model snapshots, one per step.
    """
    mean_label_sum = number_add.mean().Label
    N_remove = int(mean_label_sum*remove_rate)
    N_add = int(mean_label_sum*add_rate)
    graphs_new1 = []
    graphs_null1 = []
    G = graphs_air[0].copy()
    G_null = graphs_air[0].copy()
    n_steps = len(graphs_air)-1
    for n in range(n_steps):
        # ---- null model -------------------------------------------------
        # Removal candidates must be edges that exist. Drawing them from the
        # non-edge sample makes remove_edges_from a silent no-op, so the null
        # graph would only ever grow.
        existing_null = list(G_null.edges())
        candidates_null = new_get_test_null(G_null)
        remove_edges_null = random.sample(existing_null,min(N_remove,len(existing_null)))
        add_edges_null = random.choices(candidates_null,k=N_add)
        G_null.remove_edges_from(remove_edges_null)
        G_null.add_edges_from(add_edges_null)
        graphs_null1.append(G_null.copy())

        # ---- model-driven evolution --------------------------------------
        edges = list(G.edges())
        X,_ = get_vector(G,edges)
        pred_prob = removal_model.predict_proba(X).T[1]
        # NOTE(review): this keeps the N_remove edges with the LOWEST class-1
        # probability. That is right only if class 1 of removal_model means
        # "edge persists" -- confirm against how the training labels were built.
        removal = sorted(zip(edges,pred_prob), key=lambda pair: pair[1])[:N_remove]
        remove_edges = [edge for edge,_ in removal]
        candidates,X_test,_ = new_get_test(G)
        y_pred = add_model.predict_proba(X_test).T[1]
        # Highest predicted class-1 probability first.
        add_edges = [edge for _,edge in sorted(zip(y_pred,candidates),reverse=True)][:N_add]
        G.remove_edges_from(remove_edges)
        G.add_edges_from(add_edges)
        graphs_new1.append(G.copy())
        # Progress message; it uses only values local to this function.
        print(f'{n} of {n_steps-1}')
    return(graphs_new1,graphs_null1)

In [None]:
# Run the simulation with 3% more removals than additions per step.
graphs_new1, graphs_null1 = get_graphs(remove_rate=1.03, add_rate=1.0)

0 of 178 in loop 12
1 of 178 in loop 12
2 of 178 in loop 12
3 of 178 in loop 12
4 of 178 in loop 12
5 of 178 in loop 12
6 of 178 in loop 12
7 of 178 in loop 12
8 of 178 in loop 12
9 of 178 in loop 12
10 of 178 in loop 12
11 of 178 in loop 12
12 of 178 in loop 12
13 of 178 in loop 12
14 of 178 in loop 12
15 of 178 in loop 12
16 of 178 in loop 12
17 of 178 in loop 12
18 of 178 in loop 12
19 of 178 in loop 12
20 of 178 in loop 12
21 of 178 in loop 12
22 of 178 in loop 12
23 of 178 in loop 12
24 of 178 in loop 12
25 of 178 in loop 12
26 of 178 in loop 12
27 of 178 in loop 12
28 of 178 in loop 12
29 of 178 in loop 12
30 of 178 in loop 12
31 of 178 in loop 12
32 of 178 in loop 12
33 of 178 in loop 12
34 of 178 in loop 12
35 of 178 in loop 12
36 of 178 in loop 12
37 of 178 in loop 12
38 of 178 in loop 12
39 of 178 in loop 12
40 of 178 in loop 12
41 of 178 in loop 12
42 of 178 in loop 12
43 of 178 in loop 12
44 of 178 in loop 12
45 of 178 in loop 12
46 of 178 in loop 12
47 of 178 in loop 12
48