In [2]:
import pandas as pd
import numpy as np
from random import sample
from cmath import exp, sqrt
import csv

In [3]:
# Load the temporal edge list: one interaction per row, space separated, no header row.
filepath = "./datasets/Emails.csv"
headers = ["ID of from node","ID of to node", "weight", "timestamp"]

# Exact duplicate rows carry no extra information, so they are dropped right away.
Graph = pd.read_csv(filepath, names=headers, sep=' ').drop_duplicates()

# V: sorted array of every node id that occurs as a sender or a receiver.
# np.unique flattens the two-column array itself, so there is no need to glue the
# columns together with the private (and therefore unstable) Series._append API.
V = np.unique(Graph[['ID of from node', 'ID of to node']].values).astype(int)
V_num = V.size

In [None]:
# generate adjacency list for new dataset - old method
# def getValues(i):
#     return [ x[0] for x in Graph.loc[(Graph['ID of from node']==i), ['ID of to node']].drop_duplicates().values] + [ x[0] for x in Graph.loc[(Graph['ID of to node']==i), ['ID of from node']].drop_duplicates().values]

# matrix = {i: set(getValues(i)) for i in V}

In [None]:
# generate adjacency list for new dataset - new method
# node id -> set of neighbouring node ids (undirected adjacency list)
matrix = {}

def add_adjacent(node, adj):
    """Record `adj` as a neighbour of `node` in the global `matrix`.

    The neighbour set of `node` is created on first use; adding the same
    neighbour twice has no further effect because neighbours live in a set.
    """
    matrix.setdefault(node, set()).add(adj)

# Fill `matrix` from the edge list, treating every interaction as an undirected edge.
# The two id columns are iterated directly: DataFrame.iterrows() builds a Series per
# row (slow) and may upcast the ids to float when the columns have mixed dtypes.
for u, v in zip(Graph['ID of from node'], Graph['ID of to node']):
    u, v = int(u), int(v)

    if u == v:
        # Self-loops are left out of the neighbour sets because the density formula
        # 2m / (n(n-1)) from the paper assumes a simple graph. The node still belongs
        # to V though, so make sure it owns a (possibly empty) entry; otherwise every
        # later `matrix[i]` lookup over V raises KeyError for a self-loop-only node.
        matrix.setdefault(u, set())
        continue

    add_adjacent(u, v)
    add_adjacent(v, u)

In [None]:
# Debug helper: dump the neighbour set of every node, in the order of V.
for vertex in V:
    neighbours = matrix[vertex]
    print(neighbours)

In [5]:
# export/import generated adjacency list with whitespaces separators
def export_adj(filepath="./adj-lists/adjacency-list-emails.csv"):
    """Write the global adjacency list `matrix` to a space-separated file.

    One row is written per node of `V`: the node id followed by the ids of its
    neighbours.

    Args:
        filepath: destination file. Defaults to the path this notebook has
            always used, so existing `export_adj()` calls behave as before.
    """
    with open(filepath, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ')

        for node in V:
            # A node of V without an entry in `matrix` (for instance one that only
            # occurs in self-loops, which are skipped while building the list) is
            # written as a bare id instead of aborting the export with KeyError.
            writer.writerow([node, *matrix.get(node, ())])

def import_adj(filepath="./adj-lists/adjacency-list-emails.csv"):
    """Read a space-separated adjacency list and return it as a dict.

    Each non-empty row is `node neighbour neighbour ...`; a row holding only the
    node id yields an empty neighbour set.

    Args:
        filepath: file to read. Defaults to the path this notebook has always
            used, so existing `import_adj()` calls behave as before.

    Returns:
        dict mapping int node id -> set of int neighbour ids.

    Raises:
        ValueError: if a field cannot be parsed as an integer.
    """
    adjacency = {}

    # newline='' lets the csv module do its own line-ending handling.
    with open(filepath, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=' '):
            if not row:
                # Blank lines are parsed as empty rows; indexing row[0] would fail.
                continue

            node, *neighbours = row
            # Empty fields come from doubled or trailing separators; ignore them.
            adjacency[int(node)] = {int(field) for field in neighbours if field}

    return adjacency

In [6]:
# Load the adjacency list from the cached file; this rebinds `matrix`, replacing
# whatever an earlier cell may have built in memory.
# Uncomment the export call below to (re)write the cache from the current `matrix` first.
# export_adj()
matrix = import_adj()

In [None]:
# Part 1.
# Task 1: number of vertices, number of edges and density of the graph.
# Every undirected edge is stored in both endpoints' neighbour sets, so it is
# counted once only, from its smaller endpoint.
E_num = sum(1 for i in V for s in matrix[i] if s > i)
print(f'количество вершин: {V_num};\nколичество ребер: {E_num};\nплотность: {2*E_num/(V_num*(V_num-1))}')

In [None]:
# Task 2: connected components, found by expanding one frontier at a time.
# NOTE: despite its name, `visited` holds the vertices that have NOT yet been
# assigned to a component; it shrinks as components are discovered.
visited = set(V)
Component = set()   # largest component seen so far
answer = 0          # number of components

while visited:
    answer += 1
    v = visited.pop()
    comp = {v}
    candidates = set(matrix[v])
    while candidates:
        # absorb the current frontier, then step to its not-yet-seen neighbours
        comp |= candidates
        newCan = set().union(*(matrix[i] for i in candidates))
        candidates = newCan - comp
    visited -= comp
    if len(comp) > len(Component):
        Component = set(comp)

print(f'Количество компонент слабой связности: {answer};\nРазмер максимальной компоненты: {len(Component)};\nДоля вершин в максимальной компоненте: {len(Component)/V_num}' )

In [None]:
# Task 3
def Cl(u):
    """Local clustering coefficient of node `u` in the global `matrix`.

    Returns 0 for nodes with fewer than two neighbours. Otherwise returns the
    number of ordered neighbour pairs that are themselves adjacent, divided by
    k * (k - 1), where k is the number of neighbours of `u`.
    """
    neighbours = matrix[u]
    degree = len(neighbours)
    if degree < 2:
        return 0
    # each link between two neighbours is seen from both of its ends
    linked_pairs = sum(len(matrix[w] & neighbours) for w in neighbours)
    return linked_pairs / (degree * (degree - 1))

# Average local clustering coefficient over the largest connected component.
# The sum runs over the nodes of `Component`, so the mean has to be taken over
# len(Component) as well; dividing by V_num understates the value whenever the
# graph has more than one component.
CL = 0
for node in Component:
    CL+=Cl(node)
print(f'средний кластерный коэффициент сети: {CL/len(Component)}')

In [None]:
# Task 4
def R():
    """Degree assortativity coefficient of the graph stored in `matrix`.

    With k the degree of a node, the function accumulates over all nodes of V
    S1 = sum k, S2 = sum k^2, S3 = sum k^3 and Se = sum of k_u * k_v over every
    (node, neighbour) pair, and returns (Se*S1 - S2^2) / (S3*S1 - S2^2).
    """
    s1 = s2 = s3 = s_e = 0
    for node in V:
        k = len(matrix[node])
        s1, s2, s3 = s1 + k, s2 + k * k, s3 + k * k * k
        s_e += sum(k * len(matrix[nb]) for nb in matrix[node])
    s2_squared = s2 * s2
    return (s_e * s1 - s2_squared) / (s3 * s1 - s2_squared)

# Evaluate R() once and report the result (the label reads "assortativity coefficient").
r = R()
print(f'Коэффициент ассортативности: {r}')

In [None]:
# Part 2.
# Static characteristics of every existing edge, keyed by the node pair:
#   CN - number of common neighbours
#   AA - sum over common neighbours of 1 / log10(number of their neighbours)
#   JC - common neighbours divided by the size of the union of both neighbour sets
#   PA - product of the two neighbour counts
CN_static = {}
AA_static = {}
JC_static = {}
PA_static = {}
visited = set()
for adj in V:
    visited.add(adj)
    own = matrix[adj]
    for node in own:
        if node in visited:
            # this pair was already handled from its other endpoint
            continue
        other = matrix[node]
        shared = own & other
        pair = (adj, node)

        AA_ = 0
        for i in shared:
            AA_ += 1 / np.log10(len(matrix[i]))

        CN_static[pair] = len(shared)
        AA_static[pair] = AA_
        JC_static[pair] = len(shared) / len(own | other)
        PA_static[pair] = len(own) * len(other)


In [7]:
# Time bounds used by the weighting step: t_min is the earliest timestamp and
# t_max is placed at 75% of the way from the earliest to the latest timestamp.
t_min = Graph['timestamp'].min()
t_max = t_min + 0.75 * (Graph['timestamp'].max() - t_min)

In [8]:
# Part 2.
# Building feature vectors for predicting the appearance of edges in the graph
# create adjacency list with timestamps: node -> {neighbour -> set of timestamps}
# Every node needs its OWN inner dict. dict.fromkeys(keys, dict()) evaluates dict()
# once and binds that single object to all keys, so a write through one node would
# show up under every node.
matrix_t = {node: {} for node in matrix.keys()}

def add_time (parent, child, timestamp):
    """Add `timestamp` to the set stored at matrix_t[parent][child].

    The set is created the first time the (parent, child) pair is seen.
    `parent` must already be a key of the global `matrix_t`.
    """
    matrix_t[parent].setdefault(child, set()).add(timestamp)

# Record the timestamp of every interaction under both of its endpoints.
# The columns are iterated directly: DataFrame.iterrows() creates a Series per row
# (slow on large edge lists) and may upcast the integer ids/timestamps to float
# when the columns have mixed dtypes.
edge_columns = zip(Graph['ID of from node'], Graph['ID of to node'], Graph['timestamp'])
for u, v, timestamp in edge_columns:
    u, v, timestamp = int(u), int(v), int(timestamp)

    if u == v: # skip loops
        continue

    add_time(u, v, timestamp)
    add_time(v, u, timestamp)

In [None]:
# Sanity check: show the set of timestamps stored for the pair (165, 2).
# Raises KeyError if that pair is not present in matrix_t.
print(matrix_t[165][2])

In [None]:
# test output for temporal adjacency matrix
# Iterate over the actual node ids: they are dictionary keys, not positions, so
# range(len(matrix_t)) raises KeyError unless the ids happen to be exactly 0..n-1.
for node in sorted(matrix_t):
    print(matrix_t[node])

In [9]:
# Temporal features with past event aggregation (II-A)
# Step A: temporal weighting
# `l` is the constant offset of every weight: the weighting functions below all
# return l + (1 - l) * f(T) for a normalised time T.
l = 0.2 # same value as in paper

def weight_linear(times, t_start=None, t_end=None, lower=None):
    """Linear temporal weights: lower + (1 - lower) * T.

    T = (t - t_start) / (t_end - t_start) is the timestamp normalised to the
    observation window.

    Args:
        times: iterable of timestamps.
        t_start: start of the window; defaults to the global `t_min`.
        t_end: end of the window; defaults to the global `t_max`.
        lower: constant offset of the weight; defaults to the global `l`.

    Returns:
        set of weights (timestamps that map to the same weight collapse).
    """
    start = t_min if t_start is None else t_start
    end = t_max if t_end is None else t_end
    base = l if lower is None else lower
    span = end - start

    return {base + (1 - base) * ((t - start) / span) for t in times}

def weight_exp(times, t_start=None, t_end=None, lower=None):
    """Exponential temporal weights: lower + (1 - lower) * (e^(3T) - 1) / (e^3 - 1).

    T = (t - t_start) / (t_end - t_start) is the timestamp normalised to the
    observation window.

    The real-valued np.exp is used on purpose: the `exp` imported from `cmath`
    always returns a complex number, which turns every weight (and every
    statistic computed from it later) into a complex value.

    Args:
        times: iterable of timestamps.
        t_start: start of the window; defaults to the global `t_min`.
        t_end: end of the window; defaults to the global `t_max`.
        lower: constant offset of the weight; defaults to the global `l`.

    Returns:
        set of real-valued weights (equal weights collapse).
    """
    start = t_min if t_start is None else t_start
    end = t_max if t_end is None else t_end
    base = l if lower is None else lower
    span = end - start
    scale = np.exp(3) - 1  # normalises the curve so that T = 1 maps to 1

    weights = set()
    for t in times:
        T = (t - start) / span
        weights.add(base + (1 - base) * ((np.exp(3 * T) - 1) / scale))

    return weights

def weight_square(times, t_start=None, t_end=None, lower=None):
    """Square-root temporal weights: lower + (1 - lower) * sqrt(T).

    T = (t - t_start) / (t_end - t_start) is the timestamp normalised to the
    observation window.

    The real-valued np.sqrt is used on purpose: the `sqrt` imported from `cmath`
    always returns a complex number, which turns every weight (and every
    statistic computed from it later) into a complex value.

    Args:
        times: iterable of timestamps; timestamps earlier than `t_start` have a
            negative T and therefore produce NaN.
        t_start: start of the window; defaults to the global `t_min`.
        t_end: end of the window; defaults to the global `t_max`.
        lower: constant offset of the weight; defaults to the global `l`.

    Returns:
        set of real-valued weights (equal weights collapse).
    """
    start = t_min if t_start is None else t_start
    end = t_max if t_end is None else t_end
    base = l if lower is None else lower
    span = end - start

    weights = set()
    for t in times:
        T = (t - start) / span
        weights.add(base + (1 - base) * np.sqrt(T))

    return weights

In [10]:
# Temporal features with past event aggreagtion (II-A)
# Step B: past event aggregation
def aggregate(weights):
    """Collapse the weights of one node pair into eight summary statistics.

    q-quantiles are values that partition a finite set of values into q subsets
    of (nearly) equal sizes; following that reading the function reports, in
    this order:

        [minimum, maximum, median (2-quantile), first tertile (3-quantile),
         first quartile (4-quantile), sum, mean, variance]

    Args:
        weights: non-empty collection (set, list, ...) of numeric weights.

    Returns:
        list of the eight statistics listed above.

    Raises:
        ValueError: if `weights` is empty.
    """
    warr = np.array(list(weights))
    if warr.size == 0:
        raise ValueError("aggregate() needs at least one weight")

    lowest = warr.min()
    highest = warr.max()
    median = np.median(warr)
    # A tertile cuts at one third of the data; the literal 0.3 used before was
    # only an approximation of it.
    tertile = np.quantile(warr, 1 / 3)
    quartile = np.quantile(warr, 0.25)

    # Local names avoid shadowing the built-in sum().
    total = np.sum(warr)
    mean = np.mean(warr)
    variance = np.var(warr)

    return [lowest, highest, median, tertile, quartile, total, mean, variance]

In [None]:
# Temporal features with past event aggregation (II-A)
# aggregated maps (node1, node2) -> 24 numbers: the eight statistics returned by
# aggregate() for the linear weights, then for the exponential weights, then for
# the square-root weights.
aggregated = {}
visited = set()

for node in V:
    visited.add(node)
    for adj, stamps in matrix_t[node].items():
        if adj in visited:
            # the pair is already stored under (adj, node)
            continue

        # weight the timestamps of this pair with each of the three schemes
        per_scheme = (weight_linear(stamps), weight_exp(stamps), weight_square(stamps))

        res = []
        for scheme_weights in per_scheme:
            res += aggregate(scheme_weights)

        aggregated[(node, adj)] = res

In [None]:
# Sanity check: length of the vector stored for the pair (1, 2). Each entry is
# built from three aggregate() results of eight values each, so 24 is expected.
# Raises KeyError if the pair (1, 2) is not in `aggregated`.
print(len(aggregated[(1, 2)]))

In [None]:
# Temporal features with past event aggreagtion (II-A)
# Step C: weighted topological features

def get_aggregated(node, z, cat):
    """Return entry `cat` of the aggregated vector of the pair {node, z}.

    Pairs are looked up under (smaller id, larger id), so the two endpoints may
    be passed in either order.
    """
    key = (node, z) if node < z else (z, node)
    return aggregated[key][cat]

def AA_tmp(parent, child, commons, category):
    """Weighted Adamic-Adar score of the pair (parent, child).

    For every common neighbour z the contribution is

        (w(parent, z) + w(child, z)) / log10(1 + sum of w(z, x) over neighbours x of z)

    where w(a, b) is entry `category` of the aggregated vector of the pair.

    Args:
        parent, child: the two endpoints of the candidate edge.
        commons: the common neighbours of `parent` and `child`.
        category: index into the aggregated vector.

    Returns:
        The sum of the contributions (0 when there are no common neighbours).
    """
    res = 0

    for z in commons:
        # get_aggregated() orders the pair itself, so z may be smaller or larger
        # than parent/child.
        num = get_aggregated(parent, z, category)
        num += get_aggregated(child, z, category)

        denum = 1
        for x in matrix[z]:
            denum += get_aggregated(z, x, category)

        log_denum = np.log10(denum)
        if log_denum == 0:
            # All aggregates around z are zero (e.g. the variance of single-event
            # pairs): log10(1) == 0 would turn the feature into inf/NaN, so this
            # neighbour contributes nothing instead.
            continue

        res += num / log_denum

    return res

def CN_tmp(parent, child, commons, category):
    """Weighted common-neighbours score of the pair (parent, child).

    Adds up, over every common neighbour z, entry `category` of the aggregated
    vectors of (parent, z) and of (child, z).
    """
    total = 0

    for z in commons:
        for endpoint in (parent, child):
            total += get_aggregated(endpoint, z, category)

    return total

def JC_tmp(parent, child, commons, category):
    """Weighted Jaccard score of the pair (parent, child).

    Each common neighbour z contributes (w(parent, z) + w(child, z)) divided by
    the total weight of all edges incident to `parent` plus all edges incident
    to `child`, where w(a, b) is entry `category` of the aggregated vector.

    Args:
        parent, child: the two endpoints of the candidate edge.
        commons: the common neighbours of `parent` and `child`.
        category: index into the aggregated vector.

    Returns:
        The summed contributions; 0 when there are no common neighbours or when
        the total incident weight is zero.
    """
    if not commons:
        return 0

    # The denominator does not depend on z, so it is computed once instead of
    # once per common neighbour.
    denum = 0
    for x in matrix[parent]:
        denum += get_aggregated(parent, x, category)
    for x in matrix[child]:
        denum += get_aggregated(child, x, category)

    if denum == 0:
        # Every incident aggregate is zero (e.g. variance of single-event pairs);
        # dividing would yield NaN/inf in the feature vector.
        return 0

    res = 0
    for z in commons:
        num = get_aggregated(parent, z, category)
        num += get_aggregated(child, z, category)
        res += num / denum

    return res

def PA_tmp(parent, child, commons, category):
    """Weighted preferential-attachment score of the pair (parent, child).

    Multiplies the total incident weight of `parent` by that of `child`, where a
    node's total is the sum of entry `category` over all of its edges.
    `commons` is accepted for signature parity with the other scores and is not
    used.
    """
    totals = []

    for endpoint in (parent, child):
        incident = 0
        for neighbour in matrix[endpoint]:
            incident += get_aggregated(endpoint, neighbour, category)
        totals.append(incident)

    return totals[0] * totals[1]

In [None]:
# Temporal features with past event aggregation (II-A)
# Step C: weighted topological features;
# feature maps (parent, child) -> 96 values: for each of the 24 aggregated
# categories, the four scores AA, CN, JC, PA in that order.
feature = {} # feature vector; contains 96 values

for parent, child in aggregated.keys():
    # The common neighbours do not depend on the category, so they are computed
    # once per pair rather than 24 times. (The per-pair debug print was dropped:
    # it emitted one line for every edge of the graph.)
    commons = matrix[parent].intersection(matrix[child])

    vector = []
    for category in range(24):
        vector.append(AA_tmp(parent, child, commons, category))
        vector.append(CN_tmp(parent, child, commons, category))
        vector.append(JC_tmp(parent, child, commons, category))
        vector.append(PA_tmp(parent, child, commons, category))

    feature[(parent, child)] = vector

In [None]:
# Sanity check: show the feature vector stored for the pair (1, 2).
# Raises KeyError if that pair is not a key of `feature`.
print(feature[(1,2)])