In [62]:
import pandas as pd
import numpy as np
import math 
import random

In [63]:
def bfs(adjList, start, l = math.inf):
    visited, queue = set(), [start]
    while queue:
        vertex = queue.pop(0)
        if(len(visited) < l):
          if vertex not in visited:
              visited.add(vertex)
              queue.extend(set(i['to'] for i in adjList[vertex]) - visited)
    return visited

## **Импорт данных**

In [64]:
from itertools import takewhile
filepath = './datasets/'
dataset = 'emails.csv'
headers = ['From',"To", 'weight', 'timestamp']

with open(filepath + dataset) as f:
  skip_rows = len(list(
      takewhile(lambda s: s.startswith("%"), f)
  ))

df = pd.read_csv(filepath + dataset, names=headers, skiprows=skip_rows, index_col=False, delimiter=r'\s+')\
.drop(columns=["weight"])\
.sort_values(by="timestamp")

# **Характеристики графа:**

## **Число вершин**

In [65]:
V_num = pd.unique(df[['From', 'To']].values.ravel()).size
print(V_num)

167


## **Число рёбер**

In [66]:
adjList = [[] for _ in range(V_num + 1)]
E_num = 0 # количество ребер
for line in df.itertuples():
  [From, To, time] = [line.From, line.To, line.timestamp]
  if From != To and To not in [i['to'] for i in adjList[From]]: 
    adjList[From].append({'to' : To, 'time': time})
    adjList[To].append({'to' : From, 'time': time})
    E_num += 1
print(E_num)
  

3250


## **Плотность**

In [67]:
density = E_num / (V_num*(V_num - 1) / 2)
print(density)

0.2344708174013419


## **Статистика по КСС**

In [68]:
Vertexes = set(range(1, V_num))
count = 0
max_size = 0
max_WCC = set()

while len(Vertexes) > 0:
  WCC = bfs(adjList ,Vertexes.pop())
  size = len(WCC)
  if size > max_size:
    max_size = size
    max_WCC = WCC
  count += 1
  Vertexes -= WCC

print("Количество компонент слабой связности: ", count)
print("Наибольшая КСС: ", max_WCC)
print("Мощность наибольшей КСС: ", max_size)
print("Отношение мощности наибольшей КСС к общему количеству вершин: ", max_size / V_num)


Количество компонент слабой связности:  1
Наибольшая КСС:  {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167}
Мощность наибольшей КСС:  167
Отношение мощности наибольшей КСС к общему количеству вершин:  1.0


## **Средний кластерный коэффициент**

In [69]:
def local_clustering_coefficient(adjList, u):
    #print("u =", u, end=": ")

    # сет соседей вершины u
    adj2u_set = set()
    for v_tuple in adjList[u]:
        to = v_tuple['to']
        adj2u_set.add(to)

    gamma_u = len(adj2u_set)
    #print("length of adj2u =", gamma_u, end=", ")
    if (gamma_u < 2):
        #print("Only one adj node => Return 0")
        return 0

    L_u = countEdgesInSet(adjList, adj2u_set)

    #print("L_u =", L_u, end=", ")

    Cl_u = 2 * L_u / (gamma_u * (gamma_u - 1))

    #print("Cl_u =", Cl_u)

    return Cl_u

# считает количество ребер между вершинами в сете
def countEdgesInSet(adjList, vertexes_set):
    sum = 0
    for node in vertexes_set:
        for neighbour_node in adjList[node]:
            to = neighbour_node['to'] # neighbour_node - это tuple(to, time)
            if to in vertexes_set:
                sum+=1
    return sum // 2

sum_Cl = 0
for vertex in max_WCC:
    sum_Cl+=local_clustering_coefficient(adjList, vertex)

average_Cl = sum_Cl / len(max_WCC)

print("Средний кластерный коэффициент для наибольшей КСС:", average_Cl)

Средний кластерный коэффициент для наибольшей КСС: 0.5918632085486949


## **Коэффициент корреляции Пирсона**

In [70]:
from statistics import mean

def pearson_correlation(adjList):
  X = []
  Y = []

  for i in range(1, len(adjList)):
    for j in adjList[i]:
      X.append(len(adjList[i]))
      Y.append(len(adjList[j['to']]))

  avgX = mean(X)
  avgY = mean(Y)

  XminusAvgX = [i - avgX for i in X]
  YminusAvgY = [i - avgY for i in Y]

  Z = [i*j for i,j in zip(XminusAvgX, YminusAvgY)]

  XminusAvgX_sq = [i**2 for i in XminusAvgX]
  YminusAvgY_sq = [i**2 for i in YminusAvgY]

  amounts = [sum(Z), sum(XminusAvgX_sq), sum(YminusAvgY_sq)]

  print(np.corrcoef(X, Y)[0][1])
  return amounts[0]/((amounts[1]*amounts[2])**0.5)

PC = pearson_correlation(adjList)

print("my_pearson_correlation:!", PC)

-0.29517729789302094
my_pearson_correlation:! -0.29517729789301744


## **Вычисление диаметра, радиуса, рассояний, 90-го персентиля**

In [126]:
def Perc(l, p):
  l.sort()
  if p < 100:
    return l[math.floor(len(l) * p / 100)]
  elif p == 100:
    return l[-1]
  return -1

#алгоритм Флойда-Уоршелла
def floyd_warshall(adjMat):
  n = len(adjMat)
  disM = adjMat.copy()
  for k in range(1, n):
    for i in range(1, n):
      for j in range(1, n):
        disM[i][j] = min(disM[i][j], disM[i][k] + disM[k][j]);
  return disM

def calcRadDimPerc(disM):
  rad = math.inf
  dim = 0
  dist = []
#Диаметр и радиус
  for i in range(1, n):
    max = 0
    for j in range(1, n):
      if disM != math.inf:
        if disM[i][j] > max:
          max = disM[i][j]
        dist.append(disM[i][j])
    if max != 0:
      if max < rad:
        rad = max
      if max > dim:
        dim = max
  perc90 = Perc(dist, 90)
  print('radius :', rad)
  print('diameter :', dim)
  print('perc90 :', perc90)
  return [rad, dim, perc90]

In [127]:
n = len(adjList)
#Матрица смежности
adjL = [[math.inf for i in range(n)] for _ in range(n)]
for i in range(n):
  adjL[i][i] = 0
  if i in max_WCC:
    for j in adjList[i]:
      if j['to'] in max_WCC:
        adjL[i][j['to']] = 1

disM = floyd_warshall(adjL)

## **Снефжный ком, случайные выборки**

In [73]:
#2a, 2b
def rand_sample_metrics(adjList, max_comp, size):
  if len(max_comp) > size:
    rand_sample = random.sample(max_comp, size)
  else:
    rand_sample = max_comp
  calcRadDimPerc(adjList, rand_sample)

def snowball(adjList, max_comp, size):
  if len(max_comp) > size:
    sample = bfs(adjList, random.sample(max_comp, 3), size)
  else:
    sample = max_comp
  calcRadDimPerc(adjList, sample)

## **Результаты по характеристикам**

In [20]:
print(
    f'Характеристика датасета {dataset}: \n'\
    f'Число вершин: {V_num} \n'
    f'Число рёбер: {E_num} \n'\
    f'Плотность: {density} \n'\
    f'Количество компонент слабой связности: {count} \n'\
    f'Наибольшая КСС: {max_WCC}'
    #...
)

Характеристика датасета emails.csv: 
Число вершин: 167 
Число рёбер: 3250 
Плотность: 0.2344708174013419 
Количество компонент слабой связности: 1 
Наибольшая КСС: {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167}


# **Графики...**

# **Статические признаки**

In [216]:
# Common Neighbours (CN) - количество общих соседей у двух вершин
def common_neighbours(adjList, u, v):
    set_u = set(i['to'] for i in adjList[u])
    set_v = set(i['to'] for i in adjList[v])
    return len(set_u.intersection(set_v))

# Adamic-Adar (AA) - сумма обратных логарифмов степеней общих соседей
def adamic_adar(adjList, u, v):
    set_u = set(i['to'] for i in adjList[u])
    set_v = set(i['to'] for i in adjList[v])
    common = set_u.intersection(set_v)
    aa = 0
    for c in common:
        degree = len(adjList[c])
        if degree > 1:
            aa += 1 / math.log(degree)
    return aa

# Jaccard Coefficient (JC) - отношение числа общих соседей к числу уникальных соседей
def jaccard_coefficient(adjList, u, v):
    set_u = set(i['to'] for i in adjList[u])
    set_v = set(i['to'] for i in adjList[v])
    intersection = set_u.intersection(set_v)
    union = set_u.union(set_v)
    if len(union) == 0:
        return 0
    return len(intersection) / len(union)

# Preferential Attachment (PA) - произведение степеней двух вершин
def preferential_attachment(adjList, u, v):
    return len(adjList[u]) * len(adjList[v])



def get_static_feat(adjList, u, v):
  return [common_neighbours(adjList, u, v), adamic_adar(adjList, u, v), jaccard_coefficient(adjList, u, v), preferential_attachment(adjList, u, v)]

# Создаем пустой словарь для хранения признаков для каждой пары вершин
features = {}

# Итерируемся по всем парам вершин в наибольшей КСС
for u in max_WCC:
    for v in max_WCC:
        if u != v:
            # Проверяем, что еще не вычисляли признаки для этой пары вершин
            if (u, v) not in features and (v, u) not in features:
                # Считаем признаки для пары вершин
                cn = common_neighbours(adjList, u, v)
                aa = adamic_adar(adjList, u, v)
                jc = jaccard_coefficient(adjList, u, v)
                pa = preferential_attachment(adjList, u, v)
                # Сохраняем результаты в словаре
                features[(u, v)] = {'CN': cn, 'AA': aa, 'JC': jc, 'PA': pa}
    
# Выводим признаки для первых 10 пар вершин в словаре
for pair, feature in list(features.items())[:10]:
    print(pair, feature)

# на случайной паре вершин
n = random.randint(5, len(max_WCC))
randVertexes = random.sample(max_WCC, n)
u = random.choice(randVertexes)
v = random.choice(randVertexes)
print("u =", u)
print("v =", v)
print("CN =", common_neighbours(adjList, u, v))
print("AA =", adamic_adar(adjList, u, v))
print("JC =", jaccard_coefficient(adjList, u, v))
print("PA =", preferential_attachment(adjList, u, v))

(1, 2) {'CN': 67, 'AA': 16.885895339959788, 'JC': 0.5075757575757576, 'PA': 8970}
(1, 3) {'CN': 67, 'AA': 17.25814041726422, 'JC': 0.5114503816793893, 'PA': 8840}
(1, 4) {'CN': 64, 'AA': 16.228125237448655, 'JC': 0.48854961832061067, 'PA': 8450}
(1, 5) {'CN': 53, 'AA': 13.257775503821485, 'JC': 0.40458015267175573, 'PA': 7020}
(1, 6) {'CN': 47, 'AA': 12.28103918506532, 'JC': 0.35877862595419846, 'PA': 6240}
(1, 7) {'CN': 75, 'AA': 18.869077030932047, 'JC': 0.5725190839694656, 'PA': 9880}
(1, 8) {'CN': 56, 'AA': 13.9686378779357, 'JC': 0.42748091603053434, 'PA': 7410}
(1, 9) {'CN': 61, 'AA': 14.909584470403765, 'JC': 0.4621212121212121, 'PA': 8190}
(1, 10) {'CN': 60, 'AA': 15.184497124972571, 'JC': 0.4580152671755725, 'PA': 7930}
(1, 11) {'CN': 24, 'AA': 5.842502263248883, 'JC': 0.183206106870229, 'PA': 3250}
u = 162
v = 26
CN = 0
AA = 0
JC = 0.0
PA = 54


since Python 3.9 and will be removed in a subsequent version.
  randVertexes = random.sample(max_WCC, n)


In [217]:
#Вычисление весов (Temporal weighting) features(3) 
l = 0.2
t_min = df['timestamp'].min()
t_max = df['timestamp'].max()
delta_t = t_min + (t_max - t_min) * 0.66

def get_sub_f(t):
  return (t - t_min) / delta_t

def get_w_lin(l, t):
  sub_f = get_sub_f(t)
  return l + (1 - l) * sub_f

def get_w_exp(l, t):
  sub_f = get_sub_f(t)
  return l + (1 - l) * (math.exp(-3 * sub_f) - 1) / (math.pow(math.e, -3) - 1)

def get_w_sqrt(l, t):
  sub_f = get_sub_f(t)
  return l + (1 - l) * math.sqrt(sub_f) 

In [219]:
def get_zeroth(w):
  return Perc(w, 0)

def get_first(w):
  return Perc(w, 25)

def get_second(w):
  return Perc(w, 50)

def get_third(w):
  return Perc(w, 75)

def get_fourth(w):
  return Perc(w, 100)

def get_sum(w):
  return sum(w)

def get_mean(w):
  return mean(w)


def agg_node_activity(w):
  return [get_zeroth(w), get_first(w), get_second(w), get_third(w), get_fourth(w), get_sum(w), get_mean(w)]

In [254]:
def get_sum_1(a, b):
  return a + b
  
def get_abs_diff(a, b):
  return abs(a - b)
  
def get_min(a, b):
  return min(a, b)
  
def get_max(a, b):
  return max(a, b)

def combine(a, b):
  feat = []
  for a, b in zip(a, b):
    feat.extend([get_sum_1(a, b), get_abs_diff(a, b), get_min(a, b), get_max(a, b)])
  return feat

In [281]:
#node_activity
f_mat = {i: dict() for i in range(V_num + 1)}

for _, fr, to, timestamp in df.itertuples():
  if fr == to or timestamp > delta_t:
      continue
  if f_mat[fr].get(to): 
      f_mat[fr][to].append(timestamp)
  else:
      f_mat[fr][to] = [timestamp]
  
  if f_mat[to].get(fr): 
      f_mat[to][fr].append(timestamp)
  else:
      f_mat[to][fr] = [timestamp]
print(f_mat)

{0: {}, 1: {2: [1262454010, 1262604260, 1262604264, 1262645364, 1262734830, 1262734947, 1263245473, 1263292625, 1263292662, 1263849711, 1264454347, 1264977448, 1265059841, 1265146812, 1265146987, 1265666114, 1266269559, 1266578808, 1266579058, 1266873984, 1267426543, 1267428460, 1267517814, 1267518557, 1267569221, 1267569382, 1268090578, 1268722287, 1268726872, 1269297419, 1269897916, 1270072738, 1270160068, 1270245851, 1270245977, 1270587199, 1271106444, 1271255513, 1271255555, 1271711825, 1272320747, 1272922186, 1273009104, 1273096989, 1273097136, 1273527021, 1274130977, 1274737768, 1275342134, 1275432881, 1275515990, 1275516185, 1275945296, 1276551053, 1277156073, 1277288679, 1277288742, 1277764028], 17: [1262454010, 1262607309, 1262607362, 1262645364, 1262734830, 1263245473, 1263849711, 1264454347, 1264977448, 1265029014, 1265029031, 1265059841, 1265146812, 1265619305, 1265666114, 1265787984, 1266269559, 1266503961, 1266504256, 1266563635, 1266568063, 1266873984, 1267426543, 126744

In [282]:
def get_temporal_feat(l, t):
  return [get_w_lin(l, t), get_w_exp(l, t), get_w_sqrt(l, t)]

for x in f_mat.values():
  for to, times in x.items():
    for i, t in enumerate(times):
      x[to][i] = get_temporal_feat(l, t)

In [283]:
node_activity = {i: list() for i in range(V_num + 1)}

for fr, feat in f_mat.items():
  if len(feat) == 0:
    continue
  lin = []
  exp = []
  squar = []
  for to, fw in feat.items():
    lin.extend([w[0] for w in fw])
    exp.extend([w[1] for w in fw])
    squar.extend([w[2] for w in fw])
  for w in (lin, exp, squar):
    node_activity[fr].extend(agg_node_activity(w))

In [290]:
pairs = set()
for u in range(V_num + 1):
    u_adj = set(f_mat[u].keys())
    for v in range(V_num + 1):
        v_adj = set(f_mat[v].keys())
        if u not in f_mat[v].keys() and u_adj.intersection(v_adj):
            if u < v:
                pairs.add((u, v))

In [291]:
con_pairs = list()
non_pairs = list()

for i,j in pairs:
  if j in [k['to'] for k in adjList[i]]:
    con_pairs.append((i, j))
  else:
    non_pairs.append((i, j))
print(len(con_pairs), len(non_pairs))



260 7839


In [292]:
sample_size = 10000

x_t = random.choices(con_pairs, k=sample_size)
x_n = random.choices(non_pairs, k=sample_size)

x = list(x_t)
x.extend(x_n)
y = [1]*sample_size
y.extend([0]*sample_size)

In [293]:
static_feat = dict.fromkeys(x)
temp_feat = dict.fromkeys(x)

for u, v in static_feat.keys():
  static_feat[(u, v)] = get_static_feat(adjList, u, v)
  temp_feat[(u, v)] = combine(
      node_activity[u], node_activity[v]
  )
print(node_activity[42])
print(combine(node_activity[42], node_activity[58]))
print(static_feat)
print(temp_feat)

[0.20008430336625593, 0.2020366838396987, 0.20409168058965746, 0.20674465054555566, 0.2096666674712357, 275.7445342183029, 0.2044066228452956, 0.2002661194321436, 0.20640569881713888, 0.21281959892546515, 0.22102708951002747, 0.22997293610502392, 288.3505592915903, 0.21375134120948158, 0.2082123500293612, 0.2403651715190082, 0.25721314946518825, 0.27345556777021407, 0.2879393767148059, 344.3178190246483, 0.2552393024645278]
[0.4001692978565992, 6.911240873486157e-07, 0.20008430336625593, 0.20008499449034328, 0.40418227418673736, 0.0001089065073399953, 0.2020366838396987, 0.2021455903470387, 0.40764353066527814, 0.0005398305140368043, 0.20355185007562066, 0.20409168058965746, 0.41425154021939803, 0.0007622391282867125, 0.20674465054555566, 0.20750688967384237, 0.4193365677004307, 3.232757959298338e-06, 0.2096666674712357, 0.209669900229195, 428.52803754927174, 122.96103088733398, 152.7835033309689, 275.7445342183029, 0.4089360784423093, 0.00012283275171812158, 0.2044066228452956, 0.2045

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [153]:
from sklearn.model_selection import train_test_split

In [294]:
static_x = []
temp_x = []
for i in x:
  static_x.append(static_feat[i])
  temp_x.append(temp_feat[i])

x_static_train, x_static_test, y_static_train_labels, y_static_test_labels = train_test_split(static_x, y, test_size=0.25, random_state=42)
x_temp_train, x_temp_test, y_temp_train_labels, y_temp_test_labels = train_test_split(temp_x, y, test_size=0.25, random_state=42)

In [185]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [295]:
static_model = LogisticRegression(max_iter=10000)
static_model.fit(x_static_train, y_static_train_labels)
y_pred_static = static_model.predict(x_static_test)
print(metrics.accuracy_score(y_static_test_labels, y_pred_static))

0.7864


In [296]:
temp_model = LogisticRegression(max_iter=10000)
temp_model.fit(x_temp_train, y_temp_train_labels)
y_pred_temp = temp_model.predict(x_temp_test)
print(metrics.accuracy_score(y_temp_test_labels, y_pred_temp))

0.7276
