In [None]:
import math
import random
from collections import deque

import numpy as np
import pandas as pd

# Importing dataset

In [None]:
# Candidate edge-list files; `current` below selects which one is loaded.
datasets = [
    "data/opsahl-ucsocial/out.opsahl-ucsocial",
    "data/soc-sign-bitcoinalpha/out.soc-sign-bitcoinalpha",
    "data/soc-sign-bitcoinotc/out.soc-sign-bitcoinotc",
    "data/digg-friends/out.digg-friends",
    "data/prosper-loans/out.prosper-loans"]
# Row numbers to skip at the top of each file, index-aligned with `datasets`.
skiprows = [[0, 1], [0], [0], [0], [0]]
current = 0 # 3 is big n (n = 270000), 4 is big volume (n = 90000)
# Each remaining row is read as one edge: source, target, weight, timestamp.
graph = pd.read_csv(
    datasets[current],
    names=["_from", "_to", "_weight", "_timestamp"],
    # Regex separator: a single space or a tab (a regex separator needs the python engine).
    sep=" |\t",
    engine ='python',
    skiprows=skiprows[current]
)
# print(graph.dtypes)
# print(graph.head())

In [None]:
# Vertex set: every id that occurs as an endpoint of at least one edge.
# pd.concat is the public API; Series._append is a private pandas method
# and may change or disappear without notice.
V = np.unique(pd.concat([graph["_from"], graph["_to"]]))
n = V.size  # number of distinct vertices
volume = graph["_timestamp"].size  # number of rows (edge records) in the dataset
print(f"{n = }, {volume = }")

# Chapter 1

## Preparing static graph

In [None]:
# Undirected adjacency structure: graph_static[v] is the set of neighbours of v.
# The list has n + 1 slots and is indexed directly by vertex id.
graph_static = [set() for _ in range(n + 1)]
for source, target in zip(graph["_from"], graph["_to"]):
    # Self-loops are ignored; repeated edges collapse because neighbours are sets.
    if source != target:
        graph_static[source].add(target)
        graph_static[target].add(source)


## Task 1.1

In [None]:
# Each undirected edge is stored in two adjacency sets, so the degree sum
# counts every edge twice.
E_count = sum(len(neighbours) for neighbours in graph_static) // 2

# Density: share of the n * (n - 1) / 2 possible vertex pairs that are edges.
density = 2 * E_count / (n * (n - 1))

In [None]:
# Split the vertex set into connected components: repeatedly pick any
# unassigned vertex and collect everything reachable from it.
V_to_visit = set(V)
connectivity_components = []
while V_to_visit:
    seed = next(iter(V_to_visit))
    V_seen = {seed}
    stack = [seed]
    while stack:
        u = stack.pop()
        for v in graph_static[u].difference(V_seen):
            V_seen.add(v)
            stack.append(v)
    V_to_visit = V_to_visit - V_seen
    connectivity_components.append(V_seen)

# Locate the largest component and the share of all vertices it contains.
sizes = [len(members) for members in connectivity_components]
max_component_size = max(sizes)
max_connectivity_component_index = sizes.index(max_component_size)
proportion = max_component_size / len(V)


In [None]:
# 1.1
# Summary of the static graph. The last field now uses a plain "%f" like the
# density field; the former "% f" (space flag) printed a stray extra blank.
print("|V| = %i, |E| = %i, p = %f, number of components = %i, max component size = %i, max component proportion = %f"
      % (n, E_count, density, len(connectivity_components), max_component_size, proportion))


## Task 1.2

In [None]:
# Exact distance distribution of the largest component: one BFS per vertex.
# Only attempted for small graphs (n < 10000); otherwise diameter stays 0,
# radius stays n + 1 and the histogram stays empty.
component = list(connectivity_components[max_connectivity_component_index])
distances = [0] * (n + 2)  # distances[k] = number of counted pairs at distance k
diameter = 0
radius = n + 1
distances_tmp = np.empty(n + 1, dtype=int)  # per-source distances, indexed by vertex id
if n < 10000:
    for start in component:
        distances_tmp.fill(n + 1)  # n + 1 marks "not reached from start"
        V_visited = set()
        # deque gives O(1) pops from the left; list.pop(0) is O(len(queue)).
        queue = deque([(start, 0)])
        queued = {start}
        depth = 0
        while queue:
            u, depth = queue.popleft()
            queued.remove(u)
            V_visited.add(u)
            for v in graph_static[u].difference(V_visited):
                distances_tmp[v] = min(distances_tmp[v], depth + 1)
                if v not in queued:
                    queue.append((v, depth + 1))
                    queued.add(v)
        # Count each unordered pair once by only looking at ids >= start.
        # `start` itself and unreached ids fall into slot n + 1.
        for d in range(start, n + 1):
            distances[distances_tmp[d]] += 1
        # `depth` is the depth of the last dequeued vertex, i.e. the
        # eccentricity of `start`.
        diameter = max(diameter, depth)
        radius = min(radius, depth)

# Sanity check: the histogram should cover every unordered pair of the component.
all_dist = sum(distances[:diameter + 1])
print(all_dist, "=", max_component_size * (max_component_size - 1) // 2,
      all_dist == max_component_size * (max_component_size - 1) // 2)


In [None]:
# 90th percentile of the pairwise distances, read off the histogram:
# the smallest distance whose cumulative pair count reaches 90% of all pairs.
percentile_90_ind = int(0.9 * all_dist)
percentile_90 = 0
ind_tmp = 0
for i, pair_count in enumerate(distances[:diameter + 1]):
    ind_tmp += pair_count
    if ind_tmp >= percentile_90_ind:
        percentile_90 = i
        break

In [None]:
def calculate_matrix(vertices, adjacency=None, unreachable=None):
    """Compute pairwise shortest-path distances between the given vertices.

    A breadth-first search is run from every vertex in ``vertices`` over the
    whole graph described by ``adjacency``; only distances between members of
    ``vertices`` are recorded.

    Parameters
    ----------
    vertices : iterable
        Vertex ids to compute distances between.
    adjacency : indexable of sets, optional
        ``adjacency[u]`` is the set of neighbours of ``u``. Defaults to the
        notebook-level ``graph_static``.
    unreachable : int, optional
        Value stored for pairs with no connecting path. Defaults to ``n + 1``
        using the notebook-level ``n``.

    Returns
    -------
    tuple
        ``(distance_matrix, eccentricities, distances)`` where
        ``distance_matrix[u][v]`` is the distance for ``u != v``,
        ``eccentricities[u]`` is the largest entry of row ``u`` (0 for a
        single-vertex input), and ``distances`` lists one value per pair
        with ``u > v``.
    """
    if adjacency is None:
        adjacency = graph_static
    if unreachable is None:
        unreachable = n + 1

    vertices_list = list(vertices)

    distance_matrix = {
        u: {v: unreachable for v in vertices_list if v != u}
        for u in vertices_list
    }

    for start in vertices_list:
        row = distance_matrix[start]
        # Entries already filled by symmetry from earlier searches are exact,
        # so only the still-unknown targets keep this search running.
        pending = {v for v, known in row.items() if known == unreachable}
        visited = set()
        queue = deque([(start, 0)])
        queued = {start}
        while queue and pending:
            u, depth = queue.popleft()
            queued.discard(u)
            visited.add(u)
            for v in adjacency[u].difference(visited):
                if v in pending:
                    # First time a vertex is met in a FIFO search is along a
                    # shortest path.
                    row[v] = depth + 1
                    distance_matrix[v][start] = depth + 1
                    pending.discard(v)
                if v not in queued:
                    queue.append((v, depth + 1))
                    queued.add(v)

    distances = [
        distance_matrix[u][v]
        for u in vertices_list
        for v in vertices_list
        if u > v
    ]
    # default=0 keeps a single-vertex sample from raising on an empty row.
    eccentricities = {
        u: max(row.values(), default=0) for u, row in distance_matrix.items()
    }

    return (distance_matrix, eccentricities, distances)


In [None]:
# Estimate diameter, radius and the 90th distance percentile from uniformly
# sampled vertices of the largest component.
# A dedicated, seeded generator makes the samples reproducible on re-run.
RANDOM_SEED = 42
sampler = random.Random(RANDOM_SEED)

component = list(connectivity_components[max_connectivity_component_index])

# Cap the sample size so a component smaller than the target does not make
# the sampler raise ValueError.
random_500_vertices = sorted(sampler.sample(component, min(500, len(component))))
random_1000_vertices = sorted(sampler.sample(component, min(1000, len(component))))

random_500_matrix, random_500_eccentricities, random_500_distances = calculate_matrix(
    random_500_vertices)
random_1000_matrix, random_1000_eccentricities, random_1000_distances = calculate_matrix(
    random_1000_vertices)

diameter_from_random_500 = max(random_500_eccentricities.values())
radius_from_random_500 = min(random_500_eccentricities.values())
percentile_90_from_random_500 = np.percentile(random_500_distances, 90)

print(f"{diameter_from_random_500 = }, {radius_from_random_500 = }, {percentile_90_from_random_500 = }")

diameter_from_random_1000 = max(random_1000_eccentricities.values())
radius_from_random_1000 = min(random_1000_eccentricities.values())
percentile_90_from_random_1000 = np.percentile(random_1000_distances, 90)

print(f"{diameter_from_random_1000 = }, {radius_from_random_1000 = }, {percentile_90_from_random_1000 = }")

                    

In [None]:
def snowball(limit, seeds=None, adjacency=None):
    """Grow a vertex sample by repeatedly adding neighbours of sampled vertices.

    Starting from ``seeds``, whole neighbourhoods are merged in until the
    sample holds at least ``limit`` vertices. Because a neighbourhood is added
    in one piece, the result may contain more than ``limit`` vertices.

    Parameters
    ----------
    limit : int
        Minimum sample size to aim for.
    seeds : iterable, optional
        Starting vertices. Defaults to the first two entries (or fewer, if
        there are fewer) of the notebook-level ``component`` list.
    adjacency : indexable of sets, optional
        ``adjacency[v]`` is the set of neighbours of ``v``. Defaults to the
        notebook-level ``graph_static``.

    Returns
    -------
    list
        The sampled vertices in ascending order. If everything reachable from
        the seeds is smaller than ``limit``, all of it is returned.
    """
    if adjacency is None:
        adjacency = graph_static
    if seeds is None:
        # Slicing avoids an IndexError when the component has a single vertex.
        seeds = component[:2]

    vertices = set(seeds)
    while len(vertices) < limit:
        size_before = len(vertices)
        # Iterate over the sample as it was at the start of the round;
        # `vertices` is rebound to a new, larger set inside the loop.
        for v in vertices:
            if len(vertices) < limit:
                vertices = vertices.union(adjacency[v])
        if len(vertices) == size_before:
            # Nothing new is reachable: stop instead of looping forever.
            break
    return sorted(vertices)

# Estimate diameter, radius and the 90th distance percentile from snowball
# samples grown to at least 500 and 1000 vertices.
snowball_500_vertices = snowball(500)
(snowball_500_matrix,
 snowball_500_eccentricities,
 snowball_500_distances) = calculate_matrix(snowball_500_vertices)

snowball_1000_vertices = snowball(1000)
(snowball_1000_matrix,
 snowball_1000_eccentricities,
 snowball_1000_distances) = calculate_matrix(snowball_1000_vertices)

# Smaller sample.
percentile_90_from_snowball_500 = np.percentile(snowball_500_distances, 90)
radius_from_snowball_500 = min(snowball_500_eccentricities.values())
diameter_from_snowball_500 = max(snowball_500_eccentricities.values())

print(f"{diameter_from_snowball_500 = }, {radius_from_snowball_500 = }, {percentile_90_from_snowball_500 = }")

# Larger sample.
percentile_90_from_snowball_1000 = np.percentile(snowball_1000_distances, 90)
radius_from_snowball_1000 = min(snowball_1000_eccentricities.values())
diameter_from_snowball_1000 = max(snowball_1000_eccentricities.values())

print(f"{diameter_from_snowball_1000 = }, {radius_from_snowball_1000 = }, {percentile_90_from_snowball_1000 = }")


In [None]:
# 1.2
# Exact values from the all-pairs BFS cell (label typo "raduis" corrected).
print("diameter = %i, radius = %i, percentile_90 = %i"
      % (diameter, radius, percentile_90))

## Task 1.3

In [None]:
# Local clustering coefficient of every vertex in the largest component,
# followed by the component-wide average.
component = list(connectivity_components[max_connectivity_component_index])

Cl = dict()
for u in component:
    neighbours = graph_static[u]
    degree = len(neighbours)

    if degree >= 2:
        # Each edge between two neighbours of u is seen from both of its
        # endpoints, so this total is twice the number of such edges.
        doubled_links = sum(
            len(graph_static[w].intersection(neighbours)) for w in neighbours
        )
        Cl[u] = doubled_links / (degree * (degree - 1))
    else:
        # Fewer than two neighbours: no pair of neighbours exists.
        Cl[u] = 0

Cl_average = sum(Cl.values()) / len(Cl)

In [None]:
# 1.3: average local clustering coefficient of the largest component
print("Cl_average = %f" % Cl_average)

## Task 1.4

In [None]:
# Degree correlation coefficient built from four sums over the static graph:
#   R1 = sum of degrees, R2 = sum of squared degrees, R3 = sum of cubed degrees,
#   Re = sum of k_i * k_j over every ordered pair of adjacent vertices (i, j).
R1 = 0
R2 = 0
R3 = 0
Re = 0
# Walk every adjacency set instead of assuming vertex ids are exactly 1..n:
# empty slots contribute nothing, and a vertex with id 0 is no longer skipped.
for neighbours in graph_static:
    ki = len(neighbours)
    R1 += ki
    R2 += ki**2
    R3 += ki**3
    for j in neighbours:
        kj = len(graph_static[j])
        Re += ki * kj
# NOTE: when every vertex has the same degree the denominator is zero and
# this division raises ZeroDivisionError.
degree_associativity = (Re * R1 - R2**2) / (R3 * R1 - R2**2)

In [None]:
# 1.4: the quantity computed above is the degree assortativity coefficient;
# the printed label previously misspelled it as "associativity".
print("Degree assortativity = %f" % (degree_associativity))

# Chapter 2

## Static topological features

In [None]:
def get_static_topological_features():
    """Compute neighbourhood-based similarity scores for every existing edge.

    Each undirected edge of ``graph_static`` is processed once, keyed as
    ``(u, v)`` with ``u`` being the endpoint met first while walking ``V``.

    Returns
    -------
    tuple of dict
        ``(CN, AA, JC, PA)`` keyed by edge:
        CN - number of common neighbours,
        AA - sum of 1 / log(degree) over the common neighbours,
        JC - common neighbours divided by the size of the neighbourhood union,
        PA - product of the two endpoint degrees.
    """
    common_neighbours = {}
    adamic_adar = {}
    jaccard = {}
    preferential = {}

    processed = set()
    for u in V:
        processed.add(u)
        gamma_u = graph_static[u]
        for v in gamma_u:
            if v in processed:
                # The edge was already handled from its other endpoint.
                continue
            gamma_v = graph_static[v]
            edge = (u, v)

            shared = gamma_u.intersection(gamma_v)
            shared_count = len(shared)
            common_neighbours[edge] = shared_count
            jaccard[edge] = shared_count / len(gamma_u.union(gamma_v))
            preferential[edge] = len(gamma_u) * len(gamma_v)
            adamic_adar[edge] = sum(
                1.0 / np.log(len(graph_static[z])) for z in shared
            )
    return common_neighbours, adamic_adar, jaccard, preferential

CN_static, AA_static, JC_static, PA_static = get_static_topological_features()

## Node activity features

In [None]:
# Smallest and largest values of the "_timestamp" column; get_temporal_weighting
# uses them to rescale a timestamp onto the [t_min, t_max] range.
t_min = graph["_timestamp"].min()
t_max = graph["_timestamp"].max()

In [None]:
def get_temporal_weighting(l, t):
    """
    Node activity features: STEP 1

    Turn a timestamp into three temporal weights. ``t`` is first rescaled
    with the notebook-level ``t_min`` / ``t_max``; each weight then equals
    ``l`` at ``t_min`` and 1 at ``t_max``.

    Returns the tuple (w_linear, w_exponential, w_square_root).
    """
    time_var = (t - t_min) / (t_max - t_min)
    span = 1 - l

    exp_numerator = np.exp(3 * time_var) - 1
    exp_denominator = np.exp(3) - 1

    w_linear = l + span * time_var
    w_exponential = l + span * exp_numerator / exp_denominator
    w_square_root = l + span * np.sqrt(time_var)
    return w_linear, w_exponential, w_square_root
    

In [None]:
class AggregationOfNodeActivity:
    """
    Node activity features: STEP 2

    Collects the edge weights seen at every node and offers aggregation
    functions (quantiles, sum, mean) over one node's weights. The aggregation
    functions accept the sets produced by ``get_weights`` as well as lists or
    arrays.
    """
    @staticmethod
    def get_weights() -> list[set]:
        """
        List with the sets of weights from all edges adjacent to the node.

        Reads the notebook-level ``graph`` and ``n``. The list has n + 1
        entries and is indexed by vertex id; self-loops are skipped.
        """
        weights = [set() for _ in range(n + 1)]
        for _index, _from, _to, _weight, _timestamp in graph.itertuples():
            if _from == _to:
                continue
            weights[_from].add(_weight)
            weights[_to].add(_weight)
        return weights

    @staticmethod
    def _as_sequence(weights):
        """Return ``weights`` in a form numpy can treat as a 1-D sample.

        numpy wraps a set as a single opaque object instead of reading its
        elements, so sets are copied into a list; anything else is passed
        through unchanged.
        """
        if isinstance(weights, (set, frozenset)):
            return list(weights)
        return weights

    @staticmethod
    def zeroth_quantile(weights):
        """Minimum of the weights (quantile 0)."""
        return np.quantile(AggregationOfNodeActivity._as_sequence(weights), 0)

    @staticmethod
    def first_quantile(weights):
        """Lower quartile of the weights (quantile 0.25)."""
        return np.quantile(AggregationOfNodeActivity._as_sequence(weights), 0.25)

    @staticmethod
    def second_quantile(weights):
        """Median of the weights (quantile 0.5)."""
        return np.quantile(AggregationOfNodeActivity._as_sequence(weights), 0.50)

    @staticmethod
    def third_quantile(weights):
        """Upper quartile of the weights (quantile 0.75)."""
        return np.quantile(AggregationOfNodeActivity._as_sequence(weights), 0.75)

    @staticmethod
    def fourth_quantile(weights):
        """Maximum of the weights (quantile 1)."""
        return np.quantile(AggregationOfNodeActivity._as_sequence(weights), 1)

    @staticmethod
    def get_sum(weights):
        """Sum of the weights."""
        return sum(weights)

    @staticmethod
    def get_mean(weights):
        """Arithmetic mean of the weights."""
        return np.mean(AggregationOfNodeActivity._as_sequence(weights))
        

In [None]:
class CombiningNodeActivity:
    """
    Node activity features: STEP 3

    Ways of combining two per-node values ``a`` and ``b`` into one value.
    """
    @staticmethod
    def get_sum(a, b):
        """Return a + b."""
        return a + b

    @staticmethod
    def get_absolute_differrence(a, b):
        """Return |a - b|."""
        # The math module has no abs(); the built-in abs() is the right call.
        return abs(a - b)

    @staticmethod
    def get_minimum(a, b):
        """Return the smaller of a and b."""
        return min(a, b)

    @staticmethod
    def get_maximum(a, b):
        """Return the larger of a and b."""
        return max(a, b)

In [None]:
class AggregationOfNodeActivity:
    """
    Node activity features: STEP 2

    Per-node collection of edge weights plus aggregation functions over one
    node's weights. The aggregations work on the sets returned by
    ``get_weights`` and on ordinary lists or arrays alike.
    """
    @staticmethod
    def get_weights() -> list[set]:
        """
        List with the sets of weights from all edges adjacent to the node.

        Built from the notebook-level ``graph``; the list holds n + 1 sets
        indexed by vertex id, and self-loops are ignored.
        """
        weights = [set() for _ in range(n + 1)]
        for _index, _from, _to, _weight, _timestamp in graph.itertuples():
            if _from == _to:
                continue
            weights[_from].add(_weight)
            weights[_to].add(_weight)
        return weights

    @staticmethod
    def _as_sequence(weights):
        """Copy a set into a list so numpy reads its elements; pass others through."""
        if isinstance(weights, (set, frozenset)):
            return list(weights)
        return weights

    @staticmethod
    def zeroth_quantile(weights):
        """Quantile 0 of the weights, i.e. their minimum."""
        return np.quantile(AggregationOfNodeActivity._as_sequence(weights), 0)

    @staticmethod
    def first_quantile(weights):
        """Quantile 0.25 of the weights."""
        return np.quantile(AggregationOfNodeActivity._as_sequence(weights), 0.25)

    @staticmethod
    def second_quantile(weights):
        """Quantile 0.5 of the weights, i.e. their median."""
        return np.quantile(AggregationOfNodeActivity._as_sequence(weights), 0.50)

    @staticmethod
    def third_quantile(weights):
        """Quantile 0.75 of the weights."""
        return np.quantile(AggregationOfNodeActivity._as_sequence(weights), 0.75)

    @staticmethod
    def fourth_quantile(weights):
        """Quantile 1 of the weights, i.e. their maximum."""
        return np.quantile(AggregationOfNodeActivity._as_sequence(weights), 1)

    @staticmethod
    def get_sum(weights):
        """Total of the weights."""
        return sum(weights)

    @staticmethod
    def get_mean(weights):
        """Mean of the weights."""
        return np.mean(AggregationOfNodeActivity._as_sequence(weights))
        

In [None]:
class CombiningNodeActivity:
    """
    Node activity features: STEP 3

    Binary operations that merge the activity values of two nodes.
    """

    @staticmethod
    def get_sum(a, b):
        """Add the two values."""
        total = a + b
        return total

    @staticmethod
    def get_absolute_differrence(a, b):
        """Magnitude of the gap between the two values."""
        gap = a - b
        return abs(gap)

    @staticmethod
    def get_minimum(a, b):
        """Smaller of the two values."""
        return min(a, b)

    @staticmethod
    def get_maximum(a, b):
        """Larger of the two values."""
        return max(a, b)

In [None]:
class CombiningNodeActivity:
    """
    Node activity features: STEP 3

    Pairwise combiners for two node-level values; every method takes the two
    values and returns a single combined value.
    """

    @staticmethod
    def get_sum(a, b):
        """a plus b."""
        combined = a + b
        return combined

    @staticmethod
    def get_absolute_differrence(a, b):
        """Absolute value of a minus b."""
        delta = a - b
        return abs(delta)

    @staticmethod
    def get_minimum(a, b):
        """The lesser of a and b."""
        return min(a, b)

    @staticmethod
    def get_maximum(a, b):
        """The greater of a and b."""
        return max(a, b)