In [26]:
import random
from collections import deque

import numpy as np
import pandas as pd

# Importing dataset

In [160]:
# Edge-list files available for analysis; `dataset_names` holds the matching
# short labels (same order) used when naming the result file.
datasets = [
    "test_graphs/socfb-Middlebury45.txt",
    "test_graphs/socfb-Reed98.txt",
    "test_graphs/testgraph_1.txt",
    "test_graphs/testgraph_2.txt",
    "test_graphs/testgraph_3.txt",
    "test_graphs/testgraph_4.txt",
    "test_graphs/testgraph_5.txt",
    "test_graphs/testgraph_6.txt",
    "test_graphs/testgraph_7.txt",
    "test_graphs/team_11.txt"
]
dataset_names = [
    "mid",
    "reed",
    "1",
    "2",
    "3",
    "4",
    "5",
    "6",
    "7",
    "team"
]
current = 8 # 0 - 9
# Each row is one edge; fields are separated by a single space or a tab.
graph = pd.read_csv(
    datasets[current],
    names=["_from", "_to"],
    sep=" |\t",
    engine ='python',
)
# Vertex set = every id that occurs as either endpoint. pd.concat is the public
# replacement for the private Series._append; np.unique already returns the ids
# in ascending order, so no extra sort is needed.
V = np.unique(pd.concat([graph["_from"], graph["_to"]])).astype(int)
n = V.size
# Number of rows (edge records) in the input file.
volume = graph["_from"].size
print(f"{n = }, {volume = }")


n = 19428, volume = 96662


# Tools

In [179]:
def write_results(lines: list[str]):
    """Write the report lines to the result file of the current dataset.

    The target is ``test_results/dataset_<name>.txt`` where ``<name>`` is
    ``dataset_names[current]`` (both are notebook globals). The file is opened
    in write mode, so existing content is replaced.

    Args:
        lines (list[str]): Lines to write; a newline is appended to each one.
    """
    with open(f"test_results/dataset_{dataset_names[current]}.txt", "w", encoding="utf-8") as f:
        for line in lines:
            f.write(line + '\n')

# Chapter 1

## Preparing static graph

In [162]:
# Undirected adjacency sets keyed by vertex id; self-loops are dropped.
graph_static = {u: set() for u in V}
for _index, _from, _to in graph.itertuples():
    if _from == _to:
        continue
    # Diagnostic for an endpoint that is missing from the vertex set. The dict
    # has exactly the ids of V as keys, and a dict lookup is O(1), whereas
    # `in V` scans the whole numpy array for every edge.
    if _to not in graph_static:
        print(_from, _to)
    graph_static[_from].add(_to)
    graph_static[_to].add(_from)


## Task 1.1

### |E| and density

In [163]:
# Every undirected edge is stored in two adjacency sets, so the number of
# edges is half of the total degree.
E_count = sum(len(neighbors) for neighbors in graph_static.values()) // 2

# Density: share of all n * (n - 1) / 2 possible vertex pairs that are edges.
density = 2 * E_count / (n * (n - 1))

### Connectivity components

In [164]:
V_to_visit = set(V)
connectivity_components = []
while V_to_visit:
    # Start a traversal from an arbitrary vertex that has no component yet.
    seed = next(iter(V_to_visit))
    V_seen = {seed}
    stack = [seed]
    while stack:
        u = stack.pop()
        # Neighbours that this traversal has not reached so far.
        for v in graph_static[u] - V_seen:
            V_seen.add(v)
            stack.append(v)
    V_to_visit = V_to_visit - V_seen
    connectivity_components.append(V_seen)

# Locate the largest component and the share of all vertices it contains.
sizes = [len(members) for members in connectivity_components]
max_component_size = max(sizes)
max_connectivity_component_index = sizes.index(max_component_size)
proportion = max_component_size / len(V)


In [165]:
# 1.1 -- summary of the static graph
task_1_1_values = (
    n,
    E_count,
    density,
    len(connectivity_components),
    max_component_size,
    proportion,
)
print("|V| = %i, |E| = %i, p = %f, number of components = %i, max component size = %i, max component proportion = %f"
      % task_1_1_values)


|V| = 19428, |E| = 96662, p = 0.000512, number of components = 23, max component size = 19365, max component proportion = 0.996757


## Task 1.2

### Diameter, radius, all distances

In [166]:
component = list(connectivity_components[max_connectivity_component_index])
# distances[d] counts unordered vertex pairs at BFS distance d. Slot n + 1
# collects pairs whose second vertex cannot be reached from the start vertex.
distances = [0] * (n + 2)
diameter = 0
radius = n + 1
# Exact all-pairs BFS is only attempted for graphs below this vertex count.
n_limit = 20000
if n < n_limit:
    for start in component:
        # BFS from `start`; distances_tmp maps each reached vertex to its distance.
        distances_tmp = {start: 0}
        queue = deque([start])  # deque gives O(1) pops from the left
        depth = 0
        while queue:
            u = queue.popleft()
            depth = distances_tmp[u]
            for v in graph_static[u]:
                if v not in distances_tmp:
                    distances_tmp[v] = depth + 1
                    queue.append(v)
        # Count every pair once: only partners with a larger id than `start`.
        for u in V:
            if u > start:
                distances[distances_tmp.get(u, n + 1)] += 1
        # The last vertex taken from the queue is the farthest one, so `depth`
        # is the eccentricity of `start`.
        diameter = max(diameter, depth)
        radius = min(radius, depth)
# Sanity check: the histogram must cover every pair of the largest component.
all_dist = sum(distances[:diameter + 1])
expected_pairs = max_component_size * (max_component_size - 1) // 2
print(all_dist, "=", expected_pairs, all_dist == expected_pairs)


187491930 = 187491930 True


### 90th percentile

In [167]:
# Nearest-rank 90th percentile read off the distance histogram: the smallest
# distance d such that at least 90% of all counted pairs are within d.
# The rank is ceil(0.9 * all_dist) in exact integer arithmetic; flooring it
# could select a distance that covers fewer than 90% of the pairs.
percentile_90_ind = (9 * all_dist + 9) // 10
percentile_90 = 0
ind_tmp = 0  # running number of pairs with distance <= i
for i in range(diameter + 1):
    ind_tmp += distances[i]
    if ind_tmp >= percentile_90_ind:
        percentile_90 = i
        break

### Distance matrix for a set of vertices function definition

In [168]:
def calculate_matrix(vertices):
    """Compute pairwise shortest-path distances between the given vertices.

    A breadth-first search over the global ``graph_static`` is run from each
    vertex. Paths may pass through vertices outside ``vertices``; only the
    distances between members of ``vertices`` are recorded.

    Args:
        vertices: Iterable of vertex ids that are keys of ``graph_static``.

    Returns:
        tuple: ``(distance_matrix, eccentricities, distances)`` where

        * ``distance_matrix[u][v]`` is the distance between distinct ``u`` and
          ``v``, or ``n + 1`` (``n`` is the global vertex count) when ``v`` was
          not reached from ``u``;
        * ``eccentricities[u]`` is the largest entry of ``distance_matrix[u]``
          (``0`` when ``u`` is the only vertex);
        * ``distances`` lists ``distance_matrix[u][v]`` once per pair with
          ``u > v``.
    """
    vertices_list = list(vertices)
    unreachable = n + 1

    distance_matrix = {
        u: {v: unreachable for v in vertices_list if v != u}
        for u in vertices_list
    }

    for start in vertices_list:
        # The graph is undirected, so a distance written by an earlier search
        # is already exact. Only still-unknown targets keep this search going,
        # which lets it stop as soon as all of them have been found.
        pending = {v for v, d in distance_matrix[start].items() if d == unreachable}
        seen = {start}
        queue = deque([(start, 0)])
        while queue and pending:
            u, depth = queue.popleft()
            for v in graph_static[u]:
                if v in seen:
                    continue
                seen.add(v)
                if v in pending:
                    distance_matrix[start][v] = depth + 1
                    distance_matrix[v][start] = depth + 1
                    pending.discard(v)
                queue.append((v, depth + 1))

    # Flatten to one value per unordered pair.
    distances = [
        distance_matrix[u][v]
        for u in vertices_list
        for v in vertices_list
        if u > v
    ]
    # default=0 covers a single input vertex, whose row has no entries.
    eccentricities = {
        u: max(row.values(), default=0) for u, row in distance_matrix.items()
    }

    return (distance_matrix, eccentricities, distances)


### Diameter, radius, percentile from vertices

In [169]:
component = list(connectivity_components[max_connectivity_component_index])
# Estimates from 500 uniformly sampled vertices of the largest component;
# the values stay 0 when the sample cannot be drawn.
diameter_from_random_500 = 0
radius_from_random_500 = 0
percentile_90_from_random_500 = 0
# The sample is drawn from `component`, so its size (not n) must be checked:
# random.sample raises ValueError when asked for more items than available.
if len(component) >= 500:
    random_500_vertices = sorted(random.sample(component, 500))
    random_500_matrix, random_500_eccentricities, random_500_distances = calculate_matrix(
        random_500_vertices)
    diameter_from_random_500 = max(random_500_eccentricities.values())
    radius_from_random_500 = min(random_500_eccentricities.values())
    percentile_90_from_random_500 = np.percentile(random_500_distances, 90)
    print(f"{diameter_from_random_500 = }, {radius_from_random_500 = }, {percentile_90_from_random_500 = }")

def snowball(limit, component):
    """Grow a vertex sample outwards from two random seed vertices.

    Starting from two vertices drawn from ``component``, neighbours (read from
    the global ``graph_static``) are added wave by wave until the sample holds
    ``limit`` vertices or no new vertex can be reached.

    Args:
        limit (int): Maximum sample size. The two seeds are always kept, so a
            limit below 2 still yields two vertices.
        component: Sequence of vertex ids to draw the seeds from.

    Returns:
        list: The sampled vertex ids in ascending order.

    Raises:
        ValueError: If ``component`` has fewer than two vertices
            (raised by ``random.sample``).
    """
    vertices = set(random.sample(component, 2))
    frontier = list(vertices)
    # Stop when the sample is full or the last wave found nothing new;
    # the second condition prevents an endless loop on small components.
    while frontier and len(vertices) < limit:
        next_frontier = []
        for v in frontier:
            for neighbor in graph_static[v]:
                if len(vertices) >= limit:
                    break  # never exceed the requested sample size
                if neighbor not in vertices:
                    vertices.add(neighbor)
                    next_frontier.append(neighbor)
        frontier = next_frontier
    return sorted(vertices)

# Estimates from a snowball sample of the largest component; the values stay
# 0 when the component is too small for the sample.
diameter_from_snowball_500 = 0
radius_from_snowball_500 = 0
percentile_90_from_snowball_500 = 0
# The sample is grown inside `component`, so its size (not n) must be checked.
if len(component) >= 500:
    snowball_500_vertices = snowball(500, component)
    snowball_500_matrix, snowball_500_eccentricities, snowball_500_distances = calculate_matrix(
        snowball_500_vertices)
    diameter_from_snowball_500 = max(snowball_500_eccentricities.values())
    radius_from_snowball_500 = min(snowball_500_eccentricities.values())
    percentile_90_from_snowball_500 = np.percentile(snowball_500_distances, 90)
    print(f"{diameter_from_snowball_500 = }, {radius_from_snowball_500 = }, {percentile_90_from_snowball_500 = }")


diameter_from_random_500 = 10, radius_from_random_500 = 6, percentile_90_from_random_500 = 6.0


In [170]:
# 1.2 -- exact values from the all-pairs BFS (label typo "raduis" corrected)
print("diameter = %i, radius = %i, percentile_90 = %i"
      % (diameter, radius, percentile_90))

diameter = 11, raduis = 6, percentile_90 = 6


## Task 1.3

In [171]:
component = list(connectivity_components[max_connectivity_component_index])

# Local clustering coefficient of every vertex in the largest component.
Cl = {}
for u in component:
    u_neighbors = graph_static[u]
    degree = len(u_neighbors)
    if degree >= 2:
        # An edge between two neighbours is seen from both of its endpoints,
        # so this sum is twice the number of such edges.
        Lu_doubled = sum(
            len(graph_static[neighbor] & u_neighbors) for neighbor in u_neighbors
        )
        Cl[u] = Lu_doubled / (degree * (degree - 1))
    else:
        # Fewer than two neighbours: there is no neighbour pair to connect.
        Cl[u] = 0

Cl_average = sum(Cl.values()) / len(Cl)

In [172]:
# 1.3 -- average local clustering coefficient
print("Cl_average = %f" % Cl_average)

Cl_average = 0.000000


## Task 1.4

In [173]:
# R1, R2, R3: sums of the first, second and third powers of the vertex degrees.
# Re: sum of ku * kv over all ordered pairs of adjacent vertices (u, v).
R1 = R2 = R3 = Re = 0
for u in V:
    u_adjacent = graph_static[u]
    ku = len(u_adjacent)
    R1 += ku
    R2 += ku * ku
    R3 += ku * ku * ku
    Re += ku * sum(len(graph_static[v]) for v in u_adjacent)
degree_associativity = (Re * R1 - R2**2) / (R3 * R1 - R2**2)

In [174]:
# 1.4 -- the coefficient computed above is the degree assortativity (label corrected)
print("Degree assortativity = %f" % (degree_associativity))

Degree associativity = -0.191557


### Preparing data structures

In [175]:
# Adjacency as nested dicts: t_matrix[u][v] == 1 for every edge between u and v
# (stored in both directions); self-loops are skipped.
t_matrix = {u: {} for u in V}

for _index, src, dst in graph.itertuples():
    if src != dst:
        t_matrix[src][dst] = 1
        t_matrix[dst][src] = 1
    

### Static features functions definitions

In [176]:
def get_static_topological_features(u, v):
    """Compute neighbourhood-based features for the vertex pair (u, v).

    Neighbourhoods are read from the global ``t_matrix``.

    Args:
        u: First vertex id (a key of ``t_matrix``).
        v: Second vertex id (a key of ``t_matrix``).

    Returns:
        tuple: ``(CN, AA, JC, PA)`` where

        * ``CN`` is the number of common neighbours;
        * ``AA`` is the sum of ``1 / log(degree(z))`` over common neighbours z;
        * ``JC`` is ``CN`` divided by the size of the union of both
          neighbourhoods (``0.0`` when both vertices have no neighbours);
        * ``PA`` is the product of the two degrees.

    Raises:
        KeyError: If ``u`` or ``v`` is not a key of ``t_matrix``.
    """
    gamma_u = set(t_matrix[u])
    gamma_v = set(t_matrix[v])

    common = gamma_u & gamma_v
    union_size = len(gamma_u | gamma_v)

    CN_static = len(common)
    # Two vertices without neighbours have an empty union; report 0 instead of
    # dividing by zero.
    JC_static = CN_static / union_size if union_size else 0.0
    PA_static = len(gamma_u) * len(gamma_v)
    # For distinct u and v every common neighbour is adjacent to both of them,
    # so its degree is at least 2 and the logarithm is non-zero.
    AA_static = sum(1.0 / np.log(len(t_matrix[z])) for z in common)
    return CN_static, AA_static, JC_static, PA_static

In [177]:
# Feature vector for the vertex pair (1, 2). Both ids must be keys of t_matrix,
# otherwise the lookup inside the function raises KeyError.
feature = get_static_topological_features(1, 2)

## Writing results down

In [180]:
# Report for the current dataset, one list entry per output line. The labels
# are written exactly as given here; entries starting with "\n" put a blank
# line before a new task section.
lines = [
    "Task 1.1",
    f"Число вершин: |V| = {n}",
    f"Число ребер: |E| = {E_count}",
    f"Плотность: p = {density}",
    f"Число компонент слабой связности: {len(connectivity_components)}",
    f"Доля вершин в максимальной по мощности компоненте слабой связности: {proportion}",
    "\nTask 1.2",
    f"Оценки для 500 вершин (случайные вершины): {diameter_from_random_500 = }, {radius_from_random_500 = }, {percentile_90_from_random_500 = }",
    f"Оценки для 500 вершин (снежный ком): {diameter_from_snowball_500 = }, {radius_from_snowball_500 = }, {percentile_90_from_snowball_500 = }",
    # Exact values exist only when the all-pairs BFS ran (n < n_limit); otherwise a fixed notice is written instead.
    f"Вычисленные значения: {diameter = }, {radius = }, {percentile_90 = }" if n < n_limit else "Для этого датасета диаметр, радиус и 90-ый перцентиль будут вычисляться долго",
    "\nTask 1.3",
    f"Cредний кластерный коэффициент сети: {Cl_average = }",
    "\nTask 1.4",
    f"Коэффициент корреляции Пирсона: {degree_associativity}",
    "\nTask 2",
    f"Feature Vector [CN AA JC PA]: {feature}"
]
# Persist the report to test_results/dataset_<name>.txt for the selected dataset.
write_results(lines)