# Lab2 (Student version)

In [None]:
import random
import sys
import time
from collections import deque

import matplotlib.pyplot as plt

Download the following three graphs:
- http://snap.stanford.edu/data/email-Eu-core.html
- http://snap.stanford.edu/data/com-Amazon.html
- http://snap.stanford.edu/data/com-LiveJournal.html

It is also useful to consider some toy graphs (e.g. manually created graphs with a dozen nodes) to test your programs.

## Exercise 0: preliminaries

Using the code from Lab1, load the graphs into memory as dictionaries of lists and check their numbers of nodes and links.

In [None]:
def remove_loop_dupes(graph):
    """Clean every adjacency list of ``graph`` in place.

    Each list keeps only the first occurrence of every neighbour (order
    preserved) and loses the node itself, so the graph has neither
    duplicate links nor self-loops afterwards. Nothing is returned.
    """
    for vertex in graph:
        unique_neighbours = dict.fromkeys(graph[vertex])
        graph[vertex] = [other for other in unique_neighbours if other != vertex]

def graph_from_file(file_name):
    """Load an undirected graph from a whitespace-separated edge list.

    Every line made of exactly two integers ``u v`` adds the link in both
    directions. Lines that do not parse that way (comment headers, blank
    lines, ...) are skipped. Duplicate links and self-loops are removed
    before returning.

    Args:
        file_name: path of the edge-list file to read.

    Returns:
        dict mapping each node (int) to the list of its neighbours.

    Raises:
        OSError: if the file cannot be opened.
    """
    graph = {}
    with open(file_name, "r") as graph_file:
        for line in graph_file:
            try:
                node1, node2 = [int(node) for node in line.split()]
            except ValueError:
                # Not an "int int" line: a non-numeric token or the wrong
                # number of fields. Only this error is ignored so that real
                # problems (e.g. KeyboardInterrupt) are no longer swallowed.
                continue
            graph.setdefault(node1, []).append(node2)
            graph.setdefault(node2, []).append(node1)
    remove_loop_dupes(graph)
    return graph


def graph_to_file(graph, file_name):
    """Write ``graph`` to ``file_name`` as one "node neighbour" pair per line.

    Every entry of every adjacency list produces a line, following the
    dictionary order and then the list order; the file is created or
    overwritten.
    """
    with open(file_name, "w") as out:
        out.writelines(
            "{} {}\n".format(source, target)
            for source, targets in graph.items()
            for target in targets
        )
def count_links(graph):
    """Return the number of undirected links in ``graph``.

    The lengths of all adjacency lists are added up and the total is
    halved (integer division), since a link listed at both of its
    endpoints contributes two entries.
    """
    total_degree = sum(len(neighbours) for neighbours in graph.values())
    return total_degree // 2
def compute_degree_dist(graph):
    """Return the degree distribution of ``graph``.

    The result maps each degree that occurs to the number of nodes having
    that degree; degrees are keyed in order of first appearance.
    """
    histogram = {}
    for neighbours in graph.values():
        node_degree = len(neighbours)
        histogram[node_degree] = histogram.get(node_degree, 0) + 1
    return histogram

In [None]:
# Load the email and Amazon datasets as dictionaries of adjacency lists.
email_graph = graph_from_file("res/email-Eu-core.txt")
amazon_graph = graph_from_file("res/com-amazon.ungraph.txt")
# The LiveJournal graph is left disabled here; uncomment to load it as well.
# lj_graph = graph_from_file("res/com-lj.ungraph.txt")

## Exercise 1: BFS

### 1.1 Components

- Implement a BFS algorithm.  

- Use it on each of the graphs to evaluate the size of the largest connected component of these graphs.

Warning: if your BFS is not well coded, it can be very slow, so if it doesn't finish on Amazon or LiveJournal in less than a few minutes, either improve your code, or test only on smaller graphs. 

In [None]:
def bfs(graph, node_start):
    """Run a breadth-first search and return the nodes reached.

    Args:
        graph: dict mapping each node to the list of its neighbours.
        node_start: node the traversal starts from; must be a key of
            ``graph``.

    Returns:
        list of the nodes reachable from ``node_start`` (itself included),
        in the order they were discovered.

    Raises:
        KeyError: if a visited node is not a key of ``graph``.
    """
    queue = deque([node_start])
    # Discovery order is kept in a list for the return value; the set gives
    # O(1) membership tests instead of scanning that list for every link.
    marked = [node_start]
    seen = {node_start}
    while queue:
        node1 = queue.popleft()  # O(1), unlike list.pop(0)
        for node2 in graph[node1]:
            if node2 not in seen:
                seen.add(node2)
                marked.append(node2)
                queue.append(node2)
    return marked

def compute_size_lcc(graph):
    """Return the size (number of nodes) of the largest connected component.

    A BFS is started from every node that no earlier BFS has reached, so
    each component is explored exactly once.

    Args:
        graph: dict mapping each node to the list of its neighbours.

    Returns:
        int: node count of the largest component, or 0 for an empty graph
        (instead of failing on ``max`` of an empty list).
    """
    visited = set()
    largest = 0
    for node in graph:
        if node in visited:
            continue
        component = bfs(graph, node)
        visited.update(component)
        largest = max(largest, len(component))
    return largest
    

In [None]:
# Reload the email graph and evaluate the size of its largest connected
# component.
email_graph = graph_from_file("res/email-Eu-core.txt")
compute_size_lcc(email_graph)

### 1.2 Distances

- Modify the BFS above to have it compute the distance to the source node.

- Using the fact that the diameter is necessarily at least as large as any distance measured, use your distance computation code to get a lower bound of the diameter. The higher the bound, the better.

In [None]:
def compute_distance(graph, node_start):
    """Compute BFS distances from ``node_start`` to every node of ``graph``.

    Args:
        graph: dict mapping each node to the list of its neighbours.
        node_start: source node; must be a key of ``graph``.

    Returns:
        dict mapping every node of ``graph`` to its distance (number of
        links) from ``node_start``; nodes that cannot be reached keep the
        value -1.

    Raises:
        KeyError: if a visited node or one of its neighbours is not a key
            of ``graph``.
    """
    # -1 doubles as the "not visited yet" marker.
    distances = dict.fromkeys(graph, -1)
    distances[node_start] = 0
    queue = deque([node_start])
    while queue:
        node1 = queue.popleft()  # O(1), unlike list.pop(0)
        next_distance = distances[node1] + 1
        for node2 in graph[node1]:
            if distances[node2] == -1:
                distances[node2] = next_distance
                queue.append(node2)
    return distances

def compute_diameter(graph, sample_size=10):
    """Return a lower bound of the diameter of ``graph``.

    A BFS is run from randomly chosen start nodes and the largest distance
    observed is returned. Start nodes are drawn without replacement, so no
    BFS is wasted on a node that was already used.

    Args:
        graph: dict mapping each node to the list of its neighbours.
        sample_size: number of start nodes to try; capped at the number of
            nodes in the graph.

    Returns:
        int: the largest distance found, or 0 when the graph is empty or
        no start node is sampled.
    """
    nodes = list(graph)
    sample_count = max(0, min(sample_size, len(nodes)))
    nodes_start = random.sample(nodes, sample_count)
    # Unreachable nodes are reported as -1 and never win the max, because
    # the start node itself always contributes a distance of 0.
    return max(
        (
            max(compute_distance(graph, node_start).values())
            for node_start in nodes_start
        ),
        default=0,
    )


In [None]:
# Reload the email graph and estimate a lower bound of its diameter from
# 100 sampled start nodes.
email_graph = graph_from_file("res/email-Eu-core.txt")
compute_diameter(email_graph, sample_size=100)

## Exercise 2: Triangles

### 2.1 Raw triangle counting

- Implement a triangle counting algorithm. 

- Test your program on the 3 graphs and report the number of triangles as well as the running time of your program.

In [None]:
def count_triangle(graph):
    """Count the triangles of an undirected graph.

    Each triangle ``a < b < c`` is counted exactly once, from its smallest
    node: ``b`` is a higher-numbered neighbour of ``a`` and ``c`` is a
    higher-numbered neighbour of both ``a`` and ``b``.

    Args:
        graph: dict mapping each node to the list of its neighbours; nodes
            must be mutually comparable (e.g. ints).

    Returns:
        int: number of triangles.

    Raises:
        KeyError: if a neighbour is not itself a key of ``graph``.
    """
    # Keep only the higher-numbered neighbours, as sets, so the "common
    # neighbour" test is a set intersection rather than a list scan in the
    # innermost loop.
    higher = {
        node: {other for other in neighbours if other > node}
        for node, neighbours in graph.items()
    }
    triangle_count = 0
    for successors in higher.values():
        for node2 in successors:
            triangle_count += len(successors & higher[node2])
    return triangle_count

In [None]:
# Reload the email graph, time the triangle count, and report both the
# number of triangles and the elapsed time (the count used to be discarded).
email_graph = graph_from_file("res/email-Eu-core.txt")
start_time = time.perf_counter()  # monotonic clock meant for measuring intervals
triangle_count = count_triangle(email_graph)
elapsed_time = time.perf_counter() - start_time
print("{} triangles".format(triangle_count))
print("{} seconds".format(elapsed_time))

### 2.2 Transitive ratio

Use this program to compute the transitive ratio of the graphs. Remember that the transitive ratio is defined as 
$$ \frac{3 \times \text{number of triangles}}{\text{number of forks}}$$
and that the number of forks (or connected triples) centered on a node of degree $d$ is simply $\frac{d(d-1)}{2}$.

In [None]:
def compute_transitive_ratio(graph):
    """Return the transitive ratio: 3 * triangles / forks.

    A node of degree ``d`` is the centre of ``d * (d - 1) / 2`` forks
    (connected triples); the fork total is obtained from the degree
    distribution.

    Args:
        graph: dict mapping each node to the list of its neighbours.

    Returns:
        float: the transitive ratio, or 0.0 when the graph has no fork
        (no node of degree 2 or more), which would otherwise divide by
        zero.
    """
    degree_dist = compute_degree_dist(graph)
    fork_count = 0
    for degree, node_count in degree_dist.items():
        # Parenthesised on purpose: d * (d - 1), not d * d - 1.
        fork_count += node_count * (degree * (degree - 1) // 2)
    if fork_count == 0:
        return 0.0
    return 3 * count_triangle(graph) / fork_count

In [None]:
# Reload the email graph and compute its transitive ratio.
email_graph = graph_from_file("res/email-Eu-core.txt")
compute_transitive_ratio(email_graph)