# Lab2 (Student version)

In [None]:
import random
import sys
import time
from collections import deque

import matplotlib.pyplot as plt

In [None]:
# Directory holding the downloaded SNAP edge-list files, followed by the
# location of each dataset inside it.
path = "graphs/"
amazon = f"{path}com-amazon.ungraph.txt"
lj = f"{path}com-lj.ungraph.txt"
core = f"{path}email-Eu-core.txt"

Download the three following graphs:
- http://snap.stanford.edu/data/email-Eu-core.html
- http://snap.stanford.edu/data/com-Amazon.html
- http://snap.stanford.edu/data/com-LiveJournal.html

It is also useful to consider some toy graphs (e.g. manually created graphs with a dozen nodes) to test your programs.

## Exercise 0: preliminaries

Using the code from Lab1, load the graphs in memory as dictionaries of lists and check their numbers of nodes and links.

In [None]:
# Importing the functions of first TP
def node_link(file_name):
    '''
    node_link(file_name)

    Returns the number of nodes and the number of links in the graph stored in 'file_name'
        Parameters:
            file_name (string) : the name of the file storing the graph, one link per line written as two
            integer node ids separated by whitespace
        Returns:
            node_count, link_count (int, int) : a tuple containing the number of nodes and the number of links in
            the graph
    '''
    node_set = set()
    link_count = 0
    with open(file_name, "r") as my_file:
        for line in my_file:
            fields = line.split()  # supposing whitespace between two nodes
            # Blank lines and comment lines (starting with a '#' symbol) carry no link
            if not fields or fields[0].startswith("#"):
                continue
            # supposing that nodes are numbers formatted in file
            node_set.add(int(fields[0]))
            node_set.add(int(fields[1]))
            link_count += 1  # if a same link appears several times, it will be counted as many times
    # The set already holds every distinct node, so its size is the node count
    return len(node_set), link_count
def load_clean_graph(input_name, output_name=None):
    '''
    load_clean_graph(input_name, output_name = None)

    Loads the graph stored in 'input_name' without its self-loops and duplicated edges and, optionally,
    writes the cleaned graph in a new text file
        Parameters:
            input_name (string) : name of the file storing the graph with self loops and duplicated edges
            output_name (string) (optional) : name of the file that will store the graph without self
            loops and duplicated edges, one line "node1 node2" per kept edge. If no output is defined the
            new graph will not be stored in a file
        Returns:
            my_graph (dictionary of lists) : the graph without self loops and duplicated edges; every edge
            appears in the neighbour lists of both of its endpoints
    '''
    my_graph = {}
    with open(input_name, "r") as input_file:
        # Opened after the input so that a missing input does not leave an empty output file behind
        output_file = open(output_name, "w") if output_name is not None else None
        try:
            for line in input_file:
                fields = line.split()
                # Blank lines and comment lines (starting with '#') carry no edge
                if not fields or fields[0].startswith("#"):
                    continue
                node1 = int(fields[0])
                node2 = int(fields[1])
                if node1 == node2:
                    continue  # self-loop
                neighbours1 = my_graph.setdefault(node1, [])
                # Both directions are always stored together, so checking one side is enough
                if node2 in neighbours1:
                    continue  # duplicated edge
                neighbours1.append(node2)
                my_graph.setdefault(node2, []).append(node1)
                # Every kept edge is written exactly once, including the first edge of a new node
                if output_file is not None:
                    output_file.write(f"{node1} {node2}\n")
        finally:
            # Close the output even if a malformed line raised an exception
            if output_file is not None:
                output_file.close()
    return my_graph

In [None]:
# Runs only when this is the main module and no '__file__' global exists
# (presumably: inside a notebook kernel rather than as a script or import — confirm).
if __name__ == '__main__'  and '__file__' not in globals():
    # Raw counts read straight from the edge-list files (duplicated links included)
    node_count, link_count = node_link(amazon)
    print("Amazon \t", f"Node : {node_count}, Link : {link_count}")
    node_count, link_count = node_link(core)
    print("Core \t", f"Node : {node_count}, Link : {link_count}")
    # LiveJournal is left disabled; NOTE(review): count_node_edge is not defined in the
    # visible file (node_link is) — confirm before re-enabling.
    #count_node_edge(lj)
    # Cleaned adjacency dictionaries, reused by the cells below
    amazon_graph = load_clean_graph(amazon)
    print("Amazon graph loaded")
    core_graph = load_clean_graph(core)
    print("Core graph loaded")

## Exercise 1: BFS

### 1.1 Components

- Implement a BFS algorithm.  

- Use it on each of the graphs to evaluate the size of the largest connected component of these graphs.

Warning: if your BFS is not implemented efficiently, it can take a very long time to run, so if it does not finish on Amazon or LiveJournal within a few minutes, either improve your code or test only on smaller graphs. 

In [None]:
def bfs(my_graph):
    '''
    bfs(my_graph)

    Evaluate the size of the largest connected component of the graph 'my_graph' by going through all nodes of the
    graph and identifying the connected component that contains the node
        Parameters:
            my_graph (dictionary of lists) : every node appearing in a neighbour list must also be a key
        Returns:
            my_lcc (int) : the size of the largest connected component of 'my_graph' (0 for an empty graph)
    '''
    my_cc = {}
    cc_index = 0
    for source in my_graph:
        if source in my_cc:
            continue  # already reached from an earlier source
        # A deque gives O(1) dequeues and a set gives O(1) "already marked" tests,
        # so each traversal is linear in the size of the component
        my_queue = deque([source])
        marked_node = {source}
        while my_queue:
            node1 = my_queue.popleft()
            my_cc[node1] = cc_index
            for node2 in my_graph[node1]:
                if node2 not in marked_node:
                    marked_node.add(node2)
                    my_queue.append(node2)
        cc_index += 1

    # Count how many nodes carry each component index
    cc_sizes = {}
    for cc_label in my_cc.values():
        cc_sizes[cc_label] = cc_sizes.get(cc_label, 0) + 1
    return max(cc_sizes.values(), default=0)
    

In [None]:
# Runs only when this is the main module and no '__file__' global exists
# (presumably: inside a notebook kernel — confirm).
if __name__ == '__main__'  and '__file__' not in globals():
    # bfs() returns the size of the largest connected component
    print(bfs(core_graph))

In [None]:
 if __name__ == '__main__'  and '__file__' not in globals():

    # NOTE(review): the 'if' line above starts with one stray space; presumably the
    # notebook front end tolerates it, but it is worth removing — confirm.
    # The returned component size is not printed or stored here.
    bfs(amazon_graph) # takes too much time

### 1.2 Distances

- Modify the BFS above to have it compute the distance to the source node.

- Using the fact that the diameter is necessarily larger than any distance measured, use your distance computation code to get a lower bound of the diameter. The higher the bound, the better.

In [None]:
def distances(my_graph, source_node):
    '''
    distances(my_graph, source_node)

    Returns the distances of each node to the source node as a dictionary, computed with a BFS
        Parameters:
            my_graph (dictionary of lists) : every node appearing in a neighbour list must also be a key
            source_node (int) : the node from which we will compute the distances
        Returns:
            my_distances : A dictionary which keys are the nodes and the values are the distances from the
            key to 'source_node'; nodes that cannot be reached from 'source_node' keep the value -1
    '''
    # -1 marks "not reached yet"
    my_distances = dict.fromkeys(my_graph, -1)
    my_distances[source_node] = 0
    # A deque dequeues in O(1), where list.pop(0) shifts the whole list
    my_queue = deque([source_node])
    while my_queue:
        node1 = my_queue.popleft()
        next_distance = my_distances[node1] + 1
        for node2 in my_graph[node1]:
            if my_distances[node2] == -1:
                my_distances[node2] = next_distance
                my_queue.append(node2)
    return my_distances

def diameter(my_graph, sample_size):
    '''
    diameter(my_graph, sample_size)

    Compute an approximate diameter (a lower bound) of the graph 'my_graph' by running a bfs algorithm on a
    sample of distinct nodes and taking the max of the distances
        Parameters:
            my_graph (dictionary of lists)
            sample_size (int) : the number of nodes in the graph that will be used to find the diameter; if it
            exceeds the number of nodes, every node is used once
        Returns:
            my_diameter (int) : the approximate maximum distance in the graph, or -1 when no source node was
            sampled (empty graph or non-positive 'sample_size')
    '''
    my_diameter = -1
    nodes = list(my_graph)
    # Never ask for more sources than there are nodes (nor for a negative amount)
    source_count = max(0, min(sample_size, len(nodes)))
    # random.sample draws distinct nodes without removing items from the middle of a list
    for source_node in random.sample(nodes, source_count):
        my_distances = distances(my_graph, source_node)
        my_diameter = max(my_diameter, max(my_distances.values()))
    return my_diameter

In [None]:
# Runs only when this is the main module and no '__file__' global exists
# (presumably: inside a notebook kernel — confirm).
if __name__ == '__main__'  and '__file__' not in globals():
    #print(json.dumps(compute_distance(core_graph, 0),indent = 4))
    # The printed value comes from BFS runs started at 500 sampled source nodes
    print(f"The diameter of the graph 'Core' is {diameter(core_graph, 500)}")

## Exercise 2: Triangles

### 2.1 Raw triangle counting

- Implement a triangle counting algorithm. 

- Test your program on the 3 graphs and report the number of triangles as well as the running time of your program.

In [None]:
def triangle(my_graph):
    '''
    triangle(my_graph)

    Returns the number of triangles in the graph 'my_graph'
        Parameters:
            my_graph (dictionary of lists) : every node appearing in a neighbour list must also be a key
        Returns:
            triangle_count (int) : the number of triangles in the graph
    '''
    # Sets make the "is node3 a neighbour of node2" test O(1) instead of a list scan
    neighbour_sets = {node: set(neighbours) for node, neighbours in my_graph.items()}
    triangle_count = 0
    for node1, neighbours in my_graph.items():
        for node2 in neighbours:
            if node1 < node2:
                neighbours_of_node2 = neighbour_sets[node2]
                for node3 in neighbours:
                    # node1 < node2 < node3 : each triangle is counted once, from its smallest node
                    if node2 < node3 and node3 in neighbours_of_node2:
                        triangle_count += 1
    return triangle_count

In [None]:
# Runs only when this is the main module and no '__file__' global exists
# (presumably: inside a notebook kernel — confirm).
if __name__ == '__main__'  and '__file__' not in globals():
    # perf_counter() is the high-resolution clock meant for measuring elapsed time
    start_time = time.perf_counter()
    triangle_count = triangle(core_graph)
    end_time = time.perf_counter()
    print(f"Triangles : {triangle_count}")
    print(f"Running time : {end_time - start_time} seconds")

In [None]:
# Runs only when this is the main module and no '__file__' global exists
# (presumably: inside a notebook kernel — confirm).
if __name__ == '__main__'  and '__file__' not in globals():
    # perf_counter() is the high-resolution clock meant for measuring elapsed time
    start_time = time.perf_counter()
    triangle_count = triangle(amazon_graph)
    end_time = time.perf_counter()
    print(f"Triangles : {triangle_count}")
    print(f"Running time : {end_time - start_time} seconds")

### 2.2 Transitive ratio

Use this program to compute the transitive ratio of the graphs. Remember that the transitive ratio is defined as 
$$ \frac{3 \times \text{number of triangles}}{\text{number of forks}}$$
and that the number of forks (or connected triples) of a node of degree $d$ is simply $\frac{d(d-1)}{2}$.

In [None]:
def trans_ratio(my_graph):
    '''
    trans_ratio(my_graph)

    Returns the transitive ratio of 'my_graph', i.e. 3 * (number of triangles) / (number of forks)
        Parameters:
            my_graph (dictionary of lists)
        Returns
            transitive_ratio (float) : the transitive ratio of the graph, or 0.0 when the graph has no fork
            (no node of degree 2 or more)
    '''
    fork_count = 0
    for node in my_graph:
        degree_node = len(my_graph[node])
        # d * (d - 1) is always even, so the integer division is exact
        fork_count += degree_node * (degree_node - 1) // 2
    # Without forks there cannot be any triangle; this also avoids a division by zero
    if fork_count == 0:
        return 0.0
    triangle_count = triangle(my_graph)
    transitive_ratio = 3 * triangle_count / fork_count
    return transitive_ratio
    

In [None]:
# Runs only when this is the main module and no '__file__' global exists
# (presumably: inside a notebook kernel — confirm).
if __name__ == '__main__'  and '__file__' not in globals():
    # An expression nested in an 'if' is not echoed by the notebook, so print the result explicitly
    print(f"The transitive ratio of the graph 'Core' is {trans_ratio(core_graph)}")

In [None]:
# Runs only when this is the main module and no '__file__' global exists
# (presumably: inside a notebook kernel — confirm).
if __name__ == '__main__'  and '__file__' not in globals():
    # An expression nested in an 'if' is not echoed by the notebook, so print the result explicitly
    print(f"The transitive ratio of the graph 'Amazon' is {trans_ratio(amazon_graph)}")