# TP1 (Student version)

We can use the following libraries.

In [None]:
import matplotlib.pyplot as plt
import math
import sys
import json
# Directory (relative to the working directory) that holds the graph files.
path = "graphs/"
# Edge-list files of the datasets used throughout the notebook.
amazon = f"{path}com-amazon.ungraph.txt"
lj = f"{path}com-lj.ungraph.txt"
core = f"{path}email-Eu-core.txt"
ebel = f"{path}email_data_ebel.txt"

## Exercise 1: get things started

### Question 1

Manually create a few small graphs (approximately a dozen nodes each) and store each of them in a text file as an edge list, with one edge per line in the format:

x y 

You will use them to test your code.

### Question 2

Download the following graphs:

http://snap.stanford.edu/data/email-Eu-core.html

http://snap.stanford.edu/data/com-Amazon.html

http://snap.stanford.edu/data/com-LiveJournal.html

Also, download the graph email_data_ebel.txt from http://lioneltabourier.fr/teaching_en.html

All these graphs allow you to check the results of your programs.


### Question 3

Make a program which reads a graph from a text file and counts its number of nodes and edges, without storing the graph in memory. If the same link appears several times, it is counted once per occurrence.

In [None]:
def node_link(file_name):
    '''
    node_link(file_name)
    
    Count the nodes and the links of the graph stored in 'file_name'. The file is read line by line:
    only the set of node identifiers is kept in memory, the links themselves are never stored.
    Blank lines and comment lines (first token starting with '#') are ignored.
        Parameters:
            file_name (string) : the name of the file storing the graph, one link "x y" per line
        Returns:
            node_count, link_count (int, int) : a tuple containing the number of distinct nodes and the 
            number of links in the graph (a link that appears several times is counted every time)
    '''
    node_set = set()
    link_count = 0
    with open(file_name, "r") as my_file:
        for line in my_file:
            fields = line.split() # supposing whitespace between two nodes
            # a blank line splits into an empty list: skip it instead of indexing into it
            if not fields or fields[0].startswith("#"):
                continue
            # supposing that nodes are integers; adding to a set ignores nodes already seen
            node_set.add(int(fields[0]))
            node_set.add(int(fields[1]))
            link_count += 1 # if a same link appears several times, it is counted as many times
    return len(node_set), link_count


In [None]:
# Manual check: the guard lets it run only when __name__ is '__main__' and no '__file__' global exists.
if __name__ == '__main__' and '__file__' not in globals():
    # should be 1005 nodes, 25571 links
    node_count, link_count = node_link(core)
    print(f"Node : {node_count}, Link : {link_count}")

In [None]:
# Manual check: the guard lets it run only when __name__ is '__main__' and no '__file__' global exists.
if __name__ == '__main__' and '__file__' not in globals():
    # should be 334863 nodes, 925872 links
    node_count, link_count = node_link(amazon)
    print(f"Node : {node_count}, Link : {link_count}")

In [None]:
# Manual check: the guard lets it run only when __name__ is '__main__' and no '__file__' global exists.
if __name__ == '__main__' and '__file__' not in globals():
    # should be 3997962 nodes and 34681189 links
    node_count, link_count = node_link(lj)
    print(f"Node : {node_count}, Link : {link_count}")

### Question 4

Make a program which computes the degree (i.e. the number of incident edges) of a given node of a graph, without storing the graph in memory. If the same link appears several times, each occurrence increases the degree by one. A self-loop increases the degree by one only.

In [None]:
def degree(file_name, my_node):
    '''
    degree(file_name, my_node)
    
    Returns the degree of the node 'my_node' in the graph stored in 'file_name'. The file is read
    line by line and the graph is never stored in memory. Blank lines and comment lines (first
    token starting with '#') are ignored.
        Parameters:
            file_name (string) : the name of the file that stores the graph, one link "x y" per line
            my_node (integer) : the node which degree will be computed
        Returns:
            my_degree (integer) : the degree of the node 'my_node'; a link that appears several
            times counts every time, a self-loop counts once
    '''
    my_degree = 0
    with open(file_name, "r") as my_file:
        for line in my_file:
            fields = line.split()
            # a blank line splits into an empty list: skip it instead of indexing into it
            if not fields or fields[0].startswith("#"):
                continue
            node1 = int(fields[0])
            node2 = int(fields[1])
            # one increment per line touching 'my_node', so a self-loop is counted once
            if my_node in (node1, node2):
                my_degree += 1
    return my_degree


In [None]:
# Manual check of degree() on the email-Eu-core file, for node 1.
# The guard lets it run only when __name__ is '__main__' and no '__file__' global exists.
if __name__ == '__main__' and '__file__' not in globals() :
    my_degree = degree(core, 1) 
    print(f"Degree : {my_degree}")

In [None]:
# Manual check of degree() on the com-amazon file, for node 1.
# The guard lets it run only when __name__ is '__main__' and no '__file__' global exists.
if __name__ == '__main__'  and '__file__' not in globals():
    my_degree = degree(amazon, 1) 
    print(f"Degree : {my_degree}")

In [None]:
# Manual check of degree() on the com-lj file, for node 1.
# The guard lets it run only when __name__ is '__main__' and no '__file__' global exists.
if __name__ == '__main__'  and '__file__' not in globals():
    my_degree = degree(lj, 1) 
    print(f"Degree : {my_degree}")

## Exercise 2: loading a graph in memory

### Question 5

Make a program which reads a graph from a text file and loads it as a Python dictionary of lists. 
This implementation of the adjacency list format is the standard format that we will use to store a graph in this course.

In [None]:
def add_link(my_link, my_graph):
    '''
    add_link((node1, node2), my_graph)
    
    Insert the undirected link ('node1', 'node2') in 'my_graph', which is modified in place:
    each endpoint is appended to the neighbour list of the other one.
        Parameters:
            (node1, node2) (int, int) : the link that will be added
            my_graph (dictionary of lists) : the graph that will contain the link
        Returns:
            None
    '''
    first, second = my_link
    # setdefault creates the neighbour list the first time a node is met
    my_graph.setdefault(first, []).append(second)
    my_graph.setdefault(second, []).append(first)
def load_graph(file_name):
    '''
    load_graph(file_name)
    
    Load a graph from a text file in memory as a dictionary of lists. Blank lines and comment
    lines (first token starting with '#') are ignored. Every remaining line is passed to
    add_link, so duplicated links and self-loops are stored as they appear in the file.
        Parameters:
            file_name (string) : the name of the file storing the graph, one link "x y" per line
        Returns:
            my_graph (dictionary of lists) : the graph from 'file_name' stored in the memory as a 
            dictionary of lists; the keys of the dictionary are the nodes and the values are the 
            neighbours of the nodes
    '''
    my_graph = {}
    with open(file_name, "r") as my_file:
        for line in my_file:
            fields = line.split()
            # a blank line splits into an empty list: skip it instead of indexing into it
            if not fields or fields[0].startswith("#"):
                continue
            add_link((int(fields[0]), int(fields[1])), my_graph)
    return my_graph
                
    

In [None]:
# Manual check: load the email-Eu-core graph and pretty-print its adjacency lists as JSON.
# The guard lets it run only when __name__ is '__main__' and no '__file__' global exists.
if __name__ == '__main__'  and '__file__' not in globals():
    core_graph = load_graph(core)
    print(json.dumps(core_graph, indent = 4))

### Question 6

Make a program that deletes the self-loops and duplicated edges existing in the graph and writes it in a new text file. 

In [None]:
def load_clean_graph(input_name, output_name = None):
    '''
    load_clean_graph(input_name, output_name = None)
    
    Load a graph without its self-loops and duplicated edges and, optionally, write the cleaned 
    graph in a new text file. A link is a duplicate whatever the order of its two nodes 
    ("x y" and "y x" are the same link). Blank lines and comment lines (first token starting 
    with '#') are ignored.
        Parameters:
            input_name (string) : name of the file storing the graph with self loops and duplicated edges
            output_name (string) (optional) : name of the file that will store the graph without self 
            loops and duplicated edges, one line "x y" per link, each link written exactly once. 
            If no output is defined the new graph will not be stored in a file
        Returns:
            my_graph (dictionary of lists) : the graph without self loops and duplicated edges
    '''
    my_graph = {}
    # links already stored, as (smaller node, larger node): constant-time duplicate detection,
    # at the cost of one extra tuple per distinct link
    seen_links = set()
    output_file = open(output_name, "w") if output_name is not None else None
    try:
        with open(input_name, "r") as input_file:
            for line in input_file:
                fields = line.split()
                # a blank line splits into an empty list: skip it instead of indexing into it
                if not fields or fields[0].startswith("#"):
                    continue
                node1 = int(fields[0])
                node2 = int(fields[1])
                if node1 == node2: # self-loop
                    continue
                link = (node1, node2) if node1 < node2 else (node2, node1)
                if link in seen_links: # duplicated edge, in either direction
                    continue
                seen_links.add(link)
                my_graph.setdefault(node1, []).append(node2)
                my_graph.setdefault(node2, []).append(node1)
                # written once per stored link, including links whose nodes are met for the first time
                if output_file is not None:
                    output_file.write(f"{node1} {node2}\n")
    finally:
        # close the output even if reading or parsing the input fails
        if output_file is not None:
            output_file.close()
    return my_graph
    

In [None]:
# Manual check: load the cleaned email-Eu-core graph (no output file) and pretty-print it as JSON.
# The guard lets it run only when __name__ is '__main__' and no '__file__' global exists.
if __name__ == '__main__'  and '__file__' not in globals():
    print(json.dumps(load_clean_graph(core), indent = 4))

### Question 7

Try the data structure of Question 5 on various graphs downloaded in Question 2. Conclude on the scalability (what graph size can you handle with this data structure).

## Exercise 3: degree distribution

### Question 8
Create a program which computes the degree distribution of a graph and stores it in a Python dictionary of the form:

deg: number of occurrences

In [None]:
def degree_dist(my_graph):
    '''
    degree_dist(my_graph)
    
    Build the degree distribution of a graph, the degree of a node being the length of its 
    neighbour list
        Parameters:
            my_graph (dictionary of lists) : the graph, nodes as keys and neighbour lists as values
        Returns:
            my_degree_dist (dictionary) : a dictionary which keys are the degrees that appear in the 
            graph and which values are the number of nodes having that degree
    '''
    histogram = {}
    for neighbours in my_graph.values():
        size = len(neighbours)
        # a degree met for the first time starts from 0
        histogram[size] = histogram.get(size, 0) + 1
    return histogram

In [None]:
# Manual check of degree_dist() on the cleaned email-Eu-core graph.
# The guard lets it run only when __name__ is '__main__' and no '__file__' global exists.
if __name__ == '__main__'  and '__file__' not in globals():
    # an expression nested in an 'if' body is not echoed as the cell result: print it explicitly
    print(degree_dist(load_clean_graph(core)))

### Question 9

Plot the degree distribution in log scale (using matplotlib for example).

In [None]:
def plot_degree_dist(my_graph, log = True, limits = (0.5, 10000, 0.5, 10000), my_label = "Label"):
    '''
    plot_degree_dist(my_graph, log = True, limits = (0.5, 10000, 0.5, 10000), my_label = "Label")
    
    Draw the degree distribution of a graph as a scatter plot on the current matplotlib figure 
    (degrees on the x axis, number of occurrences on the y axis) and add a legend
        Parameters:
            my_graph (dictionary of lists) : the graph which degree distribution is plotted
            log (boolean) (default = True) : when True, both axes are set to log scale and 'limits' 
            is applied; when False, the scales and limits of the axes are left untouched
            limits (xmin, xmax, ymin, ymax) (default = (0.5, 10000, 0.5, 10000)) : the limits of the 
            plot, only used when 'log' is True
            my_label (string) (default = "Label") : the legend label of the plotted points
        Returns:
            None
    '''
    distribution = degree_dist(my_graph)
    x_low, x_high, y_low, y_high = limits
    if log:
        plt.xscale('log')
        plt.yscale('log')
        plt.xlim([x_low, x_high])
        plt.ylim([y_low, y_high])

    degrees = distribution.keys()
    occurrences = distribution.values()
    plt.scatter(degrees, occurrences, label = my_label)
    plt.legend()

In [None]:
# Plot the degree distribution of the cleaned email_data_ebel graph, log scale passed explicitly.
# The guard lets it run only when __name__ is '__main__' and no '__file__' global exists.
if __name__ == '__main__'  and '__file__' not in globals():

    plot_degree_dist(load_clean_graph(ebel), True)

In [None]:
# Plot the degree distribution of the cleaned email-Eu-core graph with the default settings.
# The guard lets it run only when __name__ is '__main__' and no '__file__' global exists.
if __name__ == '__main__'  and '__file__' not in globals():
    plot_degree_dist(load_clean_graph(core))

In [None]:
# Plot the degree distribution of the cleaned com-amazon graph with the default settings.
# The guard lets it run only when __name__ is '__main__' and no '__file__' global exists.
if __name__ == '__main__'  and '__file__' not in globals():

    plot_degree_dist(load_clean_graph(amazon) )