In [1]:
import networkx as nx
import pandas as pd
import numpy as np


In [2]:
import os

path = "Data"
csv_list = os.listdir(path)

In [3]:
def read_csv(name:str)->pd.DataFrame:
    cols = ["id","author","title"]
    if name == "out-dblp_proceedings.csv":
        cols = ["id","editor","title"]
        
    df = pd.read_csv(
    "Data/" + name,
    delimiter=";",
    usecols=cols,
    nrows=500
    )
    df.name = name.split(".")[0]
    df.rename(columns={"editor":"author"}, inplace=True)
    return df


df_list = list()
for csv in csv_list:
    df_list.append(
        read_csv(csv)
    )
    df_list[-1].dropna(inplace = True)


In [4]:
def create_graph(df:pd.DataFrame)->nx.Graph:
    G = nx.Graph()
    for publication_id, row in df.iterrows():
        authors = row["author"].split("|")
        title = row["title"]
        G.add_node(publication_id, bipartite = 0, title=title, authors_counter = len(authors))
        for author in authors:
            G.add_node(author, bipartite = 1)
            G.add_edge(publication_id,author)
    return G

graph_list = list()
for df in df_list:
    graph_list.append(
        create_graph(df)
    )

In [5]:
# Pubblicazione con maggior numero di autori:

def get_publication_with_max_authors(G:nx.Graph)->tuple[str, list[str], int]:
    publication_ids = {n for n, d in G.nodes(data=True) if d["bipartite"] == 0}
    publication_ids = list(publication_ids)

    authors_counter_array = np.array(
        list(map(lambda id: G.nodes[id]["authors_counter"], publication_ids))
        )

    max_authors_pubication_id = publication_ids[authors_counter_array.argmax()]

    authors = G.neighbors(max_authors_pubication_id)
    title = G.nodes[max_authors_pubication_id]["title"]

    return (
        title,
        list(authors),
        G.nodes[max_authors_pubication_id]['authors_counter']
    )

for idx, G in enumerate(graph_list):
    title, authors, counter = get_publication_with_max_authors(G)
    print(f"-------------Graph: {df_list[idx].name}--------------")
    print(f"{title} \n {list(authors)} \n wich has {counter} authors\n")


-------------Graph: out-dblp_article--------------
Bringing Semantics to Web Services with OWL-S. 
 ['David L. Martin 0001', 'Deborah L. McGuinness', 'Drew V. McDermott', 'Evren Sirin', 'Katia P. Sycara', 'Mark H. Burstein', 'Massimo Paolucci 0001', 'Naveen Srinivasan', 'Sheila A. McIlraith'] 
 wich has 9 authors

-------------Graph: out-dblp_book--------------
A Framework of Information System Concepts (The FRISCO Report). 
 ['Alexander A. Verrijn-Stuart', 'Björn E. Nilsson', 'Colette Rolland', 'Eckhard D. Falkenberg', 'Frans Van Assche', 'J. L. Han Oei', 'Klaus Voss', 'Paul Lindgreen', 'Ronald K. Stamper', 'Wolfgang Hesse'] 
 wich has 10 authors

-------------Graph: out-dblp_incollection--------------
CoolEmAll: Models and Tools for Planning and Operating Energy Efficient Data Centres. 
 ['Andrew Donoghue', 'Ariel Oleksiak', 'Daniel Rathgeb', 'Enric Pages', 'Eugen Volk', 'Georges Da Costa', 'Georgina Gallizo', 'Jaume Salom', 'Jean-Marc Pierson', 'Jochen Buchholz', 'Lara Lopez', 'Laur

In [6]:
# Autore con maggior numero di collaboratori

def get_author_with_most_collaborotors(G:nx.Graph)->tuple:
    authors = {n for n, d in G.nodes(data=True) if d["bipartite"] == 1}
    authors = list(authors)

    max = {"author": "None","collaborators":list()}
    for author in authors:
        collaborators = set()
        publication_ids = [publication_id[1] for publication_id in list(G.edges(author))]
        for publication_id in publication_ids:
            collaborators.update(
                [publication_id[1] for publication_id in list(G.edges(publication_id))]
            )
        collaborators = list(collaborators)
        collaborators.remove(author)
        if len(collaborators)  > len(max["collaborators"]):
            max["author"] = author
            max["collaborators"] = collaborators
    return(
        max['author'],
        len(max['collaborators']),
        max['collaborators']
    )

for idx, G in enumerate(graph_list):
    author, count, collaborators = get_author_with_most_collaborotors(G)
    print(f"-------------Graph: {df_list[idx].name}--------------")
    print(f"The author with most collaborators is {author}, with {count} collaborators:\n{collaborators}\n")




-------------Graph: out-dblp_article--------------
The author with most collaborators is Jean-Michel Knippel, with 8 collaborators:
['Jules Nyssen', 'Jean-Luc Massat', 'Christophe Bisière', 'Edmond Bianco', 'Mohamed Tayeb Laskri', 'Clyde Chabot', 'Denis Iwanesko', 'Renaud Litré']

-------------Graph: out-dblp_book--------------
The author with most collaborators is Alexander A. Verrijn-Stuart, with 9 collaborators:
['Ronald K. Stamper', 'Colette Rolland', 'Wolfgang Hesse', 'Eckhard D. Falkenberg', 'Klaus Voss', 'J. L. Han Oei', 'Paul Lindgreen', 'Frans Van Assche', 'Björn E. Nilsson']

-------------Graph: out-dblp_incollection--------------
The author with most collaborators is Tuncer I. Ören, with 37 collaborators:
['Umang Kant', 'Umut Durak', 'Laurent Capocchi', 'Yang Liu', 'Paul A. Fishwick', 'Lin Zhang 0009', 'Maâmar El-Amine Hamri', 'Alison Harper', 'Rhys Goldstein', 'Nico Formanek', 'Azam Khan', 'Valdemar Vicente Graciano Neto', 'Cláudio Gomes 0001', 'Thorsten Pawletta', 'Raymond

In [7]:
def get_largest_connected_component(G:nx.Graph)->nx.Graph:
    return G.subgraph(
    sorted(nx.connected_components(G), key = len, reverse=True)[0]
    ).copy()
    
def find_farther_node(G,starting_node:str)->list:
    edges = nx.bfs_edges(G,starting_node)
    edges = [starting_node] + [v for u, v in edges]
    return list(G.edges(edges[-1]))[0][0]

def two_sweep_path(G:nx.Graph,starting_node:str)->list:
    a = find_farther_node(G,starting_node)
    b = find_farther_node(G,a)
    return nx.shortest_path(G,a,b)

def calculate_starting_node(G: nx.Graph, method: str = "random") -> str:
    lcc = get_largest_connected_component(G)

    if method == "random":
        starting_node = list(lcc)[0]
    elif method == "2-sweep":
        starting_node = two_sweep_path(lcc, list(lcc)[0])
        median_idx = len(starting_node) // 2
        starting_node = starting_node[median_idx]
    else:
        raise ValueError("Metodo non valido. Usare 'random' o '2-sweep'.")

    return starting_node

In [15]:
# Calcolo del Diametro

def calculate_B_i(G:nx.Graph, nodes:dict, i:int):
    F = list()
    for key in nodes.keys():
        if nodes[key] == i:
            F.append(key)
    B_i = 0
    for node in F:
        max = nx.eccentricity(G, v=node)
        if max > B_i:
            B_i = max
    return B_i

def calculate_B_i_array(G,nx.Graph,i:int):
    nodes_ecc = nx.eccentricity(G)
    B_i_array = np.empty(shape=i, dtype=int)
    for idx in range(i):
        B_i_array[idx] = calculate_B_i(G,nodes,i)
    return B_i_array


def iFub(G:nx.Graph,node:str)-> int:
    lcc = get_largest_connected_component(G)
    i = nx.eccentricity(lcc,v=node)

    lb = i
    ub = 2*lb
    
    B_i = calculate_B_i_array(lcc,node,i)
    
    while ub > lb:
        B_i = calculate_B_i(lcc,node,i)
        max = np.max([lb,B_i])
        if max > 2*(i-1):
            return max
        else:
            lb = max
            ub = 2*(i-1)
        i=i-1
    return lb

for idx, G in enumerate(graph_list):
    diameter = iFub(G, calculate_starting_node(G,method="2-sweep"))
    #nx_diameter = nx.diameter(get_largest_connected_component(G))
    print(f"Graph {df_list[idx].name} has diameter: {diameter}")
    #print(f"NetworkX diameter is {nx_diameter}\n")



Graph out-dblp_article has diameter: 4
Graph out-dblp_book has diameter: 1
Graph out-dblp_incollection has diameter: 4
Graph out-dblp_inproceedings has diameter: 5
Graph out-dblp_mastersthesis has diameter: 1
Graph out-dblp_phdthesis has diameter: 1
Graph out-dblp_proceedings has diameter: 2


In [None]:
def concatenate_DataFrame_from_list(df_list:list[pd.DataFrame])->pd.DataFrame:
    df = pd.concat(
        df_list,
        axis=0,
        ignore_index=True
    )
    df.drop_duplicates(
        subset='title',
        keep='first',
        inplace=True
    )
    return df

def build_union_graph_from_DataFrame_list(df_list:list[pd.DataFrame])->nx.Graph:
    return create_graph(
        concatenate_DataFrame_from_list(df_list)
    )



In [None]:
G = build_union_graph_from_DataFrame_list(df_list)

In [None]:
def answer_to_all_main_questions(G:nx.Graph,name:str)->None:
    print(f"-------------Graph: {name}--------------\n")

    title, authors, counter = get_publication_with_max_authors(G)
    print("The publication with most authors is:")
    print(f"{title} \n wich has {counter} authors\n")

    diameter = iFub(G, calculate_starting_node(G))
    #nx_diameter = nx.diameter(get_largest_connected_component(G))
    print(f"The graph has exact diameter: {diameter}")
    #print(f"Diameter calculated with NetworkX is: {nx_diameter}\n")

    author, count, collaborators = get_author_with_most_collaborotors(G)
    print(f"The author with most collaborators is {author}, with {count} collaborators")

In [None]:
answer_to_all_main_questions(
    G,
    "Union"
)

In [None]:
import itertools

def build_authors_graph_from_DataFrame_list(df_list:list[pd.DataFrame])->nx.Graph:
    df = concatenate_DataFrame_from_list(df_list)
    df = df["author"]
    G = nx.Graph()
    for authors in df:
        authors_list = authors.split("|")


        
        for author_comb in itertools.combinations(authors_list,2):
            if G.has_edge(*author_comb):
                G[author_comb[0]][author_comb[1]]["weight"] += 1
            else:
                G.add_edge(*author_comb,weight = 1)
    return G

def find_most_collaborating_couple(G:nx.Graph)->tuple[str,str,dict[int]]:
    return  sorted(G.edges(data=True),key= lambda x: x[2]['weight'],reverse=True)[0]

author_1, author_2, weight = find_most_collaborating_couple(
    build_authors_graph_from_DataFrame_list(df_list)
    )

print(f"The most collaborating authors are {author_1} and {author_2} with {weight['weight']} collaborations togheter ")