In [1]:
import networkx as nx
import pandas as pd
import numpy as np

import os
os.environ["MODIN_ENGINE"] = "ray"
import modin.pandas as pd
import ray
ray.shutdown()
ray.init(runtime_env={'env_vars': {'__MODIN_AUTOIMPORT_PANDAS__': '1'}})


2023-11-06 10:38:44,160	INFO worker.py:1633 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.11.6
Ray version:,2.7.1
Dashboard:,http://127.0.0.1:8265


In [2]:
import os

path = "Data"
csv_list = os.listdir(path)

In [34]:
def read_csv(name:str)->pd.DataFrame:
    cols = ["id","author","title"]
    if name == "out-dblp_proceedings.csv":
        cols = ["id","editor","title"]
        
    df = pd.read_csv(
    "Data/" + name,
    delimiter=";",
    usecols=cols,
    nrows=500
    )
    df.name = name.split(".")[0]
    df.rename(columns={"editor":"author"}, inplace=True)
    return df


df_list = list()
for csv in csv_list:
    df_list.append(
        read_csv(csv)
    )
    df_list[-1].dropna(inplace = True)


In [4]:
def create_graph(df:pd.DataFrame)->nx.Graph:
    G = nx.Graph()
    for publication_id, row in df.iterrows():
        authors = row["author"].split("|")
        title = row["title"]
        G.add_node(publication_id, bipartite = 0, title=title, authors_counter = len(authors))
        for author in authors:
            G.add_node(author, bipartite = 1)
            G.add_edge(publication_id,author)
    return G

graph_list = list()
for df in df_list:
    graph_list.append(
        create_graph(df)
    )

In [31]:
# Pubblicazione con maggior numero di autori:

def get_publication_with_max_authors(G:nx.Graph)->tuple[str, list[str], int]:
    publication_ids = {n for n, d in G.nodes(data=True) if d["bipartite"] == 0}
    publication_ids = list(publication_ids)

    authors_counter_array = np.array(
        list(map(lambda id: G.nodes[id]["authors_counter"], publication_ids))
        )

    max_authors_pubication_id = publication_ids[authors_counter_array.argmax()]

    authors = G.neighbors(max_authors_pubication_id)
    title = G.nodes[max_authors_pubication_id]["title"]

    return (
        title,
        list(authors),
        G.nodes[max_authors_pubication_id]['authors_counter']
    )

for idx, G in enumerate(graph_list):
    title, authors, counter = get_publication_with_max_authors(G)
    print(f"-------------Graph: {df_list[idx].name}--------------")
    print(f"{title} \n {list(authors)} \n wich has {counter} authors\n")


-------------Graph: out-dblp_article--------------
Bringing Semantics to Web Services with OWL-S. 
 ['David L. Martin 0001', 'Deborah L. McGuinness', 'Drew V. McDermott', 'Evren Sirin', 'Katia P. Sycara', 'Mark H. Burstein', 'Massimo Paolucci 0001', 'Naveen Srinivasan', 'Sheila A. McIlraith'] 
 wich has 9 authors

-------------Graph: out-dblp_book--------------
A Framework of Information System Concepts (The FRISCO Report). 
 ['Alexander A. Verrijn-Stuart', 'Björn E. Nilsson', 'Colette Rolland', 'Eckhard D. Falkenberg', 'Frans Van Assche', 'J. L. Han Oei', 'Klaus Voss', 'Paul Lindgreen', 'Ronald K. Stamper', 'Wolfgang Hesse'] 
 wich has 10 authors

-------------Graph: out-dblp_incollection--------------
CoolEmAll: Models and Tools for Planning and Operating Energy Efficient Data Centres. 
 ['Andrew Donoghue', 'Ariel Oleksiak', 'Daniel Rathgeb', 'Enric Pages', 'Eugen Volk', 'Georges Da Costa', 'Georgina Gallizo', 'Jaume Salom', 'Jean-Marc Pierson', 'Jochen Buchholz', 'Lara Lopez', 'Laur

In [28]:
# Autore con maggior numero di collaboratori

def get_author_with_most_collaborotors(G:nx.Graph)->tuple:
    authors = {n for n, d in G.nodes(data=True) if d["bipartite"] == 1}
    authors = list(authors)

    max = {"author": "None","collaborators":list()}
    for author in authors:
        collaborators = set()
        publication_ids = [publication_id[1] for publication_id in list(G.edges(author))]
        for publication_id in publication_ids:
            collaborators.update(
                [publication_id[1] for publication_id in list(G.edges(publication_id))]
            )
        collaborators = list(collaborators)
        collaborators.remove(author)
        if len(collaborators)  > len(max["collaborators"]):
            max["author"] = author
            max["collaborators"] = collaborators
    return(
        max['author'],
        len(max['collaborators']),
        max['collaborators']
    )

for idx, G in enumerate(graph_list):
    author, count, collaborators = get_author_with_most_collaborotors(G)
    print(f"-------------Graph: {df_list[idx].name}--------------")
    print(f"The author with most collaborators is {author}, with {count} collaborators:\n{collaborators}\n")




-------------Graph: out-dblp_article.csv--------------
The author with most collaborators is Jean-Michel Knippel, with 8 collaborators:
['Clyde Chabot', 'Edmond Bianco', 'Mohamed Tayeb Laskri', 'Jules Nyssen', 'Denis Iwanesko', 'Renaud Litré', 'Christophe Bisière', 'Jean-Luc Massat']

-------------Graph: out-dblp_book.csv--------------
The author with most collaborators is Frans Van Assche, with 9 collaborators:
['Björn E. Nilsson', 'Wolfgang Hesse', 'Alexander A. Verrijn-Stuart', 'Paul Lindgreen', 'J. L. Han Oei', 'Colette Rolland', 'Klaus Voss', 'Eckhard D. Falkenberg', 'Ronald K. Stamper']

-------------Graph: out-dblp_incollection.csv--------------
The author with most collaborators is Tuncer I. Ören, with 37 collaborators:
['Navonil Mustafee', 'Alison Harper', 'Balqies Sadoun', 'Bernard P. Zeigler', 'Mohammad S. Obaidat', 'Yuanjun Laili', 'Azam Khan', 'Margaret L. Loper', 'Lance E. Champagne', 'Jean François Santucci', 'Lin Zhang 0009', 'Claudia Szabo', 'Umut Durak', 'Cláudio Gome

In [29]:
# Calcolo del Diametro

def calculate_B_i(G:nx.Graph, u:str, i:int):
    a = nx.eccentricity(G)
    F = list()
    for key in a.keys():
        if a[key] == i:
            F.append(key)
    B_i = 0
    for node in F:
        max = nx.eccentricity(G, v=node)
        if max > B_i:
            B_i = max
    return B_i

def calculate_starting_node(G:nx.Graph)->str:
    lcc = get_largest_connected_component(G)
    return list(lcc)[0]

def get_largest_connected_component(G:nx.Graph)->nx.Graph:
    return G.subgraph(
    sorted(nx.connected_components(G), key = len, reverse=True)[0]
    ).copy()

def iFub(G:nx.Graph,u:str)-> int:
    lcc = get_largest_connected_component(G)
    i = nx.eccentricity(lcc,v=u)

    lb = i
    ub = 2*lb

    while ub > lb:
        B_i = calculate_B_i(lcc,u,i)
        max = np.max([lb,B_i])
        if max > 2*(i-1):
            return max
        else:
            lb = max
            ub = 2*(i-1)
        i=i-1
    return lb

for idx, G in enumerate(graph_list):
    diameter = iFub(G, calculate_starting_node(G))
    nx_diameter = nx.diameter(get_largest_connected_component(G))
    print(f"Graph {df_list[idx].name} has diameter: {diameter}")
    print(f"NetworkX diameter is {nx_diameter}\n")



Graph out-dblp_article.csv has diameter: 7
NetworkX diameter is 7

Graph out-dblp_book.csv has diameter: 2
NetworkX diameter is 2

Graph out-dblp_incollection.csv has diameter: 6
NetworkX diameter is 7

Graph out-dblp_inproceedings.csv has diameter: 9
NetworkX diameter is 10

Graph out-dblp_mastersthesis.csv has diameter: 1
NetworkX diameter is 1

Graph out-dblp_phdthesis.csv has diameter: 1
NetworkX diameter is 1

Graph out-dblp_proceedings.csv has diameter: 4
NetworkX diameter is 4



In [40]:
def concatenate_DataFrame_from_list(df_list:list[pd.DataFrame])->pd.DataFrame:
    df = pd.concat(
        df_list,
        axis=0,
        ignore_index=True
    )
    df.drop_duplicates(
        subset='title',
        keep='first',
        inplace=True
    )
    return df

def build_union_graph_from_DataFrame_list(df_list:list[pd.DataFrame])->nx.Graph:
    return create_graph(
        concatenate_DataFrame_from_list(df_list)
    )



In [41]:
G = build_union_graph_from_DataFrame_list(df_list)

In [46]:
def answer_to_all_main_questions(G:nx.Graph,name:str)->None:
    print(f"-------------Graph: {name}--------------\n")

    title, authors, counter = get_publication_with_max_authors(G)
    print("The publication with most authors is:")
    print(f"{title} \n {list(authors)} \n wich has {counter} authors\n")

    diameter = iFub(G, calculate_starting_node(G))
    nx_diameter = nx.diameter(get_largest_connected_component(G))
    print(f"The graph has exact diameter: {diameter}")
    print(f"Diameter calculated with NetworkX is: {nx_diameter}\n")

    author, count, collaborators = get_author_with_most_collaborotors(G)
    print(f"The author with most collaborators is {author}, with {count} collaborators:\n{collaborators}\n")

In [47]:
answer_to_all_main_questions(
    G,
    "Union"
)

-------------Graph: Union--------------

The publication with most authors is:
R-GMA: An Information Integration System for Grid Monitoring. 
 ['Abdeslem Djaoui', 'Alasdair J. G. Gray', 'Andrew W. Cooke', 'Antony J. Wilson', 'Brian A. Coghlan', "David O'Callaghan", 'James Magowan', 'Jason Leake', 'Laurence Field', 'Linda Cornwall', 'Lisha Ma', 'Manfred Oevers', 'Manish Soni', 'Norbert Podhorszki', 'Paul Taylor', 'Rob Byrom', 'Roney Cordenonsi', 'Steve Fisher', 'Steve Hicks', 'Stuart Kenny', 'Werner Nutt'] 
 wich has 21 authors

The graph has exact diameter: 7
Diameter calculated with NetworkX is: 7

The author with most collaborators is Tuncer I. Ören, with 37 collaborators:
['Navonil Mustafee', 'Alison Harper', 'Balqies Sadoun', 'Bernard P. Zeigler', 'Mohammad S. Obaidat', 'Yuanjun Laili', 'Azam Khan', 'Margaret L. Loper', 'Lance E. Champagne', 'Jean François Santucci', 'Lin Zhang 0009', 'Claudia Szabo', 'Umut Durak', 'Cláudio Gomes 0001', 'Saurabh Mittal', 'Ernest H. Page', 'Rhys Gol