In [1]:
import networkx as nx
import pandas as pd
import numpy as np


In [2]:
import os

path = "Data"
csv_list = os.listdir(path)

In [3]:
def read_csv(name:str)->pd.DataFrame:
    cols = ["id","author","title"]
    if name == "out-dblp_proceedings.csv":
        cols = ["id","editor","title"]
        
    df = pd.read_csv(
    "Data/" + name,
    delimiter=";",
    usecols=cols,
    nrows=2000
    )
    df.name = name.split(".")[0]
    df.rename(columns={"editor":"author"}, inplace=True)
    return df


df_list = list()
for csv in csv_list:
    df_list.append(
        read_csv(csv)
    )
    df_list[-1].dropna(inplace = True)


In [4]:
def create_graph(df:pd.DataFrame)->nx.Graph:
    G = nx.Graph()
    for publication_id, row in df.iterrows():
        authors = row["author"].split("|")
        title = row["title"]
        G.add_node(publication_id, bipartite = 0, title=title, authors_counter = len(authors))
        for author in authors:
            G.add_node(author, bipartite = 1)
            G.add_edge(publication_id,author)
    return G

graph_list = list()
for df in df_list:
    graph_list.append(
        create_graph(df)
    )

In [5]:
# Pubblicazione con maggior numero di autori:

def get_publication_with_max_authors(G:nx.Graph)->tuple[str, list[str], int]:
    publication_ids = {n for n, d in G.nodes(data=True) if d["bipartite"] == 0}
    publication_ids = list(publication_ids)

    authors_counter_array = np.array(
        list(map(lambda id: G.nodes[id]["authors_counter"], publication_ids))
        )

    max_authors_pubication_id = publication_ids[authors_counter_array.argmax()]

    authors = G.neighbors(max_authors_pubication_id)
    title = G.nodes[max_authors_pubication_id]["title"]

    return (
        title,
        list(authors),
        G.nodes[max_authors_pubication_id]['authors_counter']
    )

for idx, G in enumerate(graph_list):
    title, authors, counter = get_publication_with_max_authors(G)
    print(f"-------------Graph: {df_list[idx].name}--------------")
    print(f"{title} \n {list(authors)} \n wich has {counter} authors\n")


-------------Graph: out-dblp_article--------------
Making Bertha Drive - An Autonomous Journey on a Historic Route. 
 ['Andreas Tamke', 'Armin Joos', 'Carsten Brenk', 'Carsten Knöppel', 'Christoph Gustav Keller', 'Christoph Stiller', 'Clemens Rabe', 'David Pfeiffer', 'Eberhard Kaus', 'Eberhard Zeeb', 'Frank Lindner', 'Fridtjof Stein', 'Friedrich Erbs', 'Hans Fritz', 'Henning Lategahn', 'Horst Mock', 'Jochen Hipp', 'Julius Ziegler', 'Markus Braun', 'Markus Enzweiler', 'Markus Schreiber', 'Martin Haueis', 'Martin Hein', 'Maximilian Trepte', 'Mohammad Ghanaat', 'Nils Appenrodt', 'Philipp Bender', 'Ralf G. Herrtwich', 'Thao Dang 0002', 'Tobias Strauss', 'Uwe Franke'] 
 wich has 31 authors

-------------Graph: out-dblp_book--------------
The Munich Project CIP, Volume I: The Wide Spectrum Language CIP-L 
 ['Alfred Laut', 'Bernd Krieg-Brückner', 'Bernhard Möller', 'E. Hangel', 'Franz Geiselbrechtinger', 'Friederike Nickl', 'Friedrich L. Bauer', 'Hans Wössner', 'Helmuth Partsch', 'Klaus Samel

In [6]:
# Autore con maggior numero di collaboratori

def get_author_with_most_collaborotors(G:nx.Graph)->tuple:
    authors = {n for n, d in G.nodes(data=True) if d["bipartite"] == 1}
    authors = list(authors)

    max = {"author": "None","collaborators":list()}
    for author in authors:
        collaborators = set()
        publication_ids = [publication_id[1] for publication_id in list(G.edges(author))]
        for publication_id in publication_ids:
            collaborators.update(
                [publication_id[1] for publication_id in list(G.edges(publication_id))]
            )
        collaborators = list(collaborators)
        collaborators.remove(author)
        if len(collaborators)  > len(max["collaborators"]):
            max["author"] = author
            max["collaborators"] = collaborators
    return(
        max['author'],
        len(max['collaborators']),
        max['collaborators']
    )

for idx, G in enumerate(graph_list):
    author, count, collaborators = get_author_with_most_collaborotors(G)
    print(f"-------------Graph: {df_list[idx].name}--------------")
    print(f"The author with most collaborators is {author}, with {count} collaborators:\n{collaborators}\n")




-------------Graph: out-dblp_article--------------
The author with most collaborators is Christoph Stiller, with 44 collaborators:
['Clemens Rabe', 'Thao Dang 0002', 'Markus Schreiber', 'Matthew J. Barth', 'Eberhard Kaus', 'Christian Laugier', 'Hans Fritz', 'Klaus Dietmayer', 'Julius Ziegler', 'Tobias Strauss', 'Felix Klanner', 'Carsten Knöppel', 'Christian Ruhhammer', 'Henning Lategahn', 'Markus Enzweiler', 'Uwe Franke', 'Eberhard Zeeb', 'Andreas Tamke', 'Mohammad Ghanaat', 'Jochen Hipp', 'Nils Appenrodt', 'Ralf G. Herrtwich', 'Philippe Martinet', 'Fridtjof Stein', 'Philippe Bonnifait', 'David Pfeiffer', 'Horst Mock', 'Armin Joos', 'Klaus Bengler', 'Christoph Gustav Keller', 'Berthold Färber', 'Martin Haueis', 'Markus Maurer', 'Urbano Nunes', 'Friedrich Erbs', 'Philipp Bender', 'Hermann Winner', 'Carsten Brenk', 'Markus Braun', 'Martin Hein', 'Maximilian Trepte', 'Martin Liebner', 'Frank Lindner', 'Michael Baumann 0005']

-------------Graph: out-dblp_book--------------
The author with

In [7]:
def get_largest_connected_component(G:nx.Graph)->nx.Graph:
    return G.subgraph(
    sorted(nx.connected_components(G), key = len, reverse=True)[0]
    ).copy()
    
def find_farther_node(starting_node:str)->list:
    edges = nx.bfs_edges(G,starting_node)
    edges = [starting_node] + [v for u, v in edges]
    return list(G.edges(edges[-1]))[0][0]

def two_sweep_path(G:nx.Graph,starting_node:str)->list:
    a = find_farther_node(starting_node)
    b = find_farther_node(a)
    return nx.shortest_path(G,a,b)

def calculate_starting_node(G:nx.Graph)->str:
    lcc = get_largest_connected_component(G)
    starting_node = list(lcc)[0]
    path = two_sweep_path(lcc,starting_node)
    median_idx = len(path) // 2
    return path[median_idx]




In [8]:
# Calcolo del Diametro

def calculate_B_i(G:nx.Graph, u:str, i:int):
    a = nx.eccentricity(G)
    F = list()
    for key in a.keys():
        if a[key] == i:
            F.append(key)
    B_i = 0
    for node in F:
        max = nx.eccentricity(G, v=node)
        if max > B_i:
            B_i = max
    return B_i

def iFub(G:nx.Graph,u:str)-> int:
    lcc = get_largest_connected_component(G)
    i = nx.eccentricity(lcc,v=u)

    lb = i
    ub = 2*lb

    while ub > lb:
        B_i = calculate_B_i(lcc,u,i)
        max = np.max([lb,B_i])
        if max > 2*(i-1):
            return max
        else:
            lb = max
            ub = 2*(i-1)
        i=i-1
    return lb

for idx, G in enumerate(graph_list):
    diameter = iFub(G, calculate_starting_node(G))
    nx_diameter = nx.diameter(get_largest_connected_component(G))
    print(f"Graph {df_list[idx].name} has diameter: {diameter}")
    print(f"NetworkX diameter is {nx_diameter}\n")



Graph out-dblp_article has diameter: 15
NetworkX diameter is 26

Graph out-dblp_book has diameter: 3
NetworkX diameter is 6

Graph out-dblp_incollection has diameter: 4
NetworkX diameter is 7

Graph out-dblp_inproceedings has diameter: 8
NetworkX diameter is 16

Graph out-dblp_mastersthesis has diameter: 1
NetworkX diameter is 1

Graph out-dblp_phdthesis has diameter: 1
NetworkX diameter is 2

Graph out-dblp_proceedings has diameter: 23
NetworkX diameter is 46



In [9]:
def concatenate_DataFrame_from_list(df_list:list[pd.DataFrame])->pd.DataFrame:
    df = pd.concat(
        df_list,
        axis=0,
        ignore_index=True
    )
    df.drop_duplicates(
        subset='title',
        keep='first',
        inplace=True
    )
    return df

def build_union_graph_from_DataFrame_list(df_list:list[pd.DataFrame])->nx.Graph:
    return create_graph(
        concatenate_DataFrame_from_list(df_list)
    )



In [10]:
G = build_union_graph_from_DataFrame_list(df_list)

In [11]:
def answer_to_all_main_questions(G:nx.Graph,name:str)->None:
    print(f"-------------Graph: {name}--------------\n")

    title, authors, counter = get_publication_with_max_authors(G)
    print("The publication with most authors is:")
    print(f"{title} \n {list(authors)} \n wich has {counter} authors\n")

    diameter = iFub(G, calculate_starting_node(G))
    nx_diameter = nx.diameter(get_largest_connected_component(G))
    print(f"The graph has exact diameter: {diameter}")
    print(f"Diameter calculated with NetworkX is: {nx_diameter}\n")

    author, count, collaborators = get_author_with_most_collaborotors(G)
    print(f"The author with most collaborators is {author}, with {count} collaborators:\n{collaborators}\n")

In [12]:
answer_to_all_main_questions(
    G,
    "Union"
)

-------------Graph: Union--------------

The publication with most authors is:
Making Bertha Drive - An Autonomous Journey on a Historic Route. 
 ['Andreas Tamke', 'Armin Joos', 'Carsten Brenk', 'Carsten Knöppel', 'Christoph Gustav Keller', 'Christoph Stiller', 'Clemens Rabe', 'David Pfeiffer', 'Eberhard Kaus', 'Eberhard Zeeb', 'Frank Lindner', 'Fridtjof Stein', 'Friedrich Erbs', 'Hans Fritz', 'Henning Lategahn', 'Horst Mock', 'Jochen Hipp', 'Julius Ziegler', 'Markus Braun', 'Markus Enzweiler', 'Markus Schreiber', 'Martin Haueis', 'Martin Hein', 'Maximilian Trepte', 'Mohammad Ghanaat', 'Nils Appenrodt', 'Philipp Bender', 'Ralf G. Herrtwich', 'Thao Dang 0002', 'Tobias Strauss', 'Uwe Franke'] 
 wich has 31 authors

The graph has exact diameter: 22
Diameter calculated with NetworkX is: 39

The author with most collaborators is Christoph Stiller, with 43 collaborators:
['Clemens Rabe', 'Thao Dang 0002', 'Markus Schreiber', 'Eberhard Kaus', 'Christian Laugier', 'Hans Fritz', 'Klaus Dietmaye

In [13]:
import itertools

def build_authors_graph_from_DataFrame_list(df_list:list[pd.DataFrame])->nx.Graph:
    df = concatenate_DataFrame_from_list(df_list)
    df = df["author"]
    G = nx.Graph()
    for authors in df:
        authors_list = authors.split("|")
        for author_comb in itertools.combinations(authors_list,2):
            if G.has_edge(*author_comb):
                G[author_comb[0]][author_comb[1]]["weight"] += 1
            else:
                G.add_edge(*author_comb,weight = 1)
    return G

def find_most_collaborating_couple(G:nx.Graph)->tuple[str,str,dict[int]]:
    return  sorted(G.edges(data=True),key= lambda x: x[2]['weight'],reverse=True)[0]

author_1, author_2, weight = find_most_collaborating_couple(
    build_authors_graph_from_DataFrame_list(df_list)
    )

print(f"The most collaborating authors are {author_1} and {author_2} with {weight['weight']} collaborations togheter ")

The most collaborating authors are Christian S. Jensen and Richard T. Snodgrass with 20 collaborations togheter 
