In [1]:
import kagglehub
import os
import ast

import pandas as pd
import networkx as nx
from collections import defaultdict
from itertools import combinations

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the research-papers dataset through kagglehub and show where the
# files were placed; later cells read from `path`.
path = kagglehub.dataset_download(
    "nechbamohammed/research-papers-dataset"
)
print("Path to dataset files:", path)

Path to dataset files: /Users/andreavezzuto/.cache/kagglehub/datasets/nechbamohammed/research-papers-dataset/versions/1


In [3]:
# Load dblp-v10.csv from the downloaded dataset directory into a DataFrame
# and report its (rows, columns) shape.
csv_path = os.path.join(path, 'dblp-v10.csv')
df = pd.read_csv(csv_path)
print(f"Loaded dblp-v10.csv with shape {df.shape}")

Loaded dblp-v10.csv with shape (1000000, 8)


In [4]:
# Preprocessing: turn each paper's `authors` cell into a Python list.
def parse_authors(value):
    """Return the author list for one raw `authors` cell.

    - A list is returned unchanged, so re-running this cell on an already
      parsed column is a no-op instead of wiping every row to [].
    - A string is parsed with ast.literal_eval (literals only, no code
      execution).
    - Anything else (e.g. a missing value) becomes an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return ast.literal_eval(value)
    return []


df['authors'] = df['authors'].apply(parse_authors)

In [5]:
# Map each author name to the summed `n_citation` of every paper that lists
# them. Missing names start at 0 thanks to defaultdict(int).
author_total_citations = defaultdict(int)

for authors, cites in zip(df['authors'], df['n_citation']):
    # dict.fromkeys drops a name repeated within one paper (keeping
    # first-seen order) so a paper's citations are credited to each
    # author name at most once.
    for a in dict.fromkeys(authors):
        author_total_citations[a] += cites

# Peek at the first ten (author, total citations) pairs in insertion order
list(author_total_citations.items())[:10]

[('S. Ben Jabra', 50),
 ('Ezzeddine Zagrouba', 155),
 ('Joaquín J. Torres', 360),
 ('Jesús M. Cortés', 284),
 ('Joaquín Marro', 346),
 ('Hilbert J. Kappen', 1601),
 ('Genevi eve Paquin', 50),
 ('Laurent Vuillon', 241),
 ('Yaser Sheikh', 2537),
 ('Mumtaz Sheikh', 221)]

In [8]:
# Undirected coauthorship graph: one node per author name, one edge per pair
# of names that share at least one paper. The edge attribute `weight` counts
# the papers the pair has in common.
G = nx.Graph()

# 1. Add nodes with attributes
for a, total in author_total_citations.items():
    G.add_node(a, total_citations=total)

# 2. Add edges between nodes
for authors in df['authors']:
    # De-duplicate names within a paper (order preserved). Without this, a
    # name listed twice would create a self-loop (a, a) and would bump the
    # same coauthor edge more than once for a single paper.
    unique_authors = dict.fromkeys(authors)
    # all pairs of coauthors for this paper
    for a1, a2 in combinations(unique_authors, 2):
        if G.has_edge(a1, a2):
            G[a1][a2]['weight'] += 1
        else:
            G.add_edge(a1, a2, weight=1)

0                        [S. Ben Jabra, Ezzeddine Zagrouba]
1         [Joaquín J. Torres, Jesús M. Cortés, Joaquín M...
2                      [Genevi eve Paquin, Laurent Vuillon]
3               [Yaser Sheikh, Mumtaz Sheikh, Mubarak Shah]
4         [Efraim Laksman, Håkan Lennerstad, Magnus Nils...
                                ...                        
999995                     [Jeril Kuriakose, Sandeep Joshi]
999996              [Anh Khoa Bui, ZheKai Xiao, Liter Siek]
999997                [Sarah E. Ballinger, Thomas A. Adams]
999998                [Ben London, Bert Huang, Lise Getoor]
999999                         [Andrea Mesiarová-Zemánková]
Name: authors, Length: 1000000, dtype: object


In [7]:
# Report the graph size, then spot-check one author's coauthors
n_authors = G.number_of_nodes()
n_coauthor_edges = G.number_of_edges()
print("Authors (nodes):", n_authors)
print("Coauthor edges:", n_coauthor_edges)

ben_jabra_coauthors = list(G.neighbors("S. Ben Jabra"))
print("Neighbours of S. Ben Jabra:", ben_jabra_coauthors)

Authors (nodes): 957242
Coauthor edges: 3814744
Neighbours of S. Ben Jabra: ['Ezzeddine Zagrouba']
