In [9]:
import kagglehub
import os
import ast

import pandas as pd
import networkx as nx
from collections import defaultdict
from itertools import combinations

In [10]:
# Fetch the research-papers dataset through kagglehub and keep the local
# directory it returns; later cells build file paths relative to `path`.
path = kagglehub.dataset_download("nechbamohammed/research-papers-dataset")
print(f"Path to dataset files: {path}")

Path to dataset files: /Users/andreavezzuto/.cache/kagglehub/datasets/nechbamohammed/research-papers-dataset/versions/1


In [11]:
# Load the DBLP v10 CSV from the downloaded dataset directory into a DataFrame
dblp_csv = os.path.join(path, 'dblp-v10.csv')
df = pd.read_csv(dblp_csv)
print(f"Loaded dblp-v10.csv with shape {df.shape}")

Loaded dblp-v10.csv with shape (1000000, 8)


In [12]:
# Some preprocessing... make the authors of a paper into a list
def _parse_authors(raw):
    """Return the author list for one raw `authors` cell.

    A string is parsed with `ast.literal_eval`, a value that is already a
    list is returned untouched, and anything else (e.g. a missing value)
    becomes an empty list.
    """
    if isinstance(raw, list):
        # Already parsed: keeps this cell safe to re-run. Without this branch
        # a second run would replace every parsed list with [].
        return raw
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    return []


df['authors'] = df['authors'].apply(_parse_authors)

In [13]:
# Map storing the number of citations for a particular author
author_total_citations = defaultdict(int)

for authors, cites in zip(df['authors'], df['n_citation']):
    # dict.fromkeys drops repeated names while keeping first-seen order, so a
    # name listed twice on one paper is credited with that paper's citations
    # only once, and the insertion order of the map stays deterministic.
    for a in dict.fromkeys(authors):
        author_total_citations[a] += cites

# Peek at the first ten (author, total citations) pairs
list(author_total_citations.items())[:10]

[('S. Ben Jabra', 50),
 ('Ezzeddine Zagrouba', 155),
 ('Joaquín J. Torres', 360),
 ('Jesús M. Cortés', 284),
 ('Joaquín Marro', 346),
 ('Hilbert J. Kappen', 1601),
 ('Genevi eve Paquin', 50),
 ('Laurent Vuillon', 241),
 ('Yaser Sheikh', 2537),
 ('Mumtaz Sheikh', 221)]

In [14]:
# Undirected coauthorship graph: one node per author name, one edge per pair
# of names that share at least one paper.
G = nx.Graph()

# 1. Add nodes with attributes
for a, total in author_total_citations.items():
    G.add_node(a, total_citations=total)

# 2. Add edges between nodes
for authors in df['authors']:
    # De-duplicate names within the paper (order preserved). A repeated name
    # would otherwise make combinations() yield (a, a), creating a self-loop,
    # and yield the same pair more than once, inflating the edge weight for a
    # single paper.
    unique_authors = dict.fromkeys(authors)

    # Add an edge for all pairs of coauthors for this paper; `weight` counts
    # the papers the two names share.
    for a1, a2 in combinations(unique_authors, 2):
        if G.has_edge(a1, a2):
            G[a1][a2]['weight'] += 1
        else:
            G.add_edge(a1, a2, weight=1)

In [15]:
# Quick sanity check of the graph: overall size plus one author's coauthors
print(f"Authors (nodes): {G.number_of_nodes()}")
print(f"Coauthor edges: {G.number_of_edges()}")
print("Neighbours of S. Ben Jabra:", [*G.neighbors("S. Ben Jabra")])

Authors (nodes): 957242
Coauthor edges: 3814744
Neighbours of S. Ben Jabra: ['Ezzeddine Zagrouba']
