In [105]:
import kagglehub
import os
import ast

import numpy as np
import pandas as pd
import networkx as nx
from collections import defaultdict
from itertools import combinations

In [106]:
# Fetch the research-papers dataset through kagglehub; `path` holds the
# location kagglehub reports for the downloaded files.
DATASET_HANDLE = "nechbamohammed/research-papers-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

Path to dataset files: /Users/andreavezzuto/.cache/kagglehub/datasets/nechbamohammed/research-papers-dataset/versions/1


In [107]:
# Load the dblp-v10 CSV from the downloaded dataset folder into a DataFrame
csv_path = os.path.join(path, 'dblp-v10.csv')
df = pd.read_csv(csv_path)
print(f"Loaded dblp-v10.csv with shape {df.shape}")

Loaded dblp-v10.csv with shape (1000000, 8)


In [108]:
# Preprocessing: the 'authors' column holds stringified Python lists, so
# convert every cell into a real list.
def _parse_author_list(raw):
    """Evaluate a stringified author list; any non-string cell becomes []."""
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    return []

df['authors'] = df['authors'].apply(_parse_author_list)

In [109]:
# The full dataset is too large for graph-based analysis, so keep only the
# papers published in 2009-2010 (about 100k papers).
START_YEAR = 2009
END_YEAR = 2010
in_window = df['year'].between(START_YEAR, END_YEAR)  # inclusive on both ends
df = df[in_window]
print(f"After temporal filtering: {df.shape[0]} papers")

After temporal filtering: 104493 papers


In [110]:
# Tally how many papers each author appears on
author_paper_count = defaultdict(int)
for paper_authors in df['authors']:
    for name in paper_authors:
        author_paper_count[name] += 1

# Shrink the graph further: only authors with at least two papers stay eligible
eligible_authors = set()
for name, n_papers in author_paper_count.items():
    if n_papers >= 2:
        eligible_authors.add(name)
print(f"Eligible authors (>=2 papers): {len(eligible_authors)}")

def filter_authors(authors):
    """Return the entries of ``authors`` that are in ``eligible_authors``.

    The original order (and any repeated names) of the input is preserved.
    """
    kept = []
    for name in authors:
        if name in eligible_authors:
            kept.append(name)
    return kept

# Keep only eligible authors on each paper. `df` is a boolean-mask slice of
# the loaded frame, so build a new frame with assign() rather than writing a
# column into the slice (which triggers pandas' SettingWithCopyWarning).
df = df.assign(authors=df['authors'].apply(filter_authors))

# Drop papers that are left with no eligible author
df = df[df['authors'].map(len) > 0]
print(f"After filtering for eligible authors: {df.shape[0]} papers")

Eligible authors (>=2 papers): 60572
After filtering for eligible authors: 88880 papers


In [111]:
# Downsample to 3000 randomly selected papers. In the author's testing this is
# the largest size before the graph feature calculations become too slow.
TARGET_PAPERS = 3000
needs_sampling = len(df) > TARGET_PAPERS
if needs_sampling:
    sampled = df.sample(n=TARGET_PAPERS, random_state=42)  # fixed seed for reproducibility
    df = sampled.reset_index(drop=True)
    print(f"After random sampling: {df.shape[0]} papers")

After random sampling: 3000 papers


In [112]:
# For every author, sum the citation counts of all papers they appear on
author_total_citations = defaultdict(int)

for paper_authors, paper_cites in zip(df['authors'], df['n_citation']):
    for name in paper_authors:
        author_total_citations[name] += paper_cites

# Peek at the first ten (author, total citations) pairs
list(author_total_citations.items())[:10]

[('Js Li', 50),
 ('Yang Shen', 0),
 ('Jakub T. MoÅ›cicki', 156),
 ('H. Lee', 156),
 ('Xuemei Chen', 1),
 ('Dimitrios Katsaros', 0),
 ('Xing Su', 50),
 ('Tiejun Huang', 50),
 ('Wen Gao', 65),
 ('Anju Verma', 50)]

In [113]:
# Undirected coauthorship graph: one node per author, one edge per pair of
# authors that appear together on a paper.
G = nx.Graph()

# 1. Nodes carry the author's summed citation count as an attribute
G.add_nodes_from(
    (name, {'total_citations': total})
    for name, total in author_total_citations.items()
)

# 2. Edges: link every pair of coauthors of each paper; `weight` starts at 1
#    and is incremented each time the same pair shows up again
for paper_authors in df['authors']:
    for u, v in combinations(paper_authors, 2):
        if not G.has_edge(u, v):
            G.add_edge(u, v, weight=1)
            continue
        G[u][v]['weight'] += 1

In [114]:
# Report the size of the coauthorship graph
n_authors = G.number_of_nodes()
n_links = G.number_of_edges()
print("Authors (nodes):", n_authors)
print("Coauthor edges:", n_links)

Authors (nodes): 6547
Coauthor edges: 7299


In [115]:
# Prestige of an author in the coauthor network.
# PageRank is run on G with the 'weight' edge attribute, so a pair of authors
# whose edge weight was incremented several times counts more than a pair
# linked once.
pagerank = nx.pagerank(G, weight='weight')

In [116]:
# How much an author bridges different groups.
# No `weight` argument is passed here, so (unlike the PageRank call) the edge
# weights are not handed to this computation.
betweenness = nx.betweenness_centrality(G)

In [117]:
# How central an author is to the network.
# Called with G only: no distance/weight attribute is passed, so the edge
# weights are not handed to this computation either.
closeness = nx.closeness_centrality(G)

In [118]:
# Define top-K influential authors by PageRank
top_k = 50
top_authors = sorted(pagerank, key=pagerank.get, reverse=True)[:top_k]

# Hop distance from every author to the nearest top-K author.
#
# A single multi-source breadth-first search seeded with all top authors at
# distance 0 visits each node and edge once, and the level at which a node is
# first reached is exactly its minimum hop count to any top author. This
# replaces one has_path + shortest_path_length search per (author, top author)
# pair while producing the same values.
hops_to_top = {t: 0 for t in top_authors}
frontier = list(top_authors)
hops = 0
while frontier:
    hops += 1
    next_frontier = []
    for u in frontier:
        for v in G[u]:
            if v not in hops_to_top:
                # First visit happens at the smallest possible level
                hops_to_top[v] = hops
                next_frontier.append(v)
    frontier = next_frontier

# dictionary mapping authors to min distance to any top 50 author
# Edge case: authors with no reachable top author were never visited and get
# NaN (we might want to change this to a more reasonable number)
min_distances = {a: hops_to_top.get(a, np.nan) for a in G.nodes()}

In [119]:
# Group the papers by venue once and derive both venue statistics from it.
# Rows whose venue is missing are excluded by groupby's default behaviour, so
# they have no entry in either dictionary.
papers_by_venue = df.groupby('venue')

# Mean citation count of the papers at each venue
venue_mean_citations = papers_by_venue['n_citation'].mean().to_dict()

# Number of papers at each venue
venue_paper_count = papers_by_venue.size().to_dict()

In [120]:
#==============================
# GRAPH-BASED FEATURES PER AUTHOR
#==============================

def _author_node_features(name):
    """Bundle the graph metrics of one author (a node of G) into a dict."""
    return {
        'pagerank': pagerank[name],
        'betweenness': betweenness[name],
        'closeness': closeness[name],
        'min_distance_to_top_author': min_distances[name],
        # Sum of the 'weight' attribute over the edges incident to this author
        'weighted_degree': sum(w for _, _, w in G.edges(name, data='weight')),
    }

# Node-level features dictionary: author name -> feature dict
author_features = {name: _author_node_features(name) for name in G.nodes()}


# Example usage:
# author_features['Some Author']['pagerank'] gives PageRank
# author_features['Some Author']['weighted_degree'] gives the sum of edge weights over the author's coauthor links

#==============================
# PAPER-LEVEL GRAPH FEATURES
#==============================
# Aggregate author-level features per paper
def paper_graph_features(authors):
    """Aggregate the per-author graph features over the authors of one paper.

    Authors without an entry in ``author_features`` are ignored. Returns a
    dict with mean/max (and, for PageRank, std) of each author-level feature;
    every value is 0 when none of the authors is known.
    """
    rows = [author_features[name] for name in authors if name in author_features]
    has_rows = len(rows) > 0

    def column(feature):
        # One value per known author, in the order they appear on the paper
        return np.array([row[feature] for row in rows])

    pr_vals = column('pagerank')
    bt_vals = column('betweenness')
    cl_vals = column('closeness')
    md_vals = column('min_distance_to_top_author')
    wd_vals = column('weighted_degree')

    return {
        'mean_pagerank': pr_vals.mean() if has_rows else 0,
        'max_pagerank': pr_vals.max() if has_rows else 0,
        'std_pagerank': pr_vals.std() if has_rows else 0,
        'mean_betweenness': bt_vals.mean() if has_rows else 0,
        'max_betweenness': bt_vals.max() if has_rows else 0,
        'mean_closeness': cl_vals.mean() if has_rows else 0,
        'max_closeness': cl_vals.max() if has_rows else 0,
        'mean_min_distance_to_top_author': md_vals.mean() if has_rows else 0,
        'max_min_distance_to_top_author': md_vals.max() if has_rows else 0,
        'mean_weighted_degree': wd_vals.mean() if has_rows else 0,
        'max_weighted_degree': wd_vals.max() if has_rows else 0,
    }


# Compute the feature dict of every paper, then expand the dicts into columns
per_paper_graph_stats = df['authors'].apply(paper_graph_features)
graph_features_df = per_paper_graph_stats.apply(pd.Series)
graph_features_df.head()

# Example:
# graph_features_df.iloc[0]['mean_pagerank'] gives average prestige of first paper's authors

Unnamed: 0,mean_pagerank,max_pagerank,std_pagerank,mean_betweenness,max_betweenness,mean_closeness,max_closeness,mean_min_distance_to_top_author,max_min_distance_to_top_author,mean_weighted_degree,max_weighted_degree
0,2.6e-05,2.6e-05,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0
1,2.6e-05,2.6e-05,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0
2,0.000171,0.000171,0.0,0.0,0.0,0.000153,0.000153,,,1.0,1.0
3,2.6e-05,2.6e-05,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0
4,2.6e-05,2.6e-05,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0


In [121]:
#==============================
# NON-GRAPH FEATURES PER PAPER
#==============================
# Per-author tallies built here and aggregated per paper afterwards:
# - author_papers_count: how many papers (in the current df) an author appears on
# - author_citations_total: summed citation count of those papers
# Venue statistics (mean citations per paper at the venue, number of papers
# at the venue) are taken from the venue dictionaries computed earlier.
author_papers_count = defaultdict(int)
author_citations_total = defaultdict(int)
for paper_authors, paper_cites in zip(df['authors'], df['n_citation']):
    for name in paper_authors:
        author_citations_total[name] += paper_cites
        author_papers_count[name] += 1

# Paper-level aggregation
def paper_non_graph_features(authors, venue):
    """Build the non-graph feature dict for one paper.

    Aggregates the per-author paper counts and citation totals over
    ``authors`` (authors missing from the tallies count as 0) and looks up the
    venue statistics. Unknown venues yield NaN mean citations and 0 papers;
    an empty author list yields 0 for every author aggregate.
    """
    papers_per_author = [author_papers_count.get(name, 0) for name in authors]
    cites_per_author = [author_citations_total.get(name, 0) for name in authors]

    # Both lists have one entry per author, so one emptiness check covers both
    if papers_per_author:
        features = {
            'mean_num_papers': np.mean(papers_per_author),
            'max_num_papers': np.max(papers_per_author),
            'mean_total_citations': np.mean(cites_per_author),
            'max_total_citations': np.max(cites_per_author),
            'sum_total_citations': np.sum(cites_per_author),
        }
    else:
        features = {
            'mean_num_papers': 0,
            'max_num_papers': 0,
            'mean_total_citations': 0,
            'max_total_citations': 0,
            'sum_total_citations': 0,
        }

    features['venue_mean_citations'] = venue_mean_citations.get(venue, np.nan)
    features['venue_num_papers'] = venue_paper_count.get(venue, 0)
    return features

def _row_to_non_graph_features(row):
    """Feed one paper's authors and venue to paper_non_graph_features."""
    return paper_non_graph_features(row['authors'], row['venue'])

# One feature dict per paper, then expand the dicts into columns
non_graph_records = df.apply(_row_to_non_graph_features, axis=1)
non_graph_features_df = non_graph_records.apply(pd.Series)
non_graph_features_df.head()

# Example:
# non_graph_features_df.iloc[0]['mean_num_papers'] is average productivity of authors of first paper

Unnamed: 0,mean_num_papers,max_num_papers,mean_total_citations,max_total_citations,sum_total_citations,venue_mean_citations,venue_num_papers
0,1.0,1.0,50.0,50.0,50.0,79.5,4.0
1,1.0,1.0,0.0,0.0,0.0,,0.0
2,1.0,1.0,156.0,156.0,312.0,103.0,2.0
3,1.0,1.0,1.0,1.0,1.0,,0.0
4,1.0,1.0,0.0,0.0,0.0,5.0,2.0


In [122]:
# Assemble the model inputs (graph, non-graph and combined feature frames)
# and the ground-truth target (citation count per paper).
# NOTE(review): `df_meta` is not created by any cell visible in this part of
# the notebook — confirm it is defined earlier; otherwise this cell raises
# NameError on a fresh kernel (Restart & Run All).
X_graph, X_non_graph = graph_features_df, non_graph_features_df
X_all = pd.concat([X_graph, X_non_graph, df_meta], axis=1)
y = df['n_citation']

for label, frame in (
    ("Graph features shape:", X_graph),
    ("Non-graph features shape:", X_non_graph),
    ("Combined features shape:", X_all),
):
    print(label, frame.shape)

Graph features shape: (3000, 11)
Non-graph features shape: (3000, 7)
Combined features shape: (3000, 20)
