In [13]:
import kagglehub
import os
import ast

import numpy as np
import pandas as pd
import networkx as nx
from collections import defaultdict
from itertools import combinations

In [14]:
# Download dataset from Kaggle
path = kagglehub.dataset_download("nechbamohammed/research-papers-dataset")
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'research-papers-dataset' dataset.
Path to dataset files: /kaggle/input/research-papers-dataset


In [15]:
# Read dataset into pandas dataframe
df = pd.read_csv(os.path.join(path, 'dblp-v10.csv'))
print(f"Loaded dblp-v10.csv with shape {df.shape}")

Loaded dblp-v10.csv with shape (1000000, 8)


In [16]:
# Some preprocessing... make the authors of a paper into a list
df['authors'] = df['authors'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

In [17]:
# We keep all papers until 2015. That way we can have a test set for papers from 2016 and 2017
END_YEAR = 2015
train_df = df[df["year"] <= END_YEAR].copy()
test_df  = df[df["year"] >  END_YEAR].copy()
df = train_df
print(f"After temporal filtering: {df.shape[0]} papers")

After temporal filtering: 768113 papers


In [18]:
# Count papers per author
author_paper_count = defaultdict(int)
for authors in df['authors']:
    for a in authors:
        author_paper_count[a] += 1

# Using the count we keep only authors with >=2 papers to reduce graph size further
eligible_authors = {a for a, c in author_paper_count.items() if c >= 2}
print(f"Eligible authors (>=2 papers): {len(eligible_authors)}")

def filter_authors(authors):
    return [a for a in authors if a in eligible_authors]

df['authors'] = df['authors'].apply(filter_authors)

# Remove papers with <1 eligible author
df = df[df['authors'].map(len) > 0]
print(f"After filtering for eligible authors: {df.shape[0]} papers")

Eligible authors (>=2 papers): 319246
After filtering for eligible authors: 735290 papers


In [19]:
# We reduce the dataset further to 20000 randomly selected papers. Based on my testing, this is the max we can have before graph feature calculations explode in terms of time complexity
TARGET_PAPERS = 20000
if df.shape[0] > TARGET_PAPERS:
    df = df.sample(n=TARGET_PAPERS, random_state=42).reset_index(drop=True)
    print(f"After random sampling: {df.shape[0]} papers")

After random sampling: 20000 papers


In [20]:
# Map storing the number of citations for a particular author
author_total_citations = defaultdict(int)

for authors, cites in zip(df['authors'], df['n_citation']):
    for a in authors:
        author_total_citations[a] += cites

list(author_total_citations.items())[:10]

[('Carlos Castillo', 485),
 ('Z. Li', 50),
 ('Dereck S. Meek', 50),
 ('Desmond J. Walton', 50),
 ('Alexandre Demeure', 77),
 ('Jean-Sébastien Sottet', 77),
 ('Gaëlle Calvary', 82),
 ('Joëlle Coutaz', 77),
 ('Jean Vanderdonckt', 165),
 ('Giuseppe Iannaccone', 50)]

In [21]:
G = nx.Graph()

# 1. Add nodes with attributes
for a in author_total_citations:
    G.add_node(a, total_citations=author_total_citations[a])

# 2. Add edges between nodes
for authors in df['authors']:
    # Add an edge for all pairs of coauthors for this paper
    for a1, a2 in combinations(authors, 2):
        if G.has_edge(a1, a2):
            G[a1][a2]['weight'] += 1
        else:
            G.add_edge(a1, a2, weight=1)

In [22]:
print("Authors (nodes):", G.number_of_nodes())
print("Coauthor edges:", G.number_of_edges())

Authors (nodes): 43325
Coauthor edges: 65588


In [23]:
# Prestige of an author in the coauthor network
pagerank = nx.pagerank(G, weight='weight')

In [24]:
# How much an author bridges different groups
betweenness = nx.betweenness_centrality(G, k = 500, seed = 42)

In [25]:
# How central an author is to the network
closeness = nx.closeness_centrality(G)

In [26]:
# Fraction of the author’s coauthors who have also collaborated with each other (group tightness)
clustering = nx.clustering(G, weight='weight')

In [27]:
# Compute mean citations per venue
venue_mean_citations = df.groupby('venue')['n_citation'].mean().to_dict()

# Compute total papers per venue
venue_paper_count = df.groupby('venue').size().to_dict()

In [28]:
#==============================
# GRAPH-BASED FEATURES PER AUTHOR
#==============================

# Node-level features dictionary
author_features_full = {}
for a in G.nodes():
    author_features_full[a] = {
        'pagerank': pagerank[a],
        'betweenness': betweenness[a],
        'closeness': closeness[a],
        'component_size': len(nx.node_connected_component(G, a)),
        'weighted_degree': sum(d['weight'] for _, _, d in G.edges(a, data=True)),
        'clustering': clustering[a],
    }


# Example usage:
# author_features['Some Author']['pagerank'] gives PageRank
# author_features['Some Author']['degree'] gives number of coauthors

#==============================
# PAPER-LEVEL GRAPH FEATURES
#==============================
# Aggregate author-level features per paper
def paper_graph_features(authors, author_features):
    known_authors = [a for a in authors if a in author_features]

    # aggregate stats per paper
    pr = np.array([author_features[a]['pagerank'] for a in known_authors])
    bt = np.array([author_features[a]['betweenness'] for a in known_authors])
    cl = np.array([author_features[a]['closeness'] for a in known_authors])
    cs = np.array([author_features[a]['component_size'] for a in known_authors])
    wdeg = np.array([author_features[a]['weighted_degree'] for a in known_authors])
    ct = np.array([author_features[a]['clustering'] for a in known_authors])

    return {
        'mean_pagerank': pr.mean() if len(pr) > 0 else 0,
        'max_pagerank': pr.max() if len(pr) > 0 else 0,
        'mean_betweenness': bt.mean() if len(bt) > 0 else 0,
        'max_betweenness': bt.max() if len(bt) > 0 else 0,
        'mean_closeness': cl.mean() if len(cl) > 0 else 0,
        'max_closeness': cl.max() if len(cl) > 0 else 0,
        'mean_component_size': cs.mean() if len(cs) > 0 else 0,
        'max_component_size': cs.max() if len(cs) > 0 else 0,
        'mean_weighted_degree': wdeg.mean() if len(wdeg) > 0 else 0,
        'max_weighted_degree': wdeg.max() if len(wdeg) > 0 else 0,
        'mean_clustering': ct.mean() if len(ct) > 0 else 0,
        'max_clustering': ct.max() if len(ct) > 0 else 0,
    }


# Apply to all papers
graph_features_full_df = df["authors"].apply(lambda a: paper_graph_features(a, author_features_full)).apply(pd.Series)
graph_features_full_df.head()

# Example:
# graph_features_df.iloc[0]['mean_pagerank'] gives average prestige of first paper's authors

Unnamed: 0,mean_pagerank,max_pagerank,mean_betweenness,max_betweenness,mean_closeness,max_closeness,mean_component_size,max_component_size,mean_weighted_degree,max_weighted_degree,mean_clustering,max_clustering
0,5.4e-05,5.4e-05,0.0,0.0,0.000115,0.000115,6.0,6.0,5.0,5.0,0.02,0.02
1,2.4e-05,2.4e-05,0.0,0.0,4.6e-05,4.6e-05,3.0,3.0,2.0,2.0,0.1,0.1
2,3.5e-05,5.9e-05,0.0,0.0,0.000165,0.000208,13.0,13.0,5.2,8.0,0.074333,0.1
3,4e-06,4e-06,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,2.4e-05,2.4e-05,0.0,0.0,2.3e-05,2.3e-05,2.0,2.0,1.0,1.0,0.0,0.0


In [29]:
# Count authors with non-zero betweenness. This feature has low variance but high meaningfulness when authors do diverge from the mean
num_nonzero_bt = sum(1 for v in betweenness.values() if v > 0)

num_authors = len(betweenness)

print(f"Authors with non-zero betweenness: {num_nonzero_bt} / {num_authors}")
print(f"Fraction: {num_nonzero_bt / num_authors:.4f}")

Authors with non-zero betweenness: 2568 / 43325
Fraction: 0.0593


In [30]:
#==============================
# NON-GRAPH FEATURES PER PAPER
#==============================
# - num_papers: how many papers an author has published
# - total_citations: total citations of an author
# - citations_per_paper: average citations per paper
# - venue: statistics on a particular venue (mean citations per paper at the venue, total number of papers at the venue)
author_papers_count = defaultdict(int)
author_citations_total = defaultdict(int)
for authors, cites in zip(df['authors'], df['n_citation']):
    for a in authors:
        author_papers_count[a] += 1
        author_citations_total[a] += cites

# Paper-level aggregation
def paper_non_graph_features(authors, venue, ng_state):


    author_papers_count   = ng_state["author_papers_count"]
    author_total_citations = ng_state["author_total_citations"]
    venue_mean_cites      = ng_state["venue_mean_cites"]
    venue_num_papers      = ng_state["venue_num_papers"]

    counts = [author_papers_count.get(a, 0) for a in authors]
    citations = [author_citations_total.get(a, 0) for a in authors]
    mean_cites_venue = venue_mean_citations.get(venue, np.nan)
    num_papers_venue = venue_paper_count.get(venue, 0)



    return {
        'mean_num_papers': np.mean(counts) if counts else 0,
        'max_num_papers': np.max(counts) if counts else 0,
        'mean_total_citations': np.mean(citations) if citations else 0,
        'max_total_citations': np.max(citations) if citations else 0,
        'sum_total_citations': np.sum(citations) if citations else 0,
        'venue_mean_citations': mean_cites_venue,
        'venue_num_papers': num_papers_venue
    }



In [31]:
def build_non_graph_state(df_fold):
    author_papers_count = defaultdict(int)
    author_total_citations = defaultdict(float)
    venue_num_papers = defaultdict(int)
    venue_total_citations = defaultdict(float)

    for authors, venue, cites in zip(df_fold["authors"], df_fold["venue"], df_fold["n_citation"]):
        venue_num_papers[venue] += 1
        venue_total_citations[venue] += (0 if pd.isna(cites) else cites)

        for a in authors:
            author_papers_count[a] += 1
            author_total_citations[a] += (0 if pd.isna(cites) else cites)

    venue_mean_cites = {
        v: (venue_total_citations[v] / venue_num_papers[v]) if venue_num_papers[v] > 0 else 0
        for v in venue_num_papers
    }

    return {
        "author_papers_count": author_papers_count,
        "author_total_citations": author_total_citations,
        "venue_num_papers": venue_num_papers,
        "venue_mean_cites": venue_mean_cites,
    }


In [32]:
ng_state_full = build_non_graph_state(df)

non_graph_features_full_df = df.apply(
    lambda row: paper_non_graph_features(row["authors"], row["venue"], ng_state_full),
    axis=1
).apply(pd.Series)

non_graph_features_full_df.head()

# Example:
# non_graph_features_full_df.iloc[0]['mean_num_papers'] is average productivity of authors of first paper

Unnamed: 0,mean_num_papers,max_num_papers,mean_total_citations,max_total_citations,sum_total_citations,venue_mean_citations,venue_num_papers
0,4.0,4.0,485.0,485.0,485.0,180.076923,13.0
1,1.0,1.0,50.0,50.0,150.0,53.0,5.0
2,1.8,4.0,95.6,165.0,478.0,52.5,2.0
3,1.0,1.0,50.0,50.0,50.0,39.666667,9.0
4,1.0,1.0,1.0,1.0,2.0,24.0,17.0


In [33]:
# Combined features and ground truths ready to be inputted
X_graph = graph_features_full_df
X_non_graph = non_graph_features_full_df
X_all = pd.concat([graph_features_full_df, non_graph_features_full_df], axis=1)
y = df["n_citation"].to_numpy()

# print("Graph features shape:", X_graph.shape)
print("Non-graph features shape:", X_non_graph.shape)
# print("Combined features shape:", X_all.shape)

Non-graph features shape: (20000, 7)


In [34]:
# Import ML libraries for model fitting
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

In [35]:
#===================
# MODEL DEFINITIONS
#===================

# Define feature sets for comparison
feature_sets = {
    'Graph Features': X_graph,
    'Non-Graph Features': X_non_graph,
    'All Features': X_all
}

# Define model pipelines (imputation -> scaling -> model)
def create_ridge_pipeline(alpha=1.0):
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', Ridge(alpha=alpha, random_state=42))
    ])

def create_rf_pipeline(n_estimators=100, max_depth=15):
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('model', RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_leaf=5,
            n_jobs=-1,
            random_state=42
        ))
    ])

def create_mlp_pipeline(hidden_layers=(64, 32)):
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', MLPRegressor(
            hidden_layer_sizes=hidden_layers,
            activation='relu',
            solver='adam',
            alpha=0.001,  # L2 regularization
            max_iter=500,
            early_stopping=True,
            validation_fraction=0.1,
            random_state=42
        ))
    ])

models = {
    'Ridge Regression': create_ridge_pipeline,
    'Random Forest': create_rf_pipeline,
    'MLP': create_mlp_pipeline
}

print("Model pipelines defined successfully")

Model pipelines defined successfully


In [36]:
#================================
# FIT MODELS ON ALL FEATURE SETS
#================================

fitted_models = {}

for model_name, model_factory in models.items():
    print(f"\nFitting: {model_name}")

    for feature_name, X in feature_sets.items():
        # Create and fit model
        model = model_factory()
        model.fit(X, y)

        # Store fitted model
        key = (model_name, feature_name)
        fitted_models[key] = model

        print(f" + {feature_name}")

print(f"\n{len(fitted_models)} models fitted successfully")
print("Access with: fitted_models[('Model Name', 'Feature Set')]")


Fitting: Ridge Regression
 + Graph Features
 + Non-Graph Features
 + All Features

Fitting: Random Forest
 + Graph Features
 + Non-Graph Features
 + All Features

Fitting: MLP
 + Graph Features
 + Non-Graph Features
 + All Features

9 models fitted successfully
Access with: fitted_models[('Model Name', 'Feature Set')]


In [37]:
def build_author_features(df_fold):
  G_fold = nx.Graph()
  author_features = {}

  # Add edges between nodes
  for authors in df_fold['authors']:
      # Add an edge for all pairs of coauthors for this paper
      for a1, a2 in combinations(authors, 2):
          if G_fold.has_edge(a1, a2):
              G_fold[a1][a2]['weight'] += 1
          else:
              G_fold.add_edge(a1, a2, weight=1)
  if G_fold.number_of_nodes() == 0:
    return {}
  # Prestige of an author in the coauthor network
  pagerank_fold = nx.pagerank(G_fold, weight='weight')

  # How much an author bridges different groups
  k = min(500, G_fold.number_of_nodes())
  betweenness_fold = nx.betweenness_centrality(G_fold, k=k, seed = 42)

  # How central an author is to the network
  closeness_fold = nx.closeness_centrality(G_fold)

  # Fraction of the author’s coauthors who have also collaborated with each other (group tightness)
  clustering_fold = nx.clustering(G_fold, weight='weight')
  for a in G_fold.nodes():
    author_features[a] = {
        'pagerank': pagerank_fold[a],
        'betweenness': betweenness_fold[a],
        'closeness': closeness_fold[a],
        'component_size': len(nx.node_connected_component(G_fold, a)),
        'weighted_degree': sum(d['weight'] for _, _, d in G_fold.edges(a, data=True)),
        'clustering': clustering_fold[a],
    }


  return author_features


In [39]:
# Cross validation
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

cv = GroupKFold(n_splits=5)
groups = df["year"].to_numpy()
y = np.asarray(y)
y = np.nan_to_num(y, nan=0.0)


feature_names = ["Graph Features", "Non-Graph Features", "All Features"]
fold_cache = []


for train_idx, val_idx in cv.split(df, y, groups=groups):
    df_tr = df.iloc[train_idx]
    df_va = df.iloc[val_idx]

    y_tr = y[train_idx]
    y_va = y[val_idx]

    # fold-specific graph features (fit on train-fold only)
    author_features = build_author_features(df_tr)

    Xtr_graph = (
        df_tr["authors"]
        .apply(lambda a: paper_graph_features(a, author_features))
        .apply(pd.Series)
    )
    Xva_graph = (
        df_va["authors"]
        .apply(lambda a: paper_graph_features(a, author_features))
        .apply(pd.Series)
    )


    ng_state = build_non_graph_state(df_tr)

    Xtr_non = (
        df_tr.apply(lambda r: paper_non_graph_features(r["authors"], r["venue"], ng_state), axis=1)
            .apply(pd.Series)
    )
    Xva_non = (
        df_va.apply(lambda r: paper_non_graph_features(r["authors"], r["venue"], ng_state), axis=1)
            .apply(pd.Series)
    )

    Xtr_graph = Xtr_graph.fillna(0)
    Xva_graph = Xva_graph.fillna(0)

    Xtr_non = Xtr_non.fillna(0)
    Xva_non = Xva_non.fillna(0)



    fold_cache.append((Xtr_graph, Xva_graph, Xtr_non, Xva_non, y_tr, y_va))




rows = []

for model_name, model_factory in models.items():
  for feat_name in feature_names:
    rmse_folds, mae_folds, r2_folds = [], [], []
    for Xtr_graph, Xva_graph, Xtr_non, Xva_non, y_tr, y_va in fold_cache:
      # choose feature set for this run
      if feat_name == "Graph Features":
          Xtr, Xva = Xtr_graph, Xva_graph
      elif feat_name == "Non-Graph Features":
          Xtr, Xva = Xtr_non, Xva_non
      else:  # "All Features"
          Xtr = pd.concat([Xtr_graph, Xtr_non], axis=1)
          Xva = pd.concat([Xva_graph, Xva_non], axis=1)

      model = model_factory()
      model.fit(Xtr, y_tr)
      pred = model.predict(Xva)

      rmse_folds.append(np.sqrt(mean_squared_error(y_va, pred)))
      mae_folds.append(mean_absolute_error(y_va, pred))
      r2_folds.append(r2_score(y_va, pred))

    rows.append({
        "Model": model_name,
        "Features": feat_name,
        "CV_RMSE_mean": float(np.mean(rmse_folds)),
        "CV_RMSE_std":  float(np.std(rmse_folds, ddof=1)),
        "CV_MAE_mean":  float(np.mean(mae_folds)),
        "CV_R2_mean":   float(np.mean(r2_folds)),
    })

cv_results = pd.DataFrame(rows).sort_values(["CV_RMSE_mean", "CV_MAE_mean"])
cv_results


Unnamed: 0,Model,Features,CV_RMSE_mean,CV_RMSE_std,CV_MAE_mean,CV_R2_mean
4,Random Forest,Non-Graph Features,99.338417,41.542615,12.53467,0.739458
5,Random Forest,All Features,99.884424,41.101495,12.887066,0.73665
1,Ridge Regression,Non-Graph Features,113.135086,60.399872,42.968769,0.626481
2,Ridge Regression,All Features,115.127265,63.246641,42.362508,0.616692
8,MLP,All Features,134.498448,77.828938,57.805159,0.410587
6,MLP,Graph Features,193.412647,61.746124,45.410678,-0.007718
7,MLP,Non-Graph Features,194.346495,83.246058,142.885161,-0.053256
0,Ridge Regression,Graph Features,196.113084,63.594979,50.86292,-0.032879
3,Random Forest,Graph Features,198.182264,62.756471,51.27882,-0.061787
