# Analysis of individual authors

In [1]:
from __future__ import annotations
from dataclasses import dataclass, field
import tqdm
import json
import numpy as np
import matplotlib.pyplot as plt
import numba
from scipy.optimize import minimize_scalar
from collections import defaultdict
import scipy.special
import scipy.optimize

## Data preprocessing

In [2]:
dblp_file_path = '../../data/dblp/dblp.v12.json' # Path to the DBLP dataset (12th version)

### Helper classes

In [3]:
@dataclass
class Article:
    """A single publication record built from one JSON entry of the DBLP dump.

    `citations` starts empty and is filled in afterwards by inverting the
    `references` lists of all loaded articles.
    """
    id: int # ID of the article from DBLP
    year: int # Year of publication
    authors: list[int] # List of author IDs
    references: list[int] # List of reference IDs (articles cited by this article)
    data_citations: int # Number of citations reported by DBLP (the `n_citation` field)
    citations: list[Article] = field(default_factory=list) # List of citing articles (rebuilt from references)


In [4]:
@dataclass
class Author:
    """An author together with the per-author state accumulated by the analysis.

    Only `id` is supplied at construction time; every other field starts
    empty/zero and is populated by later cells of the notebook.
    """
    id: int # ID of the author from DBLP
    articles: list[Article] = field(default_factory=list) # List of articles written by the author
    citation_events: list[tuple[Article, Article, int]] = field(default_factory=list) # List of citation events in the form (cited article, citing article, citation type); type is 1 for self-citations and 0 for external citations
    log_likelihood_partial: list[tuple[float, ...]] = field(default_factory=list) # Per-event model outputs, overwritten by each call to calculate_partial_log_likelihood
    external_citations: int = 0 # Total number of external citations
    self_citations: int = 0 # Total number of self-citations

### Data loading

In [None]:
articles = {} # Dictionary of articles indexed by their ID

# Fields an entry must contain to be usable in the analysis
REQUIRED_FIELDS = ('authors', 'year', 'references', 'n_citation')

with open(dblp_file_path, encoding = 'utf8') as file:
    for line in tqdm.tqdm(file, total = 4894083):
        # Entries may be prefixed with a separating comma; strip it before parsing.
        # `startswith` is used instead of `line[0]` so that a line that becomes
        # empty after stripping is skipped rather than raising IndexError.
        if line.startswith(','):
            line = line[1:]
        if not line.startswith('{'):
            continue # Not a JSON object (e.g. the enclosing brackets of the dump)

        article_json = json.loads(line)

        # Filter out articles that do not have the required fields
        if any(field_name not in article_json for field_name in REQUIRED_FIELDS):
            continue

        article_id = int(article_json['id'])
        articles[article_id] = Article(
            id = article_id,
            year = int(article_json['year']),
            authors = [int(author['id']) for author in article_json['authors']],
            references = [int(reference_id) for reference_id in article_json['references']],
            data_citations = int(article_json['n_citation']),
        )

f'Number of articles: {len(articles)}'

In [None]:
empty_references = set() # Set of reference IDs that are not in the dataset
                         # (i.e. filtered out in the previous step or missing)
num_references = 0 # Total number of references
num_empty_references = 0 # Number of references that are not in the dataset

# Start from empty citation lists so that re-running this cell does not
# append every citation a second time.
for article in articles.values():
    article.citations.clear()

for article in tqdm.tqdm(articles.values()): # Recreate citations based on references
    for reference_id in article.references:
        if reference_id in articles:
            articles[reference_id].citations.append(article)
        else:
            empty_references.add(reference_id)
            num_empty_references += 1
        num_references += 1

# max(..., 1) avoids a ZeroDivisionError when no references were loaded at all
f'Empty references: {100 * num_empty_references / max(num_references, 1):.2f}%'

In [None]:
authors = {} # Dictionary of authors indexed by their ID

# Group the loaded articles by author, creating each Author on first sight
for article in tqdm.tqdm(articles.values()):
    for author_id in article.authors:
        author = authors.get(author_id)
        if author is None:
            author = authors[author_id] = Author(id = author_id)
        author.articles.append(article)

f'Number of authors: {len(authors)}'

In [None]:
 # Recreate citation events - citation_type = 1 for self-citations, 0 for external citations  

for author in tqdm.tqdm(authors.values()):
    # Reset the per-author state first so that re-running this cell does not
    # double the counters or duplicate the citation events.
    author.external_citations = 0
    author.self_citations = 0
    author.citation_events.clear()

    for article in author.articles:
        for citing_article in article.citations:
            # A citation is a self-citation when the author also wrote the citing article
            citation_type = 1 if author.id in citing_article.authors else 0
            if citation_type == 0:
                author.external_citations += 1
            else:
                author.self_citations += 1
            # Events are stored as (cited article, citing article, citation type)
            author.citation_events.append((article, citing_article, citation_type))

In [None]:
def _publication_year(article):
    """Sort key: the publication year of an article."""
    return article.year


def _citing_year(event):
    """Sort key: the publication year of the citing article of a citation event."""
    return event[1].year


for author in tqdm.tqdm(authors.values()): # Order articles and citation events chronologically
    author.articles.sort(key = _publication_year)
    author.citation_events.sort(key = _citing_year)

## Analysis

### Helper functions

In [10]:
def recreate_citation_vector(author):
    """Build the author's citation vector from the citation network.

    Returns a NumPy array holding, for each entry of `author.articles`
    (in the same order), the number of articles that cite it.
    """

    counts = []
    for paper in author.articles:
        counts.append(len(paper.citations))

    return np.array(counts)

In [11]:
def recreate_citation_vector_filtered(author, filter_type):
    """Build the author's citation vector using only one type of citation.

    Parameters
    ----------
    author : object with `articles` and `citation_events`
        Each citation event is indexed as (cited article, citing article,
        citation type).
    filter_type : int
        Citation type to keep (0 - external citations, 1 - self-citations).

    Returns
    -------
    numpy.ndarray
        For each entry of `author.articles` (in the same order), the number
        of kept citation events that cite it.
    """

    counts_by_article = defaultdict(int)

    for event in author.citation_events:
        # Ignore citations coming from articles published before the cited one
        if event[0].year > event[1].year:
            continue

        # Compare by value: an identity check (`is not`) only works by accident
        # for small cached Python ints and fails for e.g. NumPy integers.
        if event[2] != filter_type:
            continue

        counts_by_article[event[0].id] += 1

    # Articles without any matching event get a count of 0
    return np.array([counts_by_article[article.id] for article in author.articles])

In [11]:
# The idea is to calculate the probability of a citation event under
# various models (here random and PAR) separately and then combine
# them with different weights

def model(published_articles, citation_event):
    """Probability of a citation event under the random and preferential models.

    Parameters
    ----------
    published_articles : dict
        Maps article ID to the number of citations counted for it so far.
    citation_event : tuple
        Citation event; element 0 is the cited article and element 1 is the
        citing article (only their `id` attributes are used here).

    Returns
    -------
    numpy.ndarray
        `[random, preferential]`: the uniform probability over the candidate
        articles, and the cited article's share of the candidates' citations
        (0 when the candidates have no citations yet).
    """

    citing_id = citation_event[1].id
    cited_id = citation_event[0].id

    # The citing article itself is not a candidate for being cited
    candidates = {article_id: citations for article_id, citations in published_articles.items() if article_id != citing_id}

    random_dist = 1 / len(candidates)

    # Sum once and reuse it for both the zero check and the division
    sum_citations = sum(candidates.values())
    preferential_dist = candidates[cited_id] / sum_citations if sum_citations > 0 else 0

    return np.array([random_dist, preferential_dist])

In [12]:
def calculate_partial_log_likelihood(author, model, filter_type = None, **kwargs):
    """Calculate the partial likelihood values for an author.

    Walks through `author.citation_events` in order and, for each event,
    stores `model(published_articles, event, **kwargs)` in
    `author.log_likelihood_partial` (which is cleared first).

    `published_articles` maps article ID to its citation count. Citations are
    only added to it when the citing year changes, so every event of a given
    citing year sees the counts as they were before that year.
    NOTE(review): this relies on `citation_events` being ordered by the year
    of the citing article - confirm the sorting cell was run beforehand.

    Parameters
    ----------
    author : object with `articles`, `citation_events`, `log_likelihood_partial`
        Events are indexed as (cited article, citing article, citation type).
    model : callable
        Called as `model(published_articles, event, **kwargs)`.
    filter_type : int or None
        If given, only events of this citation type are passed to `model`;
        all events still contribute to the citation counts.
    **kwargs
        Forwarded to `model`.

    Returns
    -------
    dict
        The final mapping of article ID to citation count.
    """

    published_articles = {}
    pending_citations = defaultdict(int) # Citations of the current citing year, not yet visible
    current_year = None # Citing year of the most recently processed event

    author.log_likelihood_partial.clear()

    for event in author.citation_events:
        cited_article, citing_article = event[0], event[1]

        # Ignore citations coming from articles published before the cited one
        if cited_article.year > citing_article.year:
            continue

        if current_year != citing_article.year:
            # A new citing year starts: make the previous year's citations visible...
            for article_id, count in pending_citations.items():
                published_articles[article_id] += count
            pending_citations.clear()

            # ...and register the author's articles published up to this year
            for article in author.articles:
                if article.year <= citing_article.year and article.id not in published_articles:
                    published_articles[article.id] = 0

            current_year = citing_article.year

        if filter_type is None or event[2] == filter_type:
            author.log_likelihood_partial.append(model(published_articles, event, **kwargs))

        pending_citations[cited_article.id] += 1

    # Flush the citations of the last citing year
    for article_id, count in pending_citations.items():
        published_articles[article_id] += count

    return published_articles

In [13]:
def calculate_partial_log_likelihood_all(authors, model, filter_type = None, **kwargs):
    """Calculate the partial likelihood values for every eligible author.

    An author is processed only if they have at least 10 articles and at
    least one citation event; all other authors are left untouched.
    """

    for author in tqdm.tqdm(authors.values()):
        has_enough_articles = len(author.articles) >= 10
        has_citation_events = len(author.citation_events) > 0

        if has_enough_articles and has_citation_events:
            calculate_partial_log_likelihood(author, model, filter_type, **kwargs)

In [14]:
def log_prob(partials, weights):
    """Log-likelihood of a single citation event.

    `partials` holds the event's probability under each model and `weights`
    the corresponding mixing weights; the result is the natural logarithm
    of their dot product.
    """

    mixture_probability = np.dot(partials, weights)
    return np.log(mixture_probability)

In [15]:
def calculate_log_likelihood(author, weights):
    """Total log-likelihood of an author's stored citation events.

    Sums `log_prob(partials, weights)` over every entry of
    `author.log_likelihood_partial`; the result is 0 when there are none.
    """

    return sum(
        (log_prob(partials, weights) for partials in author.log_likelihood_partial),
        0,
    )

In [16]:
def calculate_log_likelihood_all(authors, weights):
    """Total log-likelihood over the stored citation events of all authors.

    Sums `log_prob(partials, weights)` over every entry of every author's
    `log_likelihood_partial`, showing a progress bar over the authors.
    """

    return sum(
        (
            log_prob(partials, weights)
            for author in tqdm.tqdm(authors.values())
            for partials in author.log_likelihood_partial
        ),
        0,
    )

In [14]:
def maximize_model(author):
    r"""Find the mixing weight that maximizes an author's log-likelihood.

    The likelihood of each citation event is a mixture with weights
    `[alpha, 1 - alpha]` applied to the author's stored partial values.
    The search is restricted to $\alpha \in [0.01, 0.99]$.

    Returns
    -------
    float
        The optimal $\alpha$ (weight of the first model component).
        The weight of the second component is $\rho = 1 - \alpha$.

    Raises
    ------
    AssertionError
        If the optimizer does not report success.
    """

    def negative_log_likelihood(alpha):
        return -calculate_log_likelihood(author, np.array([alpha, 1 - alpha]))

    # Request the bounded solver explicitly: whether `bounds` alone selects it
    # depends on the SciPy version, so the default cannot be relied on.
    result = minimize_scalar(negative_log_likelihood, bounds=(0.01, 0.99), method='bounded')
    assert result.success

    return result.x


### Calculations

In [None]:
# Store the per-event model probabilities for every eligible author,
# using all citation events (no citation-type filter).
calculate_partial_log_likelihood_all(authors, model)

In [None]:
# Results for all citations indexed by author ID; authors without any
# stored partial values are left out
results_all = {
    author.id: maximize_model(author)
    for author in tqdm.tqdm(authors.values())
    if getattr(author, 'log_likelihood_partial', None)
}

In [None]:
# Recompute the stored per-event model probabilities, this time keeping
# only external citations (citation type 0).
calculate_partial_log_likelihood_all(authors, model, filter_type = 0)

In [None]:
# Results for external citations indexed by author ID; authors without any
# stored partial values are left out
results_external = {
    author.id: maximize_model(author)
    for author in tqdm.tqdm(authors.values())
    if getattr(author, 'log_likelihood_partial', None)
}

In [None]:
# Recompute the stored per-event model probabilities, this time keeping
# only self-citations (citation type 1).
calculate_partial_log_likelihood_all(authors, model, filter_type = 1)

In [None]:
# Results for self-citations indexed by author ID; authors without any
# stored partial values are left out
results_self = {
    author.id: maximize_model(author)
    for author in tqdm.tqdm(authors.values())
    if getattr(author, 'log_likelihood_partial', None)
}

### Comparison with 3DSI

In [None]:
def model3_log(param, x, C, N):
    """Logarithm of the 3DSI model curve evaluated at `x`.

    `param[0]` is the model parameter being fitted; `C` and `N` are passed
    by the caller as the sum and the length of the fitted citation vector.
    """
    rho = param[0]
    scale_term = np.log((1.0 - rho) * C)
    pochhammer_ratio = scipy.special.poch(x, -rho) / scipy.special.poch(N + 1, -rho)
    shape_term = np.log(pochhammer_ratio - 1.0)
    return scale_term + shape_term - np.log(N * rho)

def fit_model(x, y, model_log, param0, bounds, args=(), loss='cauchy'):
    """Fit a model to data by least squares on the logarithmic scale.

    Parameters
    ----------
    x, y : array-like
        Data to fit; the residuals are `model_log(param, x, *args) - log(y)`.
    model_log : callable
        Returns the logarithm of the model value; called as
        `model_log(param, x, *args)`.
    param0 : array-like
        Initial guess for the parameters.
    bounds : tuple
        Parameter bounds, passed to `scipy.optimize.least_squares`.
    args : sequence, optional
        Extra positional arguments for `model_log`. The default is an
        immutable empty tuple rather than a shared mutable list.
    loss : str, optional
        Loss function name, passed to `scipy.optimize.least_squares`.

    Returns
    -------
    scipy.optimize.OptimizeResult
        The result object returned by `scipy.optimize.least_squares`.
    """
    # The target does not depend on the parameters, so compute it only once
    log_y = np.log(y)

    def resid(param):
        return model_log(param, x, *args) - log_y

    return scipy.optimize.least_squares(resid, param0, bounds=bounds, loss=loss)

results_3dsi = {} # Fitted 3DSI parameter indexed by author ID
for author in tqdm.tqdm(authors.values()):
    # Only authors that currently have stored partial values are fitted
    if not getattr(author, 'log_likelihood_partial', None):
        continue

    # Citation counts of the cited articles, ranked from most to least cited
    citation_counts = recreate_citation_vector(author)
    ranked_counts = np.sort(citation_counts[citation_counts > 0])[::-1]
    ranks = np.arange(1, len(ranked_counts) + 1)

    fit = fit_model(
        ranks,
        ranked_counts,
        model3_log,
        [0.5],
        args=[ranked_counts.sum(), ranked_counts.shape[0]],
        bounds=([1e-6], [1-1e-6]),
        loss='cauchy',
    )
    assert fit["success"]

    results_3dsi[author.id] = fit["x"][0]

### Saving the results

In [None]:
author_count = 0 # Number of authors written to the file
with open('results/model_random_par_citations_separate.csv', 'w') as file:
    file.write('id,articles,citations,external_citations,self_citations,alpha_all,alpha_external,alpha_self,3dsi\n')
    for author in tqdm.tqdm(authors.values()):
        if not author.log_likelihood_partial:
            continue

        # Only write authors for which every result is available; checking
        # results_3dsi as well avoids a KeyError when building the row.
        if any(author.id not in results for results in (results_all, results_external, results_self, results_3dsi)):
            continue

        author_count += 1

        file.write(f'{author.id},{len(author.articles)},{len(author.citation_events)},{author.external_citations},{author.self_citations},{results_all[author.id]},{results_external[author.id]},{results_self[author.id]},{results_3dsi[author.id]}\n')

# author_count is already an integer; calling len() on it raised a TypeError
f'Number of authors: {author_count}'