# Rank authors
For any given article in the test dataset, compute the distance, in topic space, between that article and every author in the train dataset. For each article, rank the authors by distance. We expect the article's actual authors to be at a shorter-than-average distance from the article.



In [1]:
import pandas as pd
import pickle
import os

In [2]:
# Directory (relative path) from which the CSV datasets of this notebook are read
DATA_PATH = '../data'
# Directory (relative path) for model artifacts -- presumably where trained models are stored
MODELS_PATH = '../models'

## load the article metadata

In [3]:
def load_df(name):
    """
    Read one article-metadata CSV located in DATA_PATH.

    name: str, file name of the CSV inside DATA_PATH
    return: pandas.DataFrame using the first CSV column as index; the
        "authors_parsed" column is converted from its serialized text
        form into a list of author-name strings
    """
    def parse_authors(raw):
        # The cell holds a stringified list: split it on the quote-comma-quote
        # separator, then drop the leftover quotes and the outer brackets.
        names = []
        for chunk in raw.split("', '"):
            names.append(chunk.replace("'", '').strip("[]"))
        return names

    csv_path = os.path.join(DATA_PATH, name)
    return pd.read_csv(csv_path, index_col=0, converters={"authors_parsed": parse_authors})

# Load the validate, test and train article-metadata splits
validate_df, test_df, train_df = map(
    load_df,
    ('validate_topics9.csv', 'test_topics9.csv', 'train_topics9.csv'),
)

## Rank authors by distance to a given article

In [4]:
# load the topics by author: one CSV per split, first CSV column used as the index
authors_validate_df, authors_test_df, authors_train_df = (
    pd.read_csv(os.path.join(DATA_PATH, file_name), index_col=0)
    for file_name in (
        'validate_topics9_authors.csv',
        'test_topics9_authors.csv',
        'train_topics9_authors.csv',
    )
)

In [5]:
import numpy as np

def topic_distance(topics1, topics2):
    """
    Euclidean (L2) distance between two topic-probability vectors.

    topics1, topics2: numpy.ndarray, each entry is the probability that an
        article is about the topic at that position. Several topics can be
        non-zero at once. Example:
            0    0.000000
            1    0.000000
            2    0.000000
            3    0.000000
            4    0.000000
            5    0.992668
        topics1 and topics2 must be the same length.
    Topics were assigned to articles in 03_assign_topics.
    return: norm of the element-wise difference, as computed by numpy.linalg.norm
    """
    difference = topics1 - topics2
    # ord=None is numpy's default: the 2-norm of the (flattened) difference
    return np.linalg.norm(difference, ord=None)

In [16]:
# Boolean mask over validate_df's columns: True where the column name matches the "topic9_." pattern
mask = validate_df.columns.str.contains("topic9_.")
# Preview the selected topic columns for the first rows of the validate split
validate_df.loc[:, mask].head()

Unnamed: 0,topic9_0,topic9_1,topic9_2,topic9_3,topic9_4,topic9_5,topic9_6,topic9_7,topic9_8
0,0.0,0.052155,0.0,0.0,0.0,0.0,0.0,0.074512,0.867151
1,0.179859,0.0,0.492827,0.0,0.0,0.100088,0.207556,0.015416,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.181438,0.025778,0.78473
3,0.0,0.620412,0.0,0.0,0.0,0.053725,0.087969,0.0,0.229827
4,0.0,0.0,0.0,0.0,0.0,0.068422,0.0,0.926342,0.0


In [129]:
def distances(article, authors_df):
    """
    Get the distance between article and all authors, in the topic space.

    article: pandas.DataFrame with exactly one row (a single article); its
        topic probabilities are the columns whose name matches "topic9_."
    authors_df: pandas.DataFrame data for all authors: an 'author' column,
        with every other column treated as a topic probability (the columns
        are expected to be in the same order as the article's topic columns)
    return: pandas.DataFrame with columns 'author' and 'distance', sorted by
        ascending distance (closest author first); rows keep authors_df's index
    raises: ValueError if article does not contain exactly one row
    """
    if len(article) != 1:
        raise ValueError(f"expected a single article, got {len(article)} rows")

    # topic probabilities of the article, shape (1, n_topics)
    topic_mask = article.columns.str.contains("topic9_.")
    article_topics = article.loc[:, topic_mask].to_numpy()
    # topic probabilities of every author, shape (n_authors, n_topics)
    author_topics = authors_df[authors_df.columns.drop('author')].to_numpy()
    # Euclidean distance (the metric of topic_distance) for all authors at once:
    # the article row broadcasts against every author row, and the norm is
    # taken along the topic axis instead of looping over authors in Python.
    author_distances = np.linalg.norm(author_topics - article_topics, axis=1)

    distances_df = pd.DataFrame({'author': authors_df['author'], 'distance': author_distances})
    # return a new sorted frame rather than sorting in place
    return distances_df.sort_values(by='distance', ascending=True)


def rank_author(article, authors_df):
    """
    Find the best-ranked (closest) of the article's own authors within authors_df.

    The authors of authors_df are ordered by their topic-space distance to the
    article (see distances); the first one that is also an author of the
    article is returned together with its position in that ordering.

    article: pandas.DataFrame data for a single article; its 'authors_parsed'
        column holds the list of the article's author names
    authors_df: pandas.DataFrame data for all authors
    return: tuple (author, rank), where rank is the 0-based position of that
        author in the distance ranking; (None, None) if none of the
        article's authors is present in authors_df
    """
    # Read the authors from the `article` argument, not from a notebook-level
    # variable, so the result does not depend on hidden kernel state.
    article_authors = set(article.authors_parsed.iloc[0])
    distances_df = distances(article, authors_df)
    for rank, dataset_author in enumerate(distances_df.author):
        if dataset_author in article_authors:
            return dataset_author, rank
    # no author of the article appears in authors_df
    return None, None

### Example 1
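Pick a random article from the validate dataset and find the rank of its closest actual author among all authors in the validate dataset.
We expect the rank to be low in the majority of cases (rank 0 means the closest author overall is one of the article's authors).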

In [131]:
# draw one article at random from the validate split
rnd_article = validate_df.sample()
# best-ranked of its own authors among the validate-split authors
closest_author, rank = rank_author(rnd_article, authors_validate_df)

# report the article, its listed authors and the best-ranked author
print(
    f"The original article \"{rnd_article.title.iloc[0]}\" authors are: {rnd_article.authors_parsed.iloc[0]}",
    f"The closest author is: '{closest_author}', with distance rank {rank}.",
    sep="\n",
)

The original article "Bound states in the continuum in subwavelength emitter arrays" authors are: ['Huidobro, Paloma A., ', 'de Paz, Maria Blanco, ']
The closest author is: 'de Paz, Maria Blanco, ', with distance rank 0.


### Example 2
Pick a random article from the validate dataset that has at least one author who is also in the train dataset. Find the rank of its closest actual author among the authors in the train dataset.

In [142]:
# Number of distinct authors in the validate and train datasets, and how many appear in both
authors_validate_set = set(authors_validate_df.author)
authors_train_set = set(authors_train_df.author)
# authors present in both splits
authors_intersection = authors_validate_set & authors_train_set
print(f"The validate dataset has {len(authors_validate_set)} authors, the train dataset has {len(authors_train_set)}, and {len(authors_intersection)} authors are in both datasets.")

The validate dataset has 101576 authors, the train dataset has 157213, and 50935 authors are in both datasets.


In [133]:
# get articles in validate dataset, written by authors who are in both datasets
# (an article qualifies as soon as at least one of its authors is in both datasets)
intersection_ids = [
    article_id
    for article_id, article_authors in zip(validate_df.id, validate_df.authors_parsed)
    if any(author in authors_intersection for author in article_authors)
]

# Series.isin hashes the selected ids once; testing `id in list` for every
# article would rescan the whole id list each time (quadratic in the number of articles)
idx = validate_df.id.isin(intersection_ids).tolist()
intersection_articles_df = validate_df[idx]

print(f"{intersection_articles_df.shape[0]} out of {validate_df.shape[0]} articles in the validate dataset were written by authors present in the validate and the train datasets")
print(f"i.e. {intersection_articles_df.shape[0]/validate_df.shape[0]*100:.2f} %")

19005 out of 22632 articles in the validate dataset were written by authors present in the validate and the train datasets
i.e. 83.97 %


In [145]:
# draw one article at random among those sharing an author with the train split
rnd_article = intersection_articles_df.sample()
# best-ranked of its own authors among the train-split authors
closest_author, rank = rank_author(rnd_article, authors_train_df)

# report the article, its listed authors and the best-ranked author
print(
    f"The original article \"{rnd_article.title.iloc[0]}\" authors are: {rnd_article.authors_parsed.iloc[0]}",
    f"The closest author is: '{closest_author}', with distance rank {rank}.",
    sep="\n",
)

The original article "Pseudo-scalar meson spectral properties in the chiral crossover region
  of QCD" authors are: ['Bala, Dibyendu, ', 'Kaczmarek, Olaf, ', 'Lowdon, Peter, ', 'Philipsen, Owe, ', 'Ueding, Tristan, ']
The closest author is: 'Kaczmarek, Olaf, ', with distance rank 505.
