# Examine data

In [1]:
import pandas as pd
import pickle
import os
from sklearn.dummy import DummyClassifier

In [2]:
# Relative path of the directory holding the dataset CSV files; file names are joined onto it when loading.
DATA_PATH = '../data'
# Relative path of the models directory. NOTE(review): presumably where fitted models are stored; not used in the cells shown here, confirm.
MODELS_PATH = '../models'

## Load the article metadata
Load the article abstracts and metadata for the train/validate/test datasets

In [69]:
def load_df(name):
    """Read the CSV file `name` from DATA_PATH into a DataFrame.

    The first CSV column is used as the index. The raw text of the
    "authors_parsed" column is converted into a list of strings: the text is
    split on "], " and leading/trailing square brackets are stripped from
    every piece.
    """
    def split_author_entries(raw):
        # One piece per "], "-separated entry, without the surrounding brackets
        pieces = raw.split("], ")
        return [piece.strip('[]') for piece in pieces]

    csv_path = os.path.join(DATA_PATH, name)
    return pd.read_csv(csv_path, index_col=0, converters={"authors_parsed": split_author_entries})

# Load the three dataset splits, in order, with the shared loader
train_df, validate_df, test_df = map(
    load_df,
    ('arxiv_train.csv', 'arxiv_validate.csv', 'arxiv_test.csv'),
)

In [70]:
def flatten(xss):
    """Concatenate the items of every inner iterable of `xss` into one flat list.

    Order is preserved: all items of the first inner iterable come first,
    then all items of the second, and so on.
    """
    flat = []
    for inner in xss:
        flat.extend(inner)
    return flat

def get_unique_authors(df):
    """Return the set of distinct entries found in the "authors_parsed" column of `df`.

    Each cell of the column is iterated and its items are pooled across all
    rows before de-duplication.
    """
    return set(flatten(df['authors_parsed']))

def count_authors(df):
    """Return how many distinct authors appear in `df`."""
    unique_authors = get_unique_authors(df)
    return len(unique_authors)

# Articles are counted per split and summed; authors are de-duplicated over all splits combined
total_article_count = sum(len(split_df) for split_df in (train_df, validate_df, test_df))
total_author_count = len(get_unique_authors(pd.concat([train_df, validate_df, test_df], axis=0)))
print(f"The dataset contains metadata for {total_article_count} articles, written by {total_author_count} authors.")

The dataset contains metadata for 70000 articles, written by 207521 authors.


## Assumptions
### 1. Authors can be identified by their name.

In [71]:
# Top 10 values of the "submitter" column, ranked by how many rows of the train dataset they appear in
train_df['submitter'].value_counts().head(10)

submitter
The ATLAS Collaboration        64
The CMS Collaboration          30
ALICE publications             16
Xin Wang                       14
Wen-Jie Liu                    14
Pierre-Fran\c{c}ois Loos Dr    13
Noam Soker                     10
Mouhssine Koussour             10
Taichi Kato                    10
Yongqiang Wang                  9
Name: count, dtype: int64

* Among the submitters with the most papers, the 3 most prolific are collaborations that do not name individuals.
* Examining the article metadata, it appears that the remaining authors in the list are highly prolific individuals, e.g. [Pierre-Francois Loos](https://scholar.google.com.tr/citations?user=siH_NhoAAAAJ&hl=en) (CNRS senior researcher).

In [72]:
# Top 10 values of the "authors" column, ranked by how many rows of the train dataset they appear in
train_df['authors'].value_counts().head(10)

authors
ATLAS Collaboration                          60
CMS Collaboration                            29
ALICE Collaboration                          25
Athul Pradeepkumar Girija                     8
The Event Horizon Telescope Collaboration     7
The STAR Collaboration                        7
Shinichi Saito                                7
Paul C Bressloff                              6
Noam Soker (Technion, Israel)                 6
Pierre Naz\'e                                 6
Name: count, dtype: int64

In [38]:
# one very prolific submitter:
#pd.set_option('display.max_colwidth', None) 
#train_df[train_df.submitter == 'Pierre-Fran\c{c}ois Loos Dr'][['authors_parsed', 'title', 'year']]

### 2. Guessing the author of a paper is only possible if the author has already written at least one other paper
This seems obvious: a model can only guess an author whose name already appears in the data it has seen. So, looking at the validation dataset, how many of its authors are also present in the training dataset?
The datasets are large enough that looking at the data should give a decent approximation of how many authors could be guessed by an ideal model.  

In [131]:
# Set of unique authors in the validate dataset
val_unique_authors = get_unique_authors(validate_df)
print(f"There are {len(val_unique_authors)} unique authors in the validate dataset.")
# Set of unique authors in the train dataset
train_unique_authors = get_unique_authors(train_df)
print(f"There are {len(train_unique_authors)} unique authors in the train dataset.")
# Authors that appear in both datasets, and their share of the validate authors
intersection = val_unique_authors & train_unique_authors
intersection_perc = len(intersection) / len(val_unique_authors) * 100
# Count the validate papers that have at least one author from the intersection.
# A per-paper set-disjointness check only looks at that paper's own authors,
# instead of scanning the whole intersection for every paper.
papers_intersection_count = sum(
    not intersection.isdisjoint(paper_authors)
    for paper_authors in validate_df['authors_parsed']
)
papers_intersection_perc = papers_intersection_count / validate_df.shape[0] * 100
print(f"{len(intersection)} authors are present in both datasets.")
print(
    f"Authors present in both datasets wrote {papers_intersection_count} out of {validate_df.shape[0]} papers in the validate dataset, "
    f"so {papers_intersection_perc:.0f}% is approximately the best possible success rate for guessing the author of a paper in this dataset."
)

There are 81435 unique authors in the validate dataset.
There are 132976 unique authors in the train dataset.
40900 authors are present in both datasets.
Authors present in both datasets wrote 14130 out of 17500 papers in the validate dataset, so 81% is approximately the best possible success rate for guessing the author of a paper in this dataset.
