# Test

In [73]:
import pandas as pd
import pickle
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [74]:
DATA_PATH = '../data'
MODELS_PATH = '../models'

Load the article metadata

In [75]:
def load_df(name):
    # make sure the authors_parsed column contains arrays of str, 1 str per author name
    # e.g. ['Bognár, Zs., ', 'Sódor, Á., ', 'Clark, I. R., ', 'Kawaler, S. D., ']
    return pd.read_csv(
        os.path.join(DATA_PATH, name), 
        index_col=0, 
        converters={"authors_parsed": lambda x:[entry.replace('\'', '').strip("[]") for entry in x.split("', '")]}
    )

train_df = load_df('arxiv_train.csv')
validate_df = load_df('arxiv_validate.csv')
test_df = load_df('arxiv_test.csv')

Load the article topics

In [76]:
with open(os.path.join(DATA_PATH, 'train_topic_distr_BERT_scikit.pickle'), 'rb') as handle:
    train_topic_distr = pickle.load(handle)
with open(os.path.join(DATA_PATH, 'validate_topic_distr_BERT_scikit.pickle'), 'rb') as handle:
    validate_topic_distr = pickle.load(handle)
with open(os.path.join(DATA_PATH, 'test_topic_distr_BERT_scikit.pickle'), 'rb') as handle:
    test_topic_distr = pickle.load(handle)

Merge metadata and topics into a data frame

In [77]:
def merge_df(article_df, topics_np):
    """
    Add topic distributions to article metadata.
    
    article_df: pandas.DataFrame article metadata
    topics_np: numpy.array topics distributions
    return pandas.DataFrame
    """
    topics_df = pd.DataFrame(topics_np)
    topics_df.columns = ["topic_%d"%n for n in range(len(topics_df.columns))]
    merged = article_df.reset_index(drop=True).join(topics_df.reset_index(drop=True))
    return merged

In [78]:
train_df = merge_df(train_df, train_topic_distr)
validate_df = merge_df(validate_df, validate_topic_distr)
test_df = merge_df(test_df, test_topic_distr)

## Article distances

In [81]:
import numpy as np

def topic_distance(article1, article2):
    """
    Compute the distance between 2 articles.
    article1, article2: pandas.Dataframe
    return float
    """
    # numpy array with topics probabilities for the article
    mask1 = article1.columns.str.contains("topic_.")
    topics1 = np.array(article1.loc[:, mask1])
    mask2 = article2.columns.str.contains("topic_.")
    topics2 = np.array(article2.loc[:, mask2])
    
    dist = np.linalg.norm(topics1 - topics2)  # euclidean distance, L2 norm is default
    return dist

### Tests

#### Test1
The distance between same articles should be 0

In [85]:
# pick a random paper
rnd_article = validate_df.sample()
assert topic_distance(rnd_article, rnd_article) == 0, "Distance between same articles should be 0"

#### Test2
The distance between different articles should be a non-zero positive number

In [101]:
while True:
    rnd_article1 = validate_df.sample()
    rnd_article2 = validate_df.sample()
    if rnd_article1.id.values[0] != rnd_article2.id.values[0]: break

assert topic_distance(rnd_article1, rnd_article2) > 0, "Distance between different articles should be greater than 0"

#### Test3
The distance between articles by the same author(s) should be on average smaller than that between rtickles by different authors (that is the essential assumption of the experiment).

In [188]:
# pairs of articles with at least one common author 
same_author = []
# pairs of articles with no author in common 
not_same_author = []

def article_with_common_author(article, df):
    """
    Get a different article with at least one common author in the dataset df.
    article: pandas.DataFrame 
    df: pandas.DataFrame 
    returns: pandas.DataFrame 
    """
    article_found = None
    for author in article.authors_parsed.iloc[0]:
        for i,row in df.iterrows():
            if author in row.authors_parsed:
                article_found = df[df.id == row.id]
                break
    return article_found

In [195]:
rnd_article = validate_df.sample()
rnd_article2 = article_with_common_author(rnd_article, validate_df)


In [196]:
rnd_article.authors_parsed

2793    [López-Gallifa, Á., , Rivilla, V. M., , Beltrá...
Name: authors_parsed, dtype: object

In [197]:
rnd_article2

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,...,topic_274,topic_275,topic_276,topic_277,topic_278,topic_279,topic_280,topic_281,topic_282,topic_283
2793,2403.02191,\'Alvaro L\'opez-Gallifa,"\'A. L\'opez-Gallifa, V. M. Rivilla, M. T. Bel...",The GUAPOS project. V: The chemical ingredient...,Accepted for publication in Monthly Notices of...,,,,astro-ph.GA,http://arxiv.org/licenses/nonexclusive-distrib...,...,0.0,0.012689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
