# Test
* Load the tokenized abstracts of the test dataset.
* Extract the topics of each article applying the topic model, load the article metadata and merge with the topics into one data frame. 
* Find the authors closest to the articles in the topic space.
* Compute the probability that the closest author is one of the authors of the article.

In [28]:
import pandas as pd
import pickle
import os

In [29]:
DATA_PATH = '../data'
MODELS_PATH = '../models'

## Load the tokenized abstracts of the test dataset

In [30]:
# Load the dictionary
with open(os.path.join(DATA_PATH, 'dictionary.pickle'), 'rb') as handle:
    dictionary = tokenized_dataset = pickle.load(handle)

In [31]:
# Load the tokenized abstracts
def load_tokenized_dataset(file_name):
    path = os.path.join(DATA_PATH, file_name)
    with open(path, 'rb') as handle:
        tokenized_dataset = pickle.load(handle)
    return tokenized_dataset

corpus_test = load_tokenized_dataset("corpus_test.pickle")
corpus_validate = load_tokenized_dataset("corpus_validate.pickle")
corpus_train = load_tokenized_dataset("corpus_train.pickle")

## Extract the topics of each article applying the topic model.

In [32]:
# Load the topic model
with open(os.path.join(MODELS_PATH, 'topic_model.pickle'), 'rb') as handle:
    topic_model = pickle.load(handle)

In [33]:
def get_topic_details(topic_model, corpus):
    """
    Returns a list of pandas Series object of tuples. 
    Each tuple is a topic number and the topic probability for this entry in the corpus.
    Example: 
        [[(0, 0.22764261), (4, 0.14444388), (5, 0.62411755)],
         [(1, 0.024827635), (2, 0.3290665), (3, 0.6061594), (5, 0.03431195)],
         [(0, 0.06239689), (3, 0.03924617), (5, 0.8926314)],
         [(3, 0.09784623), (5, 0.89414346)],...
        ...]
    If for a given entry, the topic's probability is 0, then the topic is not included in the Series for this entry.
    """
    topic_details_list = []
    for row in topic_model[corpus]:
        topic_details_list.append(row)
    return topic_details_list

def get_topic_dataframe(topic_model, corpus):
    """
    Returns a data frame with a column for each topic in the topic model.
    Each row stands for an entry in the corpus, each value for the probability of thos topic for this entry.
    If for a given entry, the topic's probability is 0, the the value in the entry's column corresponding to the topic is also 0.
    Example:
             	0 	1 	2 	3 	4 	5
        0 	0.227641 	0.000000 	0.000000 	0.000000 	0.144445 	0.624118
        1 	0.000000 	0.024817 	0.329062 	0.606161 	0.000000 	0.034325
        2 	0.062392 	0.000000 	0.000000 	0.039281 	0.000000 	0.892601
        3 	0.000000 	0.000000 	0.000000 	0.097728 	0.000000 	0.894262
    """
    topic_details = get_topic_details(topic_model, corpus)
    topics_entries = []  # topics for all entries
    num_topics = len(topic_model.get_topics())  # number of topics in the model
    for row in topic_details:
        topics_entry = [0] * num_topics
        for entry in row:  # all topic probabilities for this entry
            topic_num = entry[0]  # the topic number
            topic_prob = entry[1]  # the topic probability            
            topics_entry[topic_num] = topic_prob
        topics_entries.append(topics_entry)
    return pd.DataFrame(topics_entries, columns=range(0, num_topics))

In [34]:
# Use the model to assign topics probabilities to all articles in test data set
topics_test_df = get_topic_dataframe(topic_model, corpus_test)
# Use the model to assign topics probabilities to all articles in validate data set
topics_validate_df = get_topic_dataframe(topic_model, corpus_validate)
# Use the model to assign topics probabilities to all articles in train data set
topics_train_df = get_topic_dataframe(topic_model, corpus_train)

## Load the article metadata and merge with the topics into one data frame

* load the article metadata
* Add names to the topic columns, i.e. topic_0, ...
* merge

In [42]:
def load_df(name):
    return pd.read_csv(
        os.path.join(DATA_PATH, name), 
        index_col=0, 
        converters={"authors_parsed": lambda x:[entry.replace("\'", '').replace('"', '').strip("[]") for entry in x.split('", "')]})

validate_df = load_df('arxiv_validate.csv')
test_df = load_df('arxiv_test.csv')
train_df = load_df('arxiv_train.csv')

In [43]:
num_topics = len(topic_model.get_topics())

for l in [topics_test_df, topics_validate_df, topics_train_df]:
        l.columns = ["topic_%d"%n for n in range(num_topics)]

In [44]:
def merge_df(article_df, topics_df):
    merged = article_df.reset_index(drop=True).join(topics_df.reset_index(drop=True))
    return merged

test_df = merge_df(test_df, topics_test_df)
validate_df = merge_df(validate_df, topics_validate_df)
train_df = merge_df(train_df, topics_train_df)

## Find the authors closest to the articles in the topic space

In [59]:
# load the topics by author
authors_validate_df = pd.read_csv(os.path.join(DATA_PATH, 'validate_topics_authors.csv'), index_col=0)
authors_test_df = pd.read_csv(os.path.join(DATA_PATH, 'test_topics_authors.csv'), index_col=0)
authors_train_df = pd.read_csv(os.path.join(DATA_PATH, 'train_topics_authors.csv'), index_col=0)

In [39]:
import numpy as np

def topic_distance(topics1, topics2):
    """
    topic1, topic2: numpy.ndarray, representing the probability that an article is about a topic. An article can have multiple topics.
    Example:
        0    0.000000
        1    0.000000
        2    0.000000
        3    0.000000
        4    0.000000
        5    0.992668
    topic1 and topic2 must be the same length.
    Topics were assigned to articles in 03_assign_topics
    """
    dist = np.linalg.norm(topics1 - topics2)  # euclidean distance, L2 norm is default
    return dist

### Example 1
Pick a random article from the validate data set, find the nearest author in validate dataset.
Expect to guess one of the authors in the majority of cases. 

In [81]:
def guess_author(article, authors_df):
    """
    Guess the author of article within authors_df.
    
    article: pandas.DataFrame data for a single article
    authors_df: pandas.DataFrame data for all authors
    """
    # numpy array with topics probabilities for the article
    topics1 = np.array(article[["topic_%d"%n for n in range(num_topics)]])
    # numpy array of topics of all authors
    topics_authors_np = authors_df[authors_df.columns.drop('author')].to_numpy() 
    # compute distances between article topics and topics of all authors
    distances = [topic_distance(topics1, topics2) for topics2 in topics_authors_np]
    # find the closest author
    closest_author = authors_df.iloc[distances.index(min(distances))].author
    return closest_author

In [85]:
# pick a random paper
rnd_article = validate_df.sample()
closest_author = guess_author(rnd_article, authors_validate_df)

# check
print(f"The original article \"{rnd_article.title.iloc[0]}\" authors are: {rnd_article.authors_parsed.iloc[0]}")
print(f"The closest author is: {closest_author}")

The original article "Revealing the spatial nature of sublattice symmetry" authors are: ['Xiao, Rong, ', 'Zhao, Y. X., ']
The closest author is: Xiao, Rong, 


In [134]:
%%time

correct = 0
counter = 0
for _, article in validate_df.iterrows():
    if counter % 1000 == 0: print(f"Processing article {counter+1} / {validate_df.shape[0]}")
    closest_author = guess_author(article, authors_validate_df)
    if closest_author in article.authors_parsed:
        correct += 1
    counter += 1
print(f"Guessed the authors of {correct}/{validate_df.shape[0]} articles correctly, i.e. {correct/validate_df.shape[0]*100:.2f} %")

Processing article 1 / 22632
Processing article 1001 / 22632
Processing article 2001 / 22632
Processing article 3001 / 22632
Processing article 4001 / 22632
Processing article 5001 / 22632
Processing article 6001 / 22632
Processing article 7001 / 22632
Processing article 8001 / 22632
Processing article 9001 / 22632
Processing article 10001 / 22632
Processing article 11001 / 22632
Processing article 12001 / 22632
Processing article 13001 / 22632
Processing article 14001 / 22632
Processing article 15001 / 22632
Processing article 16001 / 22632
Processing article 17001 / 22632
Processing article 18001 / 22632
Processing article 19001 / 22632
Processing article 20001 / 22632
Processing article 21001 / 22632
Processing article 22001 / 22632
Guessed the authors of 18387/22632 articles correctly, i.e. 81.24 %
CPU times: user 1h 9min 39s, sys: 16.4 s, total: 1h 9min 55s
Wall time: 1h 9min 55s


### Example 2
Pick a random article from the validate data set, by an author who is also in the train dataset. Find the nearest author in train dataset.

In [88]:
# Percentage of authors in validate dataset, that are also present in train dataset
authors_validate_set = set(authors_validate_df.author)
authors_train_set = set(authors_train_df.author)
authors_intersection = authors_validate_set.intersection(authors_train_set)
print(f"The validate dataset has {len(authors_validate_set)}, the train dataset has {len(authors_train_set)}, and {len(authors_intersection)} are in both datasets.")

The validate dataset has 101576, the train dataset has 157213, and 50935 are in both datasets.


In [94]:
# get articles in validate dataset, written by authors who are in both datasets
intersection_ids = []
for i, row in validate_df.iterrows():
    for author in row.authors_parsed:
        # if at least one author is in both datasets, keep the article id
        if author in authors_intersection:
            intersection_ids.append(row.id)
            break

idx = [article_id in intersection_ids for article_id in validate_df.id]
intersection_articles_df = validate_df[idx]

print(f"{intersection_articles_df.shape[0]} out of {validate_df.shape[0]} articles in the validate dataset were written by authors present in the validate and the train datasets")
print(f"i.e. {intersection_articles_df.shape[0]/validate_df.shape[0]*100:.2f} %")

19018 out of 22632 articles in the validate dataset were written by authors present in the validate and the train datasets
i.e. 84.03 %


In [151]:
# pick a random paper in validate dataset, written by authors who are in both datasets
rnd_article = intersection_articles_df.sample()
# Find the nearest author in train dataset.
closest_author = guess_author(rnd_article, authors_train_df)

# check
print(f"The original article \"{rnd_article.title.iloc[0]}\" authors are: {rnd_article.authors_parsed.iloc[0]}")
print(f"The closest author is: {closest_author}")

The original article "Junctions, Edge Modes, and $G_2$-Holonomy Orbifolds" authors are: ['Acharya, Bobby Samir, ', 'Del Zotto, Michele, ', 'Heckman, Jonathan J., ', 'Hubner, Max, ', 'Torres, Ethan, ']
The closest author is: Jardim, I. C., 


In [153]:
%%time

correct = 0
counter = 0
for _, article in intersection_articles_df.iterrows():
    if counter % 1000 == 0: print(f"Processing article {counter+1} / {intersection_articles_df.shape[0]}")
    closest_author = guess_author(article, authors_train_df)
    if closest_author in article.authors_parsed:
        correct += 1
    counter += 1
print(f"Guessed the authors of {correct}/{validate_df.shape[0]} articles correctly, i.e. {correct/validate_df.shape[0]*100:.2f} %")

Processing article 1 / 19018
Processing article 1001 / 19018
Processing article 2001 / 19018
Processing article 3001 / 19018
Processing article 4001 / 19018
Processing article 5001 / 19018
Processing article 6001 / 19018
Processing article 7001 / 19018
Processing article 8001 / 19018
Processing article 9001 / 19018
Processing article 10001 / 19018
Processing article 11001 / 19018
Processing article 12001 / 19018
Processing article 13001 / 19018
Processing article 14001 / 19018
Processing article 15001 / 19018
Processing article 16001 / 19018
Processing article 17001 / 19018
Processing article 18001 / 19018
Processing article 19001 / 19018
Guessed the authors of 10/22632 articles correctly, i.e. 0.04 %
CPU times: user 1h 32min, sys: 16.3 s, total: 1h 32min 17s
Wall time: 1h 32min 17s


## Test

In [141]:
# Topics for the authors in the train dataset
# the topic_authors dataframes have an extra column for author names, so use the 5 first columns only
topics_authors_np = train_topics_authors.iloc[:, 0:5].to_numpy()  

def guess_author(article_topics):
    # numpy array with topics probabilities for the article
    topics1 = np.array(article)
    # compute distance from article topics to all authors topics in the train dataset
    distances = [topic_distance(topics1, topics2) for topics2 in topics_authors_np]
    # find closest author
    closest_author = train_topics_authors.iloc[distances.index(min(distances))]
    return closest_author

def guess_authors(article_topics_df, article_df):
    closest_authors = []
    for i, article in article_topics_df.iterrows():
        if i % 1000 == 0:
            print(f"Processing article {i+1}/{article_topics_df.shape[0]}")
        closest_authors.append(guess_author(article))
    return closest_authors

def check_guess(article_df, closest_authors):
    for i, article in article_df.iterrows():
        # check if correct
        check.append(closest_author[i].author in article.authors_parsed)

In [149]:
%%time

guess_author(topics_train_df.iloc[0])

#guesses = guess_authors(topics_train_df, train_df)
#check = []


CPU times: user 236 ms, sys: 23.8 ms, total: 260 ms
Wall time: 245 ms


0                  0.119213
1                       0.0
2                       0.0
3                       0.0
4                  0.870843
author    Snegirev, A. V., 
Name: 46206, dtype: object