# Test
* Find the authors closest to the articles in the topic space.
* Compute the probability that the closest author is one of the authors of the article.

In [25]:
import pandas as pd
import pickle
import os

In [26]:
DATA_PATH = '../data'
MODELS_PATH = '../models'

## load the article metadata

In [60]:
def load_df(name):
    """
    Read one article CSV from DATA_PATH into a DataFrame.

    name: file name of the CSV, relative to DATA_PATH.

    The first CSV column becomes the index. The "authors_parsed" column is
    stored as the text of a list of quoted names; it is converted back into a
    Python list of strings while reading.
    """
    def _parse_authors(raw):
        # split on the separator between quoted entries, then remove the
        # remaining quote characters and the enclosing square brackets
        names = []
        for chunk in raw.split("', '"):
            names.append(chunk.replace("'", '').strip("[]"))
        return names

    csv_path = os.path.join(DATA_PATH, name)
    return pd.read_csv(csv_path, index_col=0, converters={"authors_parsed": _parse_authors})

# Load the three dataset splits, in the order validate, test, train.
validate_df, test_df, train_df = [
    load_df(csv_name)
    for csv_name in ('validate_topics9.csv', 'test_topics9.csv', 'train_topics9.csv')
]

## Find the authors closest to the articles in the topic space

In [28]:
# Per-author topic tables, one CSV per dataset split (validate, test, train).
# The first CSV column is used as the index.
authors_validate_df, authors_test_df, authors_train_df = [
    pd.read_csv(os.path.join(DATA_PATH, csv_name), index_col=0)
    for csv_name in (
        'validate_topics9_authors.csv',
        'test_topics9_authors.csv',
        'train_topics9_authors.csv',
    )
]

In [29]:
import numpy as np

def topic_distance(topics1, topics2):
    """
    Euclidean distance between two topic-probability vectors.

    topics1, topics2: numpy.ndarray holding, per topic, the probability that
    an article (or author) is about that topic. Several topics can be non-zero.
    Example:
        0    0.000000
        1    0.000000
        2    0.000000
        3    0.000000
        4    0.000000
        5    0.992668
    Both arrays must have the same number of topics.

    Returns the L2 norm of the element-wise difference.
    """
    delta = topics1 - topics2
    # np.linalg.norm defaults to the L2 (Euclidean) norm
    return np.linalg.norm(delta)

### Load the topic model
Load the LDA topic model fitted in [03_fit_topic_model](./03_fit_topic_model.ipynb).

In [30]:
# Unpickle the fitted topic model.
# pickle.load can execute arbitrary code, so only load model files you produced yourself.
topic_model_path = os.path.join(MODELS_PATH, 'topic_model9.pickle')
with open(topic_model_path, 'rb') as model_file:
    topic_model = pickle.load(model_file)

# number of topics in the model
num_topics = len(topic_model.get_topics())

### Example 1
Pick a random article from the validate dataset and find the nearest author in the validate dataset.
We expect the nearest author to be one of the article's authors in the majority of cases.

In [35]:
def guess_author(article, authors_df, topics_authors_np, n_topics=None):
    """
    Guess the author of article within authors_df.

    article: pandas.DataFrame with a single row (or a pandas.Series) holding
        the article's topic probabilities in columns "topic9_0", "topic9_1", ...
    authors_df: pandas.DataFrame with one row per author and an "author" column
    topics_authors_np: numpy.ndarray of shape (number of authors, number of topics),
        with rows in the same order as authors_df
    n_topics: number of topic columns to read from article; defaults to the
        module-level num_topics taken from the loaded topic model

    Returns the "author" value of the row whose topics have the smallest
    Euclidean distance to the article's topics. On ties the first author wins.
    Raises ValueError when topics_authors_np has no rows.
    """
    if n_topics is None:
        n_topics = num_topics
    topic_columns = ["topic9_%d" % n for n in range(n_topics)]
    # flatten so that a one-row DataFrame and a Series give the same 1-D vector
    article_topics = np.asarray(article[topic_columns], dtype=float).reshape(-1)
    author_topics = np.asarray(topics_authors_np, dtype=float)
    # Euclidean distance from the article to every author in one vectorised call,
    # instead of one Python-level norm computation per author
    distances = np.linalg.norm(author_topics - article_topics, axis=1)
    # argmin returns the first minimum, like list.index(min(...)) did
    closest_author = authors_df.iloc[int(np.argmin(distances))].author
    return closest_author

In [61]:
# Draw one article at random from the validate split.
rnd_article = validate_df.sample()
# Author topic matrix: every column of the author table except the author name.
validate_topic_cols = authors_validate_df.columns.drop('author')
topics_authors_validate_np = authors_validate_df[validate_topic_cols].to_numpy()
closest_author = guess_author(rnd_article, authors_validate_df, topics_authors_validate_np)

# Show the article's real author list next to the guess.
article_title = rnd_article.title.iloc[0]
article_authors = rnd_article.authors_parsed.iloc[0]
print(f"The original article \"{article_title}\" authors are: {article_authors}")
print(f"The closest author is: {closest_author}")

The original article "Electrical Tuning of Neutral and Charged Excitons with 1-nm Gate" authors are: ['Almutlaq, Jawaher, ', 'Wang, Jiangtao, ', 'Li, Linsen, ', 'Li, Chao, ', 'Dang, Tong, ', 'Bulović, Vladimir, ', 'Kong, Jing, ', 'Englund, Dirk, ']
The closest author is: Bulović, Vladimir, 


In [65]:
# True when the guessed author is one of the sampled article's listed authors.
closest_author in rnd_article.authors_parsed.iloc[0]

True

In [74]:
%%time

MAX_SAMPLES = 100
# Author topic matrix (all columns but the author name), built once before the loop.
topics_authors_validate_np = authors_validate_df.drop(columns='author').to_numpy()
correct = 0
for i in range(MAX_SAMPLES):
    # every draw is independent, so the same article can be sampled more than once
    rnd_article = validate_df.sample()
    closest_author = guess_author(rnd_article, authors_validate_df, topics_authors_validate_np)
    # a guess counts as correct when it is any one of the article's authors
    correct += int(closest_author in rnd_article.authors_parsed.iloc[0])
print(f"Guessed the authors of {correct}/{MAX_SAMPLES} articles correctly, i.e. {correct/MAX_SAMPLES*100:.2f} %")

Guessed the authors of 85/100 articles correctly, i.e. 85.00 %
CPU times: user 17.1 s, sys: 0 ns, total: 17.1 s
Wall time: 17.1 s


### Example 2
Pick a random article from the validate dataset written by at least one author who is also in the train dataset, then find the nearest author in the train dataset.

In [70]:
# Count the author names that appear in both the validate and the train author tables
authors_validate_set = set(authors_validate_df['author'])
authors_train_set = set(authors_train_df['author'])
authors_intersection = authors_validate_set & authors_train_set
print(f"The validate dataset has {len(authors_validate_set)}, the train dataset has {len(authors_train_set)}, and {len(authors_intersection)} are in both datasets.")

The validate dataset has 101576, the train dataset has 157213, and 50935 are in both datasets.


In [71]:
# get articles in validate dataset, written by authors who are in both datasets
# an article qualifies when at least one of its authors is in both datasets
has_shared_author = validate_df.authors_parsed.apply(
    lambda authors: any(author in authors_intersection for author in authors)
).astype(bool)
intersection_ids = validate_df.loc[has_shared_author, 'id'].tolist()

# isin() looks the ids up by hash; testing membership in the plain list for
# every article scanned the whole list each time
idx = validate_df.id.isin(intersection_ids)
intersection_articles_df = validate_df[idx]

print(f"{intersection_articles_df.shape[0]} out of {validate_df.shape[0]} articles in the validate dataset were written by authors present in the validate and the train datasets")
print(f"i.e. {intersection_articles_df.shape[0]/validate_df.shape[0]*100:.2f} %")

19005 out of 22632 articles in the validate dataset were written by authors present in the validate and the train datasets
i.e. 83.97 %


In [101]:
# Draw one validate article that has at least one author also present in train.
rnd_article = intersection_articles_df.sample()
# Train author topic matrix: every column of the author table except the author name.
train_topic_cols = authors_train_df.columns.drop('author')
topics_authors_train_np = authors_train_df[train_topic_cols].to_numpy()
closest_author = guess_author(rnd_article, authors_train_df, topics_authors_train_np)

# Show the article's real author list next to the guess.
article_title = rnd_article.title.iloc[0]
article_authors = rnd_article.authors_parsed.iloc[0]
print(f"The original article \"{article_title}\" authors are: {article_authors}")
print(f"The closest author is: {closest_author}")

The original article "Iterative Methods for an Inverse Eddy Current Problem with Total
  Variation Regularization" authors are: ['Chen, Junqing, ', 'Long, Zehao, ']
The closest author is: Kobayashi, Miki, 


In [153]:
%%time

MAX_SAMPLES = 100
correct = 0
# Example 2 measures how often the nearest author in the TRAIN dataset is one of
# the authors of a validate article that shares at least one author with train.
# The previous version of this cell sampled validate_df and searched the validate
# authors, which only repeated Example 1.
# The author topic matrix does not change between iterations, so build it once.
topics_authors_train_np = authors_train_df[authors_train_df.columns.drop('author')].to_numpy()
for i in range(MAX_SAMPLES):
    rnd_article = intersection_articles_df.sample()
    closest_author = guess_author(rnd_article, authors_train_df, topics_authors_train_np)
    if closest_author in rnd_article.authors_parsed.iloc[0]:
        correct += 1
print(f"Guessed the authors of {correct}/{MAX_SAMPLES} articles correctly, i.e. {correct/MAX_SAMPLES*100:.2f} %")

Processing article 1 / 19018
Processing article 1001 / 19018
Processing article 2001 / 19018
Processing article 3001 / 19018
Processing article 4001 / 19018
Processing article 5001 / 19018
Processing article 6001 / 19018
Processing article 7001 / 19018
Processing article 8001 / 19018
Processing article 9001 / 19018
Processing article 10001 / 19018
Processing article 11001 / 19018
Processing article 12001 / 19018
Processing article 13001 / 19018
Processing article 14001 / 19018
Processing article 15001 / 19018
Processing article 16001 / 19018
Processing article 17001 / 19018
Processing article 18001 / 19018
Processing article 19001 / 19018
Guessed the authors of 10/22632 articles correctly, i.e. 0.04 %
CPU times: user 1h 32min, sys: 16.3 s, total: 1h 32min 17s
Wall time: 1h 32min 17s


## Test

In [141]:
# Topics for the authors in the train dataset
# the topic_authors dataframes have an extra column for author names; drop it by name
# rather than slicing a fixed number of columns, so any number of topics works
# NOTE(review): train_topics_authors is not defined in any visible cell — confirm where it is loaded.
topics_authors_np = train_topics_authors.drop(columns='author').to_numpy()

def guess_author(article_topics):
    """
    Return the row of train_topics_authors closest to article_topics.

    article_topics: topic probabilities of one article (array-like, e.g. a
        pandas.Series), with one value per column of topics_authors_np

    Reads the module-level topics_authors_np (author topic matrix) and
    train_topics_authors (author table, rows in the same order).
    Returns the whole author row (topics and author name), not just the name.
    On ties the first author wins.

    NOTE: this definition replaces the three-argument guess_author defined
    earlier in the notebook; cells written for that version fail once this
    cell has been run.
    """
    # numpy array with topics probabilities for the article
    # (the parameter is used here; the body previously read an undefined name `article`)
    topics1 = np.asarray(article_topics, dtype=float)
    # Euclidean distance from the article topics to all authors topics in the train dataset
    distances = np.linalg.norm(topics_authors_np - topics1, axis=1)
    # find closest author
    closest_author = train_topics_authors.iloc[int(np.argmin(distances))]
    return closest_author

def guess_authors(article_topics_df, article_df):
    """
    Guess the closest author for every article in article_topics_df.

    article_topics_df: pandas.DataFrame with one row of topic probabilities per article
    article_df: not used by this function; kept so existing calls keep working

    Prints a progress line for the first article and every 1000th article after it.
    Returns a list with one guess_author result per row, in row order.
    """
    closest_authors = []
    total = article_topics_df.shape[0]
    # count rows by position: the index labels need not be 0..n-1, or even integers
    for position, (_, article) in enumerate(article_topics_df.iterrows()):
        if position % 1000 == 0:
            print(f"Processing article {position+1}/{total}")
        closest_authors.append(guess_author(article))
    return closest_authors

def check_guess(article_df, closest_authors):
    """
    Check, article by article, whether the guessed author is a real author.

    article_df: pandas.DataFrame with an "authors_parsed" column holding the
        list of author names of each article
    closest_authors: one guess per row of article_df, in row order; each guess
        must expose the guessed name as `.author`

    Returns a list of booleans, True where the guess is one of the article's authors.
    Raises ValueError when the number of guesses differs from the number of articles.
    """
    if len(closest_authors) != article_df.shape[0]:
        raise ValueError(
            f"Got {len(closest_authors)} guesses for {article_df.shape[0]} articles"
        )
    # pair rows and guesses by position; the index labels of article_df need not
    # be valid positions in the closest_authors list
    return [
        guess.author in article.authors_parsed
        for (_, article), guess in zip(article_df.iterrows(), closest_authors)
    ]

In [149]:
%%time

# Smoke test: guess the closest author for the first row of topics_train_df.
# NOTE(review): topics_train_df is not defined in any visible cell — confirm where it is loaded.
guess_author(topics_train_df.iloc[0])

# Full run over all rows, currently disabled:
#guesses = guess_authors(topics_train_df, train_df)
#check = []


CPU times: user 236 ms, sys: 23.8 ms, total: 260 ms
Wall time: 245 ms


0                  0.119213
1                       0.0
2                       0.0
3                       0.0
4                  0.870843
author    Snegirev, A. V., 
Name: 46206, dtype: object