# Assign topic distributions to authors
* List all authors in the dataset.
* For each author, merge the abstracts of all their articles, apply pre-processing filters and lemmatization.
* Finally, apply the trained LDA topic model to extract the topic distribution from each text and assign this topic distribution to each author. 

In [1]:
import pandas as pd
import pickle
import os
import arxiv_utils
import tok

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/kobv/atroncos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
DATA_PATH = '../data'
MODELS_PATH = '../models'

## Load the article abstracts and metadata for the train/validate/test datasets

In [3]:
def load_df(name):
    return pd.read_csv(
        os.path.join(DATA_PATH, name), 
        index_col=0, 
        converters={"authors_parsed": lambda x:[entry.replace("\'", '').replace('"', '').strip("[]") for entry in x.split('", "')]})

train_df = load_df('arxiv_train.csv')
validate_df = load_df('arxiv_validate.csv')
test_df = load_df('arxiv_test.csv')

## List all authors in the dataset

In [4]:
def flatten(xss):
    """Flatten a list of lists"""
    return [x for xs in xss for x in xs]

def get_unique_authors(df):
    """Given a dataframe, return unique authors"""
    authors = flatten(df['authors_parsed'])
    return set(authors)

def count_authors(df):
    """Given a dataframe, return count of unique authors"""
    return len(get_unique_authors(df))

In [5]:
train_authors = arxiv_utils.get_unique_authors(train_df)
validate_authors = arxiv_utils.get_unique_authors(validate_df)
test_authors = arxiv_utils.get_unique_authors(test_df)
print(f"The train dataset has {len(train_authors)} unique authors, the validate dataset {len(validate_authors)}, the test dataset {len(test_authors)}.")

The train dataset has 157213 unique authors, the validate dataset 101576, the test dataset 99444.


## For each author, merge the abstracts of all their articles

In [6]:
def get_abstracts_by_author(author, metadata_df):
    """
    Get the abstracts of all articles by this author.
    author: string
    metadata_df: pandas.DataFrame dataset
    returns: list<string>
    """
    idx = [author in authors_parsed for authors_parsed in metadata_df['authors_parsed']]
    abstracts = metadata_df[idx]['abstract']
    return list(abstracts)

def merge_abstracts_by_author(author_list, metadata_df):
    """
    Merge the abstracts of all articles by all authors.
    author_list: set<string> set of unique author names
    metadata_df: pandas.DataFrame dataset
    returns: list<string> merged abstracts
    """
    txts = []
    counter = 0
    for author in author_list:
        if counter % 25000 == 0:
            print(f"Processing author {counter}/{len(author_list)}")
        abstracts = get_abstracts_by_author(author, metadata_df)
        txt = ' '.join(abstracts)
        txts.append(txt)
        counter += 1
    return txts

In [7]:
%%time

print("Processing train dataset")
train_authors_txts = merge_abstracts_by_author(train_authors, train_df)
print("Processing validate dataset")
validate_authors_txts = merge_abstracts_by_author(validate_authors, validate_df)
print("Processing test dataset")
test_authors_txts = merge_abstracts_by_author(test_authors, test_df)

Processing train dataset
Processing author 0/157213
Processing author 25000/157213
Processing author 50000/157213
Processing author 75000/157213
Processing author 100000/157213
Processing author 125000/157213
Processing author 150000/157213
Processing validate dataset
Processing author 0/101576
Processing author 25000/101576
Processing author 50000/101576
Processing author 75000/101576
Processing author 100000/101576
Processing test dataset
Processing author 0/99444
Processing author 25000/99444
Processing author 50000/99444
Processing author 75000/99444
CPU times: user 18min 16s, sys: 975 ms, total: 18min 17s
Wall time: 18min 17s


## Apply pre-processing filters and lemmatization to the texts for each author

In [8]:
with open(os.path.join(DATA_PATH, 'dictionary.pickle'), 'rb') as handle:
    dictionary = pickle.load(handle)

In [9]:
%%time

def tokenize_dataset(dictionary, txt_list):
    _texts = tok.clean(txt_list)
    return(tok.make_corpus(dictionary, _texts))

print("Tokenizing train dataset")
train_corpus_authors = tokenize_dataset(dictionary, train_authors_txts)
print("Tokenizing validate dataset")
validate_corpus_authors = tokenize_dataset(dictionary, validate_authors_txts)
print("Tokenizing test dataset")
test_corpus_authors = tokenize_dataset(dictionary, test_authors_txts)

Tokenizing train dataset
Tokenizing validate dataset
Tokenizing test dataset
CPU times: user 2min 45s, sys: 1.25 s, total: 2min 47s
Wall time: 2min 47s


## Apply the trained LDA topic model

Load the topic model
Load the LDA topic model fitted in [03_fit_topic_model](./03_fit_topic_model.ipynb)

In [10]:
with open(os.path.join(MODELS_PATH, 'topic_model9.pickle'), 'rb') as handle:
    topic_model = pickle.load(handle)

Extract the topic distribution from each text and assign this topic distribution to each author.

In [11]:
def get_topic_details(topic_model, corpus):
    """
    Returns a list of pandas Series object of tuples. 
    Each tuple is a topic number and the topic probability for this entry in the corpus.
    Example: 
        [[(0, 0.22764261), (4, 0.14444388), (5, 0.62411755)],
         [(1, 0.024827635), (2, 0.3290665), (3, 0.6061594), (5, 0.03431195)],
         [(0, 0.06239689), (3, 0.03924617), (5, 0.8926314)],
         [(3, 0.09784623), (5, 0.89414346)],...
        ...]
    If for a given entry, the topic's probability is 0, then the topic is not included in the Series for this entry.
    """
    topic_details_list = []
    for row in topic_model[corpus]:
        topic_details_list.append(row)
    return topic_details_list

def get_topic_dataframe(topic_model, corpus):
    """
    Returns a data frame with a column for each topic in the topic model.
    Each row stands for an entry in the corpus, each value for the probability of thos topic for this entry.
    If for a given entry, the topic's probability is 0, the the value in the entry's column corresponding to the topic is also 0.
    Example:
             	0 	1 	2 	3 	4 	5
        0 	0.227641 	0.000000 	0.000000 	0.000000 	0.144445 	0.624118
        1 	0.000000 	0.024817 	0.329062 	0.606161 	0.000000 	0.034325
        2 	0.062392 	0.000000 	0.000000 	0.039281 	0.000000 	0.892601
        3 	0.000000 	0.000000 	0.000000 	0.097728 	0.000000 	0.894262
    """
    topic_details = get_topic_details(topic_model, corpus)
    topics_entries = []  # topics for all entries
    num_topics = len(topic_model.get_topics())  # number of topics in the model
    for row in topic_details:
        topics_entry = [0] * num_topics
        for entry in row:  # all topic probabilities for this entry
            topic_num = entry[0]  # the topic number
            topic_prob = entry[1]  # the topic probability            
            topics_entry[topic_num] = topic_prob
        topics_entries.append(topics_entry)
    return pd.DataFrame(topics_entries, columns=range(0, num_topics))

In [12]:
%%time

train_topics_authors = get_topic_dataframe(topic_model, train_corpus_authors)
validate_topics_authors = get_topic_dataframe(topic_model, validate_corpus_authors)
test_topics_authors = get_topic_dataframe(topic_model, test_corpus_authors)

CPU times: user 1min 14s, sys: 1min 19s, total: 2min 34s
Wall time: 59.6 s


## Add author names and save

In [13]:
train_topics_authors["author"] = list(train_authors)
validate_topics_authors["author"] = list(validate_authors)
test_topics_authors["author"] = list(test_authors)

In [14]:
train_topics_authors.to_csv(os.path.join(DATA_PATH, 'train_topics9_authors.csv'))
validate_topics_authors.to_csv(os.path.join(DATA_PATH, 'validate_topics9_authors.csv'))
test_topics_authors.to_csv(os.path.join(DATA_PATH, 'test_topics9_authors.csv'))

# Assign topic distributions to articles

In [15]:
# Load the tokenized abstracts
def load_tokenized_dataset(file_name):
    path = os.path.join(DATA_PATH, file_name)
    with open(path, 'rb') as handle:
        tokenized_dataset = pickle.load(handle)
    return tokenized_dataset

corpus_test = load_tokenized_dataset("corpus_test.pickle")
corpus_validate = load_tokenized_dataset("corpus_validate.pickle")
corpus_train = load_tokenized_dataset("corpus_train.pickle")

In [16]:
# Use the model to assign topics probabilities to all articles in test data set
topics_test_df = get_topic_dataframe(topic_model, corpus_test)
# Use the model to assign topics probabilities to all articles in validate data set
topics_validate_df = get_topic_dataframe(topic_model, corpus_validate)
# Use the model to assign topics probabilities to all articles in train data set
topics_train_df = get_topic_dataframe(topic_model, corpus_train)

## Load the article metadata and merge with the topics into one data frame

In [17]:
def load_df(name):
    return pd.read_csv(
        os.path.join(DATA_PATH, name), 
        index_col=0, 
        converters={"authors_parsed": lambda x:[entry.replace("\'", '').replace('"', '').strip("[]") for entry in x.split('", "')]})

validate_df = load_df('arxiv_validate.csv')
test_df = load_df('arxiv_test.csv')
train_df = load_df('arxiv_train.csv')

In [20]:
num_topics = len(topic_model.get_topics())

for l in [topics_test_df, topics_validate_df, topics_train_df]:
        l.columns = ["topic9_%d"%n for n in range(num_topics)]

In [21]:
def merge_df(article_df, topics_df):
    merged = article_df.reset_index(drop=True).join(topics_df.reset_index(drop=True))
    return merged

test_df = merge_df(test_df, topics_test_df)
validate_df = merge_df(validate_df, topics_validate_df)
train_df = merge_df(train_df, topics_train_df)

In [22]:
train_df.to_csv(os.path.join(DATA_PATH, 'train_topics9.csv'))
validate_df.to_csv(os.path.join(DATA_PATH, 'validate_topics9.csv'))
test_df.to_csv(os.path.join(DATA_PATH, 'test_topics9.csv'))