# Filter, tokenize and split the data

__Filter__
* Load the data prepared in ../00_process_snapshot.ipynb
* Filter by year and subject, count the number of authors

__Split__
* Split data into train/validate/test

__Tokenize__
* Apply pre-processing filters
* Apply lemmatization
* Save tokenized corpus, one for each train/validate/test dataset on NLP

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tok
import zipfile as zf
import pickle
import os

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/kobv/atroncos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Directory (relative to this notebook) that holds the input CSV and receives
# every file written below: the CSV splits, the dictionary and the corpora.
DATA_PATH = '../data'

## Filter
Load the data prepared in ../00_process_snapshot.ipynb

In [3]:
def _split_author_entries(raw):
    """Turn the text of one ``authors_parsed`` cell into a list of strings.

    The cell text is cut at every "], " separator and any leading or trailing
    square brackets are stripped from each piece, so every author ends up as
    one string.
    """
    return [entry.strip('[]') for entry in raw.split("], ")]


# Read the article metadata; the first CSV column is used as the index and
# the authors column is parsed while loading.
arxiv_df = pd.read_csv(
    os.path.join(DATA_PATH, 'arxiv_metadata.csv'),
    index_col=0,
    converters={"authors_parsed": _split_author_entries},
)

  arxiv_df = pd.read_csv(


* Filter by year: keep only articles submitted in the period considered
* Filter by subject: choose 'Physics'

In [9]:
# Keep only the articles from 2023 onwards that are flagged as Physics.
recent_mask = arxiv_df['year'] >= 2023
physics_mask = arxiv_df['Physics'] == True
filtered_df = arxiv_df[recent_mask & physics_mask]
print(f"The filtered data set has {filtered_df.shape[0]} entries.")

The filtered data set has 90530 entries.


Count authors

In [10]:
def flatten(xss):
    """Concatenate the items of every inner iterable into one flat list.

    The order of the items is preserved: first all items of the first inner
    iterable, then those of the second, and so on.
    """
    flat = []
    for inner in xss:
        flat.extend(inner)
    return flat

def get_unique_authors(df):
    """Return the set of distinct entries found in ``df['authors_parsed']``.

    Each value of the column is expected to be an iterable of hashable author
    entries; all of them are pooled and de-duplicated.
    """
    return set(flatten(df['authors_parsed']))

def count_authors(df):
    """Return how many distinct authors appear in the dataframe."""
    unique_authors = get_unique_authors(df)
    return len(unique_authors)

In [11]:
# Number of distinct author entries across all filtered articles
count_all_authors = len(get_unique_authors(filtered_df))
print(f"The filtered data set has {count_all_authors} unique authors.")

The filtered data set has 246443 unique authors.


## Split data into train/validate/test

"train"

    The share of the texts (50%) reserved for fitting the model.

"validate"

    The share of the texts (25%) reserved for computing perplexity when fitting the model's k-parameter and when searching for the best parameters.

"test"

    The share of the texts (25%) reserved for testing hypotheses.


In [12]:
def split(df, holdout_size=0.5, test_size=0.5, random_state=42):
    """Split a dataframe into train, validate and test subsets.

    The rows are first divided into a training part and a held-out part; the
    held-out part is then divided into the validate and test subsets.

    Parameters
    ----------
    df
        The data to split (anything accepted by ``train_test_split``).
    holdout_size
        Share of ``df`` withheld from training (default 0.5).
    test_size
        Share of the held-out rows assigned to the test subset; the remaining
        held-out rows become the validate subset (default 0.5).
    random_state
        Seed handed to both ``train_test_split`` calls so that re-running the
        notebook yields the same partition. Pass ``None`` to get a different
        random partition on every call.

    Returns
    -------
    tuple
        ``(train, validate, test)``.
    """
    # A fixed seed keeps the saved CSV splits and the tokenized corpora
    # reproducible across kernel restarts.
    train, holdout = train_test_split(
        df, test_size=holdout_size, random_state=random_state)
    validate, test = train_test_split(
        holdout, test_size=test_size, random_state=random_state)
    return train, validate, test

In [13]:
# Partition the filtered articles into the three subsets and report their sizes.
train_df, validate_df, test_df = split(filtered_df)
print(f"The train dataset has {train_df.shape[0]} rows, the validate dataset {validate_df.shape[0]} rows, the test dataset {test_df.shape[0]} rows")

The train dataset has 45265 rows, the validate dataset 22632 rows, the test dataset 22633 rows


In [14]:
# Peek at the parsed authors of the first training row: a list with one
# string per author.
train_df["authors_parsed"].iloc[0]

["'Hani', 'Zaher', ''", "'Shatah', 'Jalal', ''", "'Zhu', 'Hui', ''"]

## Save article data splits

In [9]:
# Write each subset to its own CSV file inside DATA_PATH.
for split_df, csv_name in (
    (train_df, 'arxiv_train.csv'),
    (validate_df, 'arxiv_validate.csv'),
    (test_df, 'arxiv_test.csv'),
):
    split_df.to_csv(os.path.join(DATA_PATH, csv_name))

## Tokenize

Apply pre-processing filters: strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short

Apply lemmatization to the list of words.

The tokenization functions are in the tok.py module provided in the same directory as the notebooks.

see: https://github.com/piskvorky/gensim/blob/develop/gensim/parsing/preprocessing.py


In [10]:
# Build one dictionary from the abstracts of the whole filtered data set
# (i.e. before splitting, so it covers train, validate and test alike).
# The cleaned texts are kept in `texts`; the dictionary is reused for all
# three corpora below.
texts = tok.clean(filtered_df['abstract'])
dictionary = tok.make_dictionary(texts)

In [11]:
def tokenize_dataset(dictionary, df):
    """Clean the abstracts of ``df`` and convert them into a corpus.

    The ``'abstract'`` column is passed through ``tok.clean`` and the result
    is handed, together with ``dictionary``, to ``tok.make_corpus``.
    """
    return tok.make_corpus(dictionary, tok.clean(df['abstract']))

def tokenize_datasets(dictionary, train, validate, test):
    """Tokenize the three subsets with one shared dictionary.

    Returns a tuple ``(corpus_train, corpus_validate, corpus_test)``, built in
    that order.
    """
    return tuple(
        tokenize_dataset(dictionary, subset)
        for subset in (train, validate, test)
    )

In [12]:
# Tokenize the train, validate and test data sets, all against the same
# dictionary built above.
corpus_train, corpus_validate, corpus_test = tokenize_datasets(dictionary, train_df, validate_df, test_df)

## Save tokenized data

In [13]:
# Persist the dictionary as a pickle file inside DATA_PATH
dictionary_path = os.path.join(DATA_PATH, 'dictionary.pickle')
with open(dictionary_path, 'wb') as dictionary_file:
    pickle.dump(dictionary, dictionary_file, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
def save_tokenized_dataset(path, obj):
    """Pickle ``obj`` to the file at ``path``.

    The file is opened in binary write mode (an existing file is overwritten)
    and the object is serialized with the highest pickle protocol available.
    """
    with open(path, 'wb') as out_file:
        pickler = pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(obj)

def save_tokenized_datasets(train, validate, test):
    """Pickle the three tokenized corpora into DATA_PATH.

    The corpora are written, in this order, to ``corpus_train.pickle``,
    ``corpus_validate.pickle`` and ``corpus_test.pickle``.
    """
    targets = (
        ("corpus_train.pickle", train),
        ("corpus_validate.pickle", validate),
        ("corpus_test.pickle", test),
    )
    for file_name, corpus in targets:
        save_tokenized_dataset(os.path.join(DATA_PATH, file_name), corpus)

In [15]:
# Write the three tokenized corpora to pickle files in DATA_PATH.
save_tokenized_datasets(corpus_train, corpus_validate, corpus_test)