# Filter and split the data

__Filter__
* Load the data prepared in ../00_process_snapshot.ipynb
* Filter by year and subject, count the number of authors

__Split__
* Split data into train/validate/test
* Save datasets


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import zipfile as zf
import pickle
import os

In [3]:
# Directory the input CSV is read from and the train/validate/test CSVs are written to.
DATA_PATH = '../data'

## Filter
Load the data prepared in ../00_process_snapshot.ipynb

In [18]:
def _parse_authors(raw):
    """Turn the raw text of an ``authors_parsed`` cell into a list of strings.

    The text is cut at every "], "; each piece then has leading/trailing
    square brackets stripped and all single quotes removed.
    NOTE(review): presumably the cell holds the string form of a list of
    per-author lists -- confirm against the notebook that wrote the CSV.
    """
    pieces = raw.split("], ")
    return [piece.strip('[]').replace("'", "") for piece in pieces]


# Read the article metadata; the first CSV column becomes the index.
arxiv_df = pd.read_csv(
    os.path.join(DATA_PATH, 'arxiv_metadata.csv'),
    converters={"authors_parsed": _parse_authors},
    index_col=0,
)

  arxiv_df = pd.read_csv(


* Filter by year: keep only articles whose year is 2023 or later
* Filter by subject: keep only articles flagged as 'Physics'

In [19]:
# Year filter: keep rows whose 'year' is 2023 or later.
recent_df = arxiv_df.loc[arxiv_df['year'] >= 2023]

# Subject filter: keep rows whose 'Physics' column equals True.
# The explicit comparison always yields a boolean mask for the selection.
idx = recent_df['Physics'] == True
filtered_df = recent_df.loc[idx]

print(f"The filtered data set has {filtered_df.shape[0]} entries.")

The filtered data set has 90530 entries.


Count authors

In [20]:
def flatten(xss):
    """Concatenate the items of every inner iterable into one flat list.

    Only one level of nesting is removed; item order is preserved.
    """
    flat = []
    for inner in xss:
        flat.extend(inner)
    return flat

def get_unique_authors(df):
    """Return the set of distinct entries found in ``df['authors_parsed']``.

    Each cell of the column is expected to be an iterable of author
    entries; all cells are flattened together before de-duplication.
    """
    return set(flatten(df['authors_parsed']))

def count_authors(df):
    """Return how many distinct authors appear in ``df``."""
    unique_authors = get_unique_authors(df)
    return len(unique_authors)

In [21]:
# ALL AUTHORS
count_all_authors = count_authors(filtered_df)
print(f"The filtered data set has {count_all_authors} unique authors.")

The filtered data set has 246443 unique authors.


## Split data into train/validate/test

"train"

    50% of the texts, reserved for fitting the model.

"validate"

    25% of the texts, reserved for computing perplexity when fitting the model's k-parameter and for searching for the best parameters.

"test"

    25% of the texts, reserved for testing hypotheses.


In [22]:
def split(df, holdout_size=0.5, test_size=0.5, random_state=None):
    """Split ``df`` into train, validate and test subsets.

    The data is first divided into a training part and a held-out part;
    the held-out part is then divided into a validation part and a test
    part. With the defaults this gives roughly 50% / 25% / 25%.

    Args:
        df: Data to split (anything ``train_test_split`` accepts).
        holdout_size: Share of ``df`` withheld from training, passed as
            ``test_size`` to the first ``train_test_split`` call.
        test_size: Share of the held-out rows assigned to the test
            subset; the remainder becomes the validation subset.
        random_state: Seed forwarded to both ``train_test_split`` calls.
            The default ``None`` keeps the original unseeded shuffling;
            pass an int to make the split reproducible across runs.

    Returns:
        A tuple ``(train, validate, test)``.
    """
    train, holdout = train_test_split(
        df, test_size=holdout_size, random_state=random_state
    )
    validate, test = train_test_split(
        holdout, test_size=test_size, random_state=random_state
    )
    return train, validate, test

In [23]:
# Split the filtered articles into the three subsets and report the size of each
train_df, validate_df, test_df = split(filtered_df)
print(f"The train dataset has {train_df.shape[0]} rows, the validate dataset {validate_df.shape[0]} rows, the test dataset {test_df.shape[0]} rows")

The train dataset has 45265 rows, the validate dataset 22632 rows, the test dataset 22633 rows


## Save article data splits

In [27]:
# Write each subset to its own CSV file inside DATA_PATH.
split_outputs = {
    'arxiv_train.csv': train_df,
    'arxiv_validate.csv': validate_df,
    'arxiv_test.csv': test_df,
}
for csv_name, split_frame in split_outputs.items():
    split_frame.to_csv(os.path.join(DATA_PATH, csv_name))