# Clean and tokenize the data

Clean
* Load the metadata prepared in `../00_load_metadata.ipynb`
* Load abstracts prepared in `../00_load_abstracts.ipynb`
* Filter papers by category and keyword:
  * arXiv category 'cs.CL' (Computation and Language)
  * astro-ph (Astrophysics) and keyword "Gravitational Waves"
  * q-bio.GN (Genomics) and keyword "CRISPR"
  * Merge with metadata dataframe, check that all entries have an abstract
  * Filter by keyword

Split data 
* Split data into train/validate/test
* Save the clean data on NLP

Tokenize
* Apply pre-processing filters
* Apply lemmatization
* Save tokenized corpus, one for each train/validate/test dataset on NLP

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tok
import zipfile as zf
import pickle
import os

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/kobv/atroncos/nltk_data...


In [2]:
# Directory, relative to this notebook, that holds the input archives and receives every file written below
DATA_PATH = '../data'

## Clean

### Load the metadata prepared in `../00_load_metadata.ipynb`

In [3]:
%%time

# Load the metadata downloaded from arXiv; the first CSV column becomes the DataFrame index
arxiv_metadata = pd.read_csv(os.path.join(DATA_PATH, 'arxiv_metadata.csv.zip'), index_col=0)



CPU times: user 8.23 s, sys: 863 ms, total: 9.09 s
Wall time: 9.09 s


### Load abstracts prepared in `../00_load_abstracts.ipynb`, merge with metadata dataframe

In [4]:
%%time

# Load the abstracts extracted in notebook 00_load_abstracts; the first CSV column becomes the DataFrame index
arxiv_abstracts = pd.read_csv(os.path.join(DATA_PATH, 'arxiv_abstracts.csv.zip'), index_col=0)

CPU times: user 11 s, sys: 798 ms, total: 11.8 s
Wall time: 11.8 s




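### Filter papers by category:
  * Computation and Language: arXiv category 'cs.CL'
  * Candidates for the gravitational-wave keyword search: 'astro-ph' (Astrophysics), 'gr-qc' (General Relativity and Quantum Cosmology), 'hep-ph', 'physics.ins-det'
  * Candidates for the CRISPR keyword search: 'q-bio' (includes q-bio.GN, Genomics), 'cs', 'stat', 'physics.bio-ph'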

In [5]:
def filter_by_category(df, categories):
    """Keep the rows of `df` whose 'categories' value mentions any of `categories`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'categories' column of strings.
    categories : iterable of str
        Category identifiers to look for.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `df`, with all columns preserved (also when `df`
        is empty or nothing matches).

    Notes
    -----
    Matching is a plain substring test (`category in subject`), so a short
    identifier such as 'cs' also matches 'physics.optics'.
    Rows whose 'categories' value is not a string (e.g. NaN) never match.
    """
    wanted = tuple(categories)
    # Build the mask from the frame that was passed in, not from a notebook
    # global, so the mask always has one entry per row of `df`.
    mask = pd.Series(
        [
            isinstance(subject, str) and any(category in subject for category in wanted)
            for subject in df['categories']
        ],
        index=df.index,
        dtype=bool,
    )
    # A boolean array selects rows even when it is empty; an empty *list*
    # would be read as "select no columns".
    return df.loc[mask.to_numpy()]

In [6]:
# Papers on Computation and Language: arXiv category 'cs.CL'
arxiv_cscl = filter_by_category(arxiv_metadata, ['cs.CL'])

# Candidate pool for the gravitational-wave keyword search:
# categories 'astro-ph', 'gr-qc', 'hep-ph' and 'physics.ins-det'
arxiv_astro = filter_by_category(arxiv_metadata, ['astro-ph', 'gr-qc', 'hep-ph', 'physics.ins-det'])

# Candidate pool for the CRISPR keyword search:
# categories 'cs', 'q-bio', 'stat' and 'physics.bio-ph'
# NOTE(review): categories are matched by substring, so 'cs' also matches every
# 'physics.*' category string; confirm that this breadth is intended.
arxiv_gen = filter_by_category(arxiv_metadata, ['cs', 'q-bio', 'stat', 'physics.bio-ph'])

### Merge with metadata dataframe, check that all entries have an abstract

In [7]:
def merge_metadata_abstracts(df_metadata, df_abstracts):
    """Inner-join metadata and abstracts on 'id', then drop rows without an abstract.

    Parameters
    ----------
    df_metadata : pandas.DataFrame
        Frame with an 'id' column.
    df_abstracts : pandas.DataFrame
        Frame with 'id' and 'abstract' columns.

    Returns
    -------
    pandas.DataFrame
        One row per 'id' present in both frames whose 'abstract' is not null.
    """
    # Restrict the abstracts to ids present in the metadata before joining.
    has_metadata = df_abstracts.id.isin(df_metadata.id)
    joined = df_metadata.merge(df_abstracts[has_metadata], on='id')
    # Keep only the entries that actually carry an abstract.
    return joined[joined['abstract'].notna()]

In [8]:
# attach abstracts to the Computation and Language papers
arxiv_cscl_merged = merge_metadata_abstracts(arxiv_cscl, arxiv_abstracts)

# attach abstracts to the candidate pool for the gravitational-wave search
arxiv_astro_merged = merge_metadata_abstracts(arxiv_astro, arxiv_abstracts)

# attach abstracts to the candidate pool for the CRISPR search
arxiv_gen_merged = merge_metadata_abstracts(arxiv_gen, arxiv_abstracts)

### Filter by keyword

In [9]:
def filter_by_keyword(df, keyword):
    """Keep the rows of `df` whose 'abstract' contains `keyword`, ignoring case.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'abstract' column.
    keyword : str
        Text to search for; compared as a case-insensitive substring.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `df`, with all columns preserved (also when `df`
        is empty or nothing matches). Rows whose abstract is not a string
        (e.g. NaN) are treated as non-matching instead of raising.
    """
    needle = keyword.lower()
    mask = pd.Series(
        [isinstance(abstract, str) and needle in abstract.lower() for abstract in df['abstract']],
        index=df.index,
        dtype=bool,
    )
    # A boolean array selects rows even when it is empty; an empty *list*
    # would be read as "select no columns".
    return df.loc[mask.to_numpy()]

In [10]:
# Papers on gravitational waves: abstract contains "gravitational wave" (case-insensitive)
arxiv_gw_merged = filter_by_keyword(arxiv_astro_merged, "gravitational wave")

# Papers on CRISPR: abstract contains "CRISPR" (case-insensitive)
arxiv_crispr_merged = filter_by_keyword(arxiv_gen_merged, "CRISPR")

In [11]:
# Report how many papers remain per topic after the category, abstract and keyword filters
print(f"There are {len(arxiv_cscl_merged)} research papers on 'Computer and Language' in the dataset.")
print(f"There are {len(arxiv_gw_merged)} research papers on 'Gravitational Waves' in the dataset.")
print(f"There are {len(arxiv_crispr_merged)} research papers on 'CRISPR' in the dataset.")

There are 54567 research papers on 'Computer and Language' in the dataset.
There are 14779 research papers on 'Gravitational Waves' in the dataset.
There are 53 research papers on 'CRISPR' in the dataset.


## Split the data into train / validate / test datasets

"train"
> A percent of the texts reserved for fitting the model.

"validate"
> A percent of the texts reserved for computing perplexity when fitting the model's k-parameter, and searching for best parameters.

"test"
> A percent of the texts reserved for testing hypotheses.

In [12]:
def split(df, random_state=42):
    """Split `df` into train / validate / test subsets (about 50 % / 25 % / 25 %).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    random_state : int or None, default 42
        Seed forwarded to both `train_test_split` calls so that the split is
        identical on every run. Pass None for a different shuffle each time.

    Returns
    -------
    tuple
        `(train, validate, test)`.
    """
    # First cut: half of the rows go to training, the other half is held out.
    train, holdout = train_test_split(df, test_size=0.5, random_state=random_state)
    # Second cut: the held-out half is divided evenly into validate and test.
    validate, test = train_test_split(holdout, test_size=0.5, random_state=random_state)
    return train, validate, test

In [13]:
# split papers on Computation and Language into train / validate / test and report the sizes
train_cscl, validate_cscl, test_cscl = split(arxiv_cscl_merged)
print(f"The train dataset for cs.CL has {train_cscl.shape[0]} rows, the validate dataset {validate_cscl.shape[0]} rows, the test dataset {test_cscl.shape[0]} rows")

The train dataset for cs.CL has 27283 rows, the validate dataset 13642 rows, the test dataset 13642 rows


In [14]:
# split papers on Gravitational Waves into train / validate / test and report the sizes
train_gw, validate_gw, test_gw = split(arxiv_gw_merged)
print(f"The train dataset for gw has {train_gw.shape[0]} rows, the validate dataset {validate_gw.shape[0]} rows, the test dataset {test_gw.shape[0]} rows")

The train dataset for gw has 7389 rows, the validate dataset 3695 rows, the test dataset 3695 rows


### Save cleaned data

In [15]:
def save_dataset(filename, path, df):
    """Write `df` as a CSV member named `filename` inside a new zip archive at `path`.

    Parameters
    ----------
    filename : str
        Name of the CSV member stored inside the archive.
    path : str
        Location of the zip archive; an existing file is overwritten.
    df : pandas.DataFrame
        Data to serialise with `DataFrame.to_csv()` (index included).
    """
    # ZipFile stores members uncompressed by default (ZIP_STORED); request
    # DEFLATE so the ".zip" output is actually compressed.
    with zf.ZipFile(path, 'w', compression=zf.ZIP_DEFLATED) as ziparchive:
        ziparchive.writestr(filename, df.to_csv())

def save_datasets(name, train, validate, test):
    """Save the three splits of one topic as zipped CSV files under DATA_PATH.

    For each split a file `<name>_<split>.csv.zip` is written that contains a
    single member `<name>_<split>.csv`.

    Parameters
    ----------
    name : str
        Topic prefix used in the file names.
    train, validate, test : pandas.DataFrame
        The splits to save, written in this order.
    """
    members = (
        (f"{name}_train.csv", train),
        (f"{name}_validate.csv", validate),
        (f"{name}_test.csv", test),
    )
    for member, frame in members:
        archive_path = os.path.join(DATA_PATH, f"{member}.zip")
        save_dataset(member, archive_path, frame)


In [16]:
# save the cleaned data on Computation and Language (cscl_train / cscl_validate / cscl_test .csv.zip under DATA_PATH)
save_datasets('cscl', train_cscl, validate_cscl, test_cscl)

# save the cleaned data on Gravitational Waves (gw_train / gw_validate / gw_test .csv.zip under DATA_PATH)
# NOTE(review): arxiv_crispr_merged is not split or saved in the cells shown here; confirm whether that is intended.
save_datasets('gw', train_gw, validate_gw, test_gw)

## Tokenize

Apply pre-processing filters: strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short

Apply lemmatization to the list of words.

The tokenize functions are in the `tok.py` module provided in the same directory as the notebooks.

see: https://github.com/piskvorky/gensim/blob/develop/gensim/parsing/preprocessing.py

In [17]:
# make a dictionary with all the words in the complete CSCL dataset
# (built from the unsplit data, so the train/validate/test corpora share one vocabulary)
texts_cscl = tok.clean(arxiv_cscl_merged['abstract'])
dictionary_cscl = tok.make_dictionary(texts_cscl)

# make a dictionary with all the words in the complete GW dataset
# (built from the unsplit data, so the train/validate/test corpora share one vocabulary)
texts_gw = tok.clean(arxiv_gw_merged['abstract'])
dictionary_gw = tok.make_dictionary(texts_gw)

In [18]:
def tokenize_dataset(dictionary, df):
    """Turn the abstracts of one dataset into a corpus.

    The 'abstract' column of `df` is passed through `tok.clean`, and the
    result is handed to `tok.make_corpus` together with `dictionary`.

    Parameters
    ----------
    dictionary
        Dictionary object as produced by `tok.make_dictionary`.
    df : pandas.DataFrame
        Frame with an 'abstract' column.

    Returns
    -------
    Whatever `tok.make_corpus` returns for the cleaned abstracts.
    """
    cleaned_abstracts = tok.clean(df['abstract'])
    corpus = tok.make_corpus(dictionary, cleaned_abstracts)
    return corpus

def tokenize_datasets(dictionary, train, validate, test):
    """Tokenize the train, validate and test splits with one shared dictionary.

    Parameters
    ----------
    dictionary
        Dictionary object passed on to `tokenize_dataset` for every split.
    train, validate, test : pandas.DataFrame
        The splits, each with an 'abstract' column; processed in this order.

    Returns
    -------
    tuple
        `(corpus_train, corpus_validate, corpus_test)`.
    """
    return tuple(
        tokenize_dataset(dictionary, part)
        for part in (train, validate, test)
    )

In [19]:
# Tokenize the three Computation and Language splits with the CSCL dictionary
corpus_train_cscl, corpus_validate_cscl, corpus_test_cscl = tokenize_datasets(dictionary_cscl, train_cscl, validate_cscl, test_cscl)

# Tokenize the three Gravitational Waves splits with the GW dictionary
corpus_train_gw, corpus_validate_gw, corpus_test_gw = tokenize_datasets(dictionary_gw, train_gw, validate_gw, test_gw)

### Save tokenized data

In [20]:
# save the Computation and Language dictionary as a pickle under DATA_PATH
with open(os.path.join(DATA_PATH, 'dictionary_cscl.pickle'), 'wb') as handle:
    pickle.dump(dictionary_cscl, handle, protocol=pickle.HIGHEST_PROTOCOL)

# save the Gravitational Waves dictionary as a pickle under DATA_PATH
with open(os.path.join(DATA_PATH, 'dictionary_gw.pickle'), 'wb') as handle:
    pickle.dump(dictionary_gw, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
def save_tokenized_dataset(path, obj):
    """Pickle `obj` to the file at `path` using the highest available protocol.

    Parameters
    ----------
    path : str
        Destination file; an existing file is overwritten.
    obj
        Any picklable object (here: a tokenized corpus).
    """
    with open(path, 'wb') as sink:
        writer = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        writer.dump(obj)

def save_tokenized_datasets(name, train, validate, test):
    """Pickle the three tokenized corpora of one topic under DATA_PATH.

    The files are named `corpus_train_<name>.pickle`,
    `corpus_validate_<name>.pickle` and `corpus_test_<name>.pickle`.

    Parameters
    ----------
    name : str
        Topic suffix used in the file names.
    train, validate, test
        The tokenized corpora, written in this order.
    """
    targets = (
        (f"corpus_train_{name}.pickle", train),
        (f"corpus_validate_{name}.pickle", validate),
        (f"corpus_test_{name}.pickle", test),
    )
    for basename, corpus in targets:
        save_tokenized_dataset(os.path.join(DATA_PATH, basename), corpus)

In [22]:
# save the tokenized train / validate / test corpora on Computation and Language
save_tokenized_datasets('cscl', corpus_train_cscl, corpus_validate_cscl, corpus_test_cscl)

# save the tokenized train / validate / test corpora on Gravitational Waves
save_tokenized_datasets('gw', corpus_train_gw, corpus_validate_gw, corpus_test_gw)