# Clean and tokenize
The dataset obtained from OpenAlex (see README.md) was reviewed using HubMeta.
* Load the metadata of the selected 682 papers as a data frame
* Split data into train/validate/test
* Apply pre-processing filters
* Apply lemmatization
* Save tokenized corpus

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tok
import zipfile as zf
import pickle
import os
from RISparser import readris

In [3]:
# Directory that holds the input RIS export and receives every file this notebook writes.
DATA_PATH = 'data'

## Load the metadata of the selected 682 papers as a data frame

In [20]:

# Path of the RIS export containing the selected papers.
filepath = os.path.join(DATA_PATH, 'export_paper_2024-10-29_17-23-53.ris')
with open(filepath, 'r') as bibliography_file:
    # Materialize the parsed entries while the file is still open: if
    # readris() produces entries lazily, consuming them after the `with`
    # block has closed the file would read from a closed handle.
    entries = list(readris(bibliography_file))
# One row per parsed RIS entry; the columns are the keys emitted by the parser.
bib_df_sdp = pd.DataFrame(entries)

## Split the data into train / validate / test datasets

"train"

    50% of the texts, reserved for fitting the model.

"validate"

    25% of the texts, reserved for computing perplexity when fitting the model's k-parameter and searching for the best parameters.

"test"

    25% of the texts, reserved for testing hypotheses.


In [19]:
def split(df, random_state=None):
    """Split a data frame into train, validate and test subsets.

    Half of the rows go to ``train``; the remaining half is halved again
    into ``validate`` and ``test`` (roughly 50% / 25% / 25%).

    Args:
        df: The data frame (or any array-like accepted by
            ``train_test_split``) to partition.
        random_state: Optional seed forwarded to both ``train_test_split``
            calls. With the default ``None`` every call shuffles
            differently; pass an int to obtain a reproducible split.

    Returns:
        A ``(train, validate, test)`` tuple.
    """
    train, holdout = train_test_split(
        df, test_size=0.5, random_state=random_state
    )
    validate, test = train_test_split(
        holdout, test_size=0.5, random_state=random_state
    )
    return train, validate, test

In [21]:
# Split the papers on Scholarly Document Processing (SDP) into
# train / validate / test and report the number of rows in each subset.
train_sdp, validate_sdp, test_sdp = split(bib_df_sdp)
print(f"The train dataset for SDP has {train_sdp.shape[0]} rows, the validate dataset {validate_sdp.shape[0]} rows, the test dataset {test_sdp.shape[0]} rows")

The train dataset for SDP has 341 rows, the validate dataset 170 rows, the test dataset 171 rows


## Save cleaned data

In [22]:
def save_dataset(filename, path, df):
    """Write a data frame as a single CSV member of a new zip archive.

    Args:
        filename: Name of the CSV member inside the archive.
        path: Location of the zip archive; it is opened in ``'w'`` mode,
            so an existing file at that path is replaced.
        df: Object whose ``to_csv()`` output is stored in the archive.
    """
    # ZipFile defaults to ZIP_STORED (no compression); request DEFLATE so
    # the archive actually shrinks the CSV text.
    with zf.ZipFile(path, 'w', compression=zf.ZIP_DEFLATED) as ziparchive:
        ziparchive.writestr(filename, df.to_csv())

def save_datasets(name, train, validate, test):
    """Save the three splits as zipped CSV files inside ``DATA_PATH``.

    Each split is written to ``<name>_<split>.csv.zip`` and contains a
    single member called ``<name>_<split>.csv``, where ``<split>`` is
    ``train``, ``validate`` or ``test``.

    Args:
        name: Prefix identifying the dataset in the file names.
        train: Data frame holding the train split.
        validate: Data frame holding the validate split.
        test: Data frame holding the test split.
    """
    labelled_splits = (
        ('train', train),
        ('validate', validate),
        ('test', test),
    )
    for label, frame in labelled_splits:
        member_name = f"{name}_{label}.csv"
        archive_path = os.path.join(DATA_PATH, f"{member_name}.zip")
        save_dataset(member_name, archive_path, frame)

In [23]:
# Save the SDP (Scholarly Document Processing) train / validate / test
# splits as zipped CSV files named with the 'sdp' prefix.
save_datasets('sdp', train_sdp, validate_sdp, test_sdp)

## Tokenize

Apply pre-processing filters: strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short

Apply lemmatization to the list of words.

The tokenize functions are in the tok.py module provided in the same directory as the notebooks.

see: https://github.com/piskvorky/gensim/blob/develop/gensim/parsing/preprocessing.py


In [25]:
# Build one dictionary from the abstracts of the complete SDP dataset
# (all splits together); the same dictionary is used later to tokenize
# each split.
texts_sdp = tok.clean(bib_df_sdp['abstract'])
dictionary_sdp = tok.make_dictionary(texts_sdp)

In [27]:
def tokenize_dataset(dictionary, df):
    """Turn the ``abstract`` column of *df* into a corpus.

    The abstracts are passed through ``tok.clean`` and the result is
    handed, together with *dictionary*, to ``tok.make_corpus``.

    Args:
        dictionary: Dictionary object forwarded to ``tok.make_corpus``.
        df: Data frame with an ``abstract`` column.

    Returns:
        Whatever ``tok.make_corpus`` returns for the cleaned abstracts.
    """
    abstracts = df['abstract']
    return tok.make_corpus(dictionary, tok.clean(abstracts))

def tokenize_datasets(dictionary, train, validate, test):
    """Tokenize the train, validate and test splits with one dictionary.

    Args:
        dictionary: Dictionary object shared by all three splits.
        train: Data frame holding the train split.
        validate: Data frame holding the validate split.
        test: Data frame holding the test split.

    Returns:
        A ``(corpus_train, corpus_validate, corpus_test)`` tuple, in that
        order.
    """
    corpora = [
        tokenize_dataset(dictionary, part)
        for part in (train, validate, test)
    ]
    return tuple(corpora)

In [28]:
# Tokenize the SDP (Scholarly Document Processing) train / validate / test
# splits against the dictionary built from the complete SDP dataset.
corpus_train_sdp, corpus_validate_sdp, corpus_test_sdp = tokenize_datasets(dictionary_sdp, train_sdp, validate_sdp, test_sdp)

## Save tokenized data

In [29]:
# Save the SDP dictionary as a pickle file in DATA_PATH, using the highest
# pickle protocol available to the running interpreter.
with open(os.path.join(DATA_PATH, 'dictionary_sdp.pickle'), 'wb') as handle:
    pickle.dump(dictionary_sdp, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [30]:
def save_tokenized_dataset(path, obj):
    """Pickle *obj* to the file at *path*.

    The file is opened in binary write mode, so an existing file is
    replaced, and the object is serialized with
    ``pickle.HIGHEST_PROTOCOL``.

    Args:
        path: Destination file path.
        obj: Any picklable object.
    """
    protocol = pickle.HIGHEST_PROTOCOL
    with open(path, mode='wb') as sink:
        pickle.Pickler(sink, protocol=protocol).dump(obj)

def save_tokenized_datasets(name, train, validate, test):
    """Pickle the three tokenized splits into ``DATA_PATH``.

    The files are named ``corpus_<split>_<name>.pickle``, where
    ``<split>`` is ``train``, ``validate`` or ``test``.

    Args:
        name: Suffix identifying the dataset in the file names.
        train: Tokenized train corpus.
        validate: Tokenized validate corpus.
        test: Tokenized test corpus.
    """
    labelled_corpora = (
        ('train', train),
        ('validate', validate),
        ('test', test),
    )
    for label, corpus in labelled_corpora:
        target = os.path.join(DATA_PATH, f"corpus_{label}_{name}.pickle")
        save_tokenized_dataset(target, corpus)

In [31]:
# Pickle the tokenized SDP (Scholarly Document Processing) train /
# validate / test corpora, using 'sdp' as the file-name suffix.
save_tokenized_datasets('sdp', corpus_train_sdp, corpus_validate_sdp, corpus_test_sdp)