# Clean and tokenize the data
Open the data (metadata and abstracts) downloaded manually from OpenAlex (see README).
* Keep only entries with an abstract

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
import zipfile as zf
import pickle
import os

In [21]:
# Notebook configuration: where the data lives and the base name used for the data files.
DATA_PATH = 'data'  # path to the folder with (downloaded or generated) data; it will not be committed to git
ACRONYM = 'openalex_tng'  # acronym, i.e. the machine-readable name, of this dataset ('tng' stands for "topic network graph")

In [22]:
# Load the file <ACRONYM>.csv from DATA_PATH; the first CSV column is used as the row index.
data_df = pd.read_csv(os.path.join(DATA_PATH, f"{ACRONYM}.csv"), index_col=0)

In [23]:
# Report the number of rows and columns of the table that was just loaded.
print(f"The downloaded data set has {data_df.shape[0]} rows and {data_df.shape[1]} columns.")

The downloaded data set has 180 rows and 173 columns.


In [12]:
# Show every column name of the loaded table as one comma-separated string
', '.join(data_df.columns)

'doi, title, display_name, relevance_score, publication_year, publication_date, language, type, type_crossref, indexed_in, countries_distinct_count, institutions_distinct_count, corresponding_author_ids, corresponding_institution_ids, apc_list, apc_paid, fwci, has_fulltext, fulltext_origin, cited_by_count, is_retracted, is_paratext, locations_count, datasets, versions, referenced_works_count, referenced_works, related_works, ngrams_url, cited_by_api_url, updated_date, created_date, ids.openalex, ids.doi, ids.mag, primary_location.is_oa, primary_location.landing_page_url, primary_location.pdf_url, primary_location.source, primary_location.license, primary_location.license_id, primary_location.version, primary_location.is_accepted, primary_location.is_published, open_access.is_oa, open_access.oa_status, open_access.oa_url, open_access.any_repository_has_fulltext, citation_normalized_percentile.value, citation_normalized_percentile.is_in_top_1_percent, citation_normalized_percentile.is_in

In [13]:
# Preview the first three rows of the loaded table.
data_df.head(3)

Unnamed: 0_level_0,doi,title,display_name,relevance_score,publication_year,publication_date,language,type,type_crossref,indexed_in,...,locations.source.host_organization_lineage_names,locations.source.type,sustainable_development_goals.id,sustainable_development_goals.score,sustainable_development_goals.display_name,grants.funder,grants.funder_display_name,grants.award_id,counts_by_year.year,counts_by_year.cited_by_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
https://openalex.org/W3015453090,https://doi.org/10.18653/v1/2020.acl-main.447,S2ORC: The Semantic Scholar Open Research Corpus,S2ORC: The Semantic Scholar Open Research Corpus,32.683903,2020,2020-01-01,en,article,proceedings-article,crossref,...,nan|Cornell University,nan|repository,,,,,,,2024|2023|2022|2021|2020,27|94|57|83|37
https://openalex.org/W2954057334,https://doi.org/10.1016/j.softx.2019.100263,pybliometrics: Scriptable bibliometrics using ...,pybliometrics: Scriptable bibliometrics using ...,25.612167,2019,2019-07-01,en,article,journal-article,crossref|doaj,...,Elsevier BV|,journal|repository,,,,,,,2024|2023|2022|2021|2020|2019|2015,22|42|28|30|10|1|1
https://openalex.org/W2906109115,https://doi.org/10.1016/j.tree.2018.11.007,Research Weaving: Visualizing the Future of Re...,Research Weaving: Visualizing the Future of Re...,25.005627,2019,2019-03-01,en,article,journal-article,crossref|pubmed,...,Elsevier BV|Australian National University|Nat...,journal|repository|repository,https://metadata.un.org/sdg/15,0.43,Life on land,,,,2024|2023|2022|2021|2020|2019|2018,27|42|36|22|21|4|1


## Keep only entries with an abstract

In [23]:
# Boolean mask that is True for every row whose 'abstract' value is missing.
idx = data_df['abstract'].isna()
# Keep only the rows that have an abstract. data_df is overwritten here, so
# re-running this cell on its own reports 0 dropped entries.
data_df = data_df[~idx]
print(f"Dropped {idx.sum()} entries without abstract.")

Dropped 1525 entries without abstract.


## Split the data into train / validate / test datasets

"train"
>    A percent of the texts reserved for fitting the model: 50%

"validate"
>    A percent of the texts reserved for computing perplexity when fitting the model's k-parameter, and searching for best parameters: 25%

"test"
>    A percent of the texts reserved for testing hypotheses: 25%

In [25]:
def split(df, random_state=42):
    """Split ``df`` into train (~50%), validate (~25%) and test (~25%) parts.

    The input is first halved with ``train_test_split``; the held-out half is
    then halved again into the validate and test parts.

    Parameters
    ----------
    df
        The data to split (anything ``train_test_split`` accepts, e.g. a
        pandas DataFrame).
    random_state
        Seed forwarded to both ``train_test_split`` calls. The fixed default
        makes the partition reproducible across kernel restarts; pass ``None``
        to get a different random partition on every call.

    Returns
    -------
    tuple
        ``(train, validate, test)``.
    """
    # Without a seed the shuffle differs on every run, so the saved train /
    # validate / test files could not be regenerated.
    train, holdout = train_test_split(df, test_size=0.5, random_state=random_state)
    validate, test = train_test_split(holdout, test_size=0.5, random_state=random_state)
    return train, validate, test

train_df, validate_df, test_df = split(data_df)
# The three parts must partition the cleaned data: no row lost, none duplicated.
assert len(train_df) + len(validate_df) + len(test_df) == len(data_df)
# Name the dataset via ACRONYM instead of a hard-coded label from another dataset.
print(f"The train dataset for {ACRONYM} has {train_df.shape[0]} rows, the validate dataset {validate_df.shape[0]} rows, the test dataset {test_df.shape[0]} rows")

The train dataset for gw has 2758 rows, the validate dataset 1379 rows, the test dataset 1379 rows


## Save the cleaned data

In [29]:
def save_dataset(filename, path, df):
    """Write ``df`` as a CSV file named ``filename`` inside a new zip archive at ``path``.

    Parameters
    ----------
    filename
        Name of the CSV member stored inside the archive.
    path
        Location of the zip archive to create; the archive is opened in
        write mode, so an existing file at that location is replaced.
    df
        Object providing ``to_csv()`` (a pandas DataFrame in this notebook);
        its CSV text becomes the content of the archive member.
    """
    # ZipFile defaults to ZIP_STORED (no compression at all); request DEFLATE
    # explicitly so the "zipped" file is actually compressed.
    with zf.ZipFile(path, 'w', compression=zf.ZIP_DEFLATED) as ziparchive:
        ziparchive.writestr(filename, df.to_csv())

def save_datasets(name, train, validate, test):
    """Store the train, validate and test dataframes as zipped CSV files.

    Each part is written to ``<name>_<part>.csv.zip`` inside ``DATA_PATH``;
    the archive contains a single member called ``<name>_<part>.csv``.
    """
    parts = (("train", train), ("validate", validate), ("test", test))
    for part, frame in parts:
        csv_name = f"{name}_{part}.csv"
        archive_path = os.path.join(DATA_PATH, f"{csv_name}.zip")
        save_dataset(csv_name, archive_path, frame)

In [33]:
# Save the cleaned data: the three splits are written to DATA_PATH as
# <ACRONYM>_train.csv.zip, <ACRONYM>_validate.csv.zip and <ACRONYM>_test.csv.zip.
save_datasets(ACRONYM, train_df, validate_df, test_df)