# Clean and tokenize the data
Open the data (metadata and abstracts) downloaded manually from OpenAlex (see README).
* Keep only entries with an abstract

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
import zipfile as zf
import pickle
import os

In [31]:
DATA_PATH = 'data'  # path to the folder with (downloaded or generated) data; will not be committed to git
ACRONYM = 'openalex_tng'  # acronym, aka machine-readable name, of this dataset ('tng' stands for "topic network graph")

In [20]:
# Read the manually downloaded CSV; its first column becomes the dataframe index.
csv_path = os.path.join(DATA_PATH, f"{ACRONYM}.csv")
data_df = pd.read_csv(csv_path, index_col=0)

In [21]:
# Report the size of the table as loaded, before any cleaning.
n_rows, n_cols = data_df.shape
print(f"The downloaded data set has {n_rows} rows and {n_cols} columns.")

The downloaded data set has 7041 rows and 174 columns.


In [22]:
# List data columns as one comma-separated string (the column index is joined directly)
', '.join(data_df.columns)

'doi, title, display_name, relevance_score, publication_year, publication_date, language, type, type_crossref, indexed_in, countries_distinct_count, institutions_distinct_count, corresponding_author_ids, corresponding_institution_ids, fwci, has_fulltext, fulltext_origin, cited_by_count, is_retracted, is_paratext, locations_count, datasets, versions, referenced_works_count, referenced_works, related_works, ngrams_url, cited_by_api_url, updated_date, created_date, ids.openalex, ids.doi, ids.mag, ids.pmid, ids.pmcid, primary_location.is_oa, primary_location.landing_page_url, primary_location.pdf_url, primary_location.source.id, primary_location.source.display_name, primary_location.source.issn_l, primary_location.source.issn, primary_location.source.is_oa, primary_location.source.is_in_doaj, primary_location.source.is_core, primary_location.source.host_organization, primary_location.source.host_organization_name, primary_location.source.host_organization_lineage, primary_location.source.h

In [26]:
# Preview the first three rows to sanity-check the loaded columns and values.
data_df.head(3)

Unnamed: 0_level_0,doi,title,display_name,relevance_score,publication_year,publication_date,language,type,type_crossref,indexed_in,...,locations.source.type,locations.source,sustainable_development_goals.id,sustainable_development_goals.display_name,sustainable_development_goals.score,grants.funder,grants.funder_display_name,grants.award_id,counts_by_year.year,counts_by_year.cited_by_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
https://openalex.org/W2150220236,https://doi.org/10.1007/s11192-009-0146-3,"Software survey: VOSviewer, a computer program...","Software survey: VOSviewer, a computer program...",133.38614,2009,2009-12-31,en,article,journal-article,crossref|pubmed,...,journal|repository|repository|repository|repos...,nan|nan|nan|nan|nan|nan|nan,,,,,,,2024|2023|2022|2021|2020|2019|2018|2017|2016|2...,1879|2959|2336|1561|908|540|322|151|123|94|75|...
https://openalex.org/W2138621811,https://doi.org/10.1145/324133.324140,Authoritative sources in a hyperlinked environ...,Authoritative sources in a hyperlinked environ...,129.95451,1999,1999-09-01,en,article,journal-article,crossref,...,journal|repository,nan|nan,https://metadata.un.org/sdg/15,Life on land,0.43,,,,2024|2023|2022|2021|2020|2019|2018|2017|2016|2...,95|233|207|331|353|351|359|344|449|457|502|502...
https://openalex.org/W2148606196,https://doi.org/10.1137/s003614450342480,The Structure and Function of Complex Networks,The Structure and Function of Complex Networks,95.296745,2003,2003-01-01,en,article,journal-article,arxiv|crossref|datacite,...,journal|repository|metadata,nan|nan|nan,,,,,,,2024|2023|2022|2021|2020|2019|2018|2017|2016|2...,332|561|716|817|821|940|930|977|953|1074|1082|...


## Keep only entries with an abstract

In [23]:
# Boolean mask: True for every row whose abstract is missing.
idx = data_df['abstract'].isna()
n_missing = idx.sum()
# Keep only the rows that do have an abstract.
data_df = data_df.loc[~idx]
print(f"Dropped {n_missing} entries without abstract.")

Dropped 1525 entries without abstract.


## Split the data into train / validate / test datasets

"train"
>    The percentage of the texts reserved for fitting the model: 50%

"validate"
>    The percentage of the texts reserved for computing perplexity when fitting the model's k-parameter and searching for the best parameters: 25%

"test"
>    The percentage of the texts reserved for testing hypotheses: 25%

In [25]:
RANDOM_SEED = 42  # fixed seed so the train / validate / test split is identical on every run


def split(df, random_state=None):
    """Split the data into train (~50%), validate (~25%) and test (~25%) parts.

    Parameters
    ----------
    df
        The data to split (a pandas DataFrame in this notebook).
    random_state : int or None, optional
        Seed forwarded to both ``train_test_split`` calls. With ``None``
        (the default) every call produces a different random split, which
        is the behaviour this function had before the parameter existed.

    Returns
    -------
    tuple
        ``(train, validate, test)``, in that order.
    """
    # First cut: half of the rows go to train, the other half is held out.
    train, holdout = train_test_split(df, test_size=0.5, random_state=random_state)
    # Second cut: the held-out half is divided equally into validate and test.
    validate, test = train_test_split(holdout, test_size=0.5, random_state=random_state)
    return train, validate, test

# Seed the split: otherwise each re-run reshuffles the rows and the saved
# train / validate / test files silently change content.
train_df, validate_df, test_df = split(data_df, random_state=RANDOM_SEED)
print(f"The train dataset for gw has {train_df.shape[0]} rows, the validate dataset {validate_df.shape[0]} rows, the test dataset {test_df.shape[0]} rows")

The train dataset for gw has 2758 rows, the validate dataset 1379 rows, the test dataset 1379 rows


## Save the cleaned data

In [29]:
def save_dataset(filename, path, df):
    """Write ``df`` as a CSV file inside a new, compressed zip archive.

    Parameters
    ----------
    filename : str
        Name of the CSV member stored inside the archive.
    path : str
        Location of the zip archive to create; an existing file at this
        path is overwritten (the archive is opened in ``'w'`` mode).
    df : pandas.DataFrame
        Data to serialise with ``df.to_csv()``.
    """
    # ZipFile defaults to ZIP_STORED (no compression at all); request DEFLATE
    # explicitly so the "zipped" file is actually smaller than the raw CSV.
    with zf.ZipFile(path, 'w', compression=zf.ZIP_DEFLATED) as ziparchive:
        ziparchive.writestr(filename, df.to_csv())

def save_datasets(name, train, validate, test):
    """Store the train, validate and test dataframes as zip files in DATA_PATH.

    For each split a file ``<name>_<split>.csv.zip`` is written via
    ``save_dataset``, with ``<name>_<split>.csv`` as the member name.
    """
    splits = (("train", train), ("validate", validate), ("test", test))
    for label, frame in splits:
        csv_name = f"{name}_{label}.csv"
        archive_path = os.path.join(DATA_PATH, f"{csv_name}.zip")
        save_dataset(csv_name, archive_path, frame)

In [33]:
# Save the cleaned data: the train, validate and test splits are written as zipped CSV files under DATA_PATH
save_datasets(ACRONYM, train_df, validate_df, test_df)