# Clean and tokenize the data
Open the data (metadata and abstracts) downloaded manually from OpenAlex (see README).
* Keep only entries with an abstract

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import zipfile as zf
import pickle
import os

In [2]:
# Folder holding the downloaded or generated data; it is not committed to git.
DATA_PATH = "data"
# Machine-readable name (acronym) of this dataset; 'tng' stands for "topic network graph".
ACRONYM = "openalex_tng"

In [3]:
# Load the raw export; the first CSV column is used as the DataFrame index.
data_df = pd.read_csv(
    os.path.join(DATA_PATH, ACRONYM + ".csv"),
    index_col=0,
)

In [4]:
# Report the size of the raw table (rows x columns).
print(f"The downloaded data set has {len(data_df)} rows and {len(data_df.columns)} columns.")

The downloaded data set has 97 rows and 171 columns.


In [5]:
# Show every column name of the raw table as one comma-separated string
', '.join(data_df.columns)

'doi, title, display_name, relevance_score, publication_year, publication_date, language, type, type_crossref, indexed_in, countries_distinct_count, institutions_distinct_count, corresponding_author_ids, corresponding_institution_ids, apc_list, apc_paid, fwci, has_fulltext, fulltext_origin, cited_by_count, is_retracted, is_paratext, locations_count, datasets, versions, referenced_works_count, referenced_works, related_works, cited_by_api_url, updated_date, created_date, ids.openalex, ids.doi, ids.mag, ids.pmid, ids.pmcid, primary_location.is_oa, primary_location.landing_page_url, primary_location.pdf_url, primary_location.source.id, primary_location.source.display_name, primary_location.source.issn_l, primary_location.source.issn, primary_location.source.is_oa, primary_location.source.is_in_doaj, primary_location.source.is_core, primary_location.source.host_organization, primary_location.source.host_organization_name, primary_location.source.host_organization_lineage, primary_location.

## Data cleanup
* Keep only columns used in further analysis (USED_COLS)
* Filter out rows with missing data in the required columns (REQUIRED_COLS)
* Remove duplicates

In [6]:
# Columns carried forward into the analysis
USED_COLS = [
    'authorships.author.display_name',
    'publication_year',
    'title',
    'primary_location.source.display_name',
    'doi',
    'has_fulltext',
    'is_retracted',
    'primary_location.is_accepted',
    'primary_location.is_published',
    'abstract',
    'keywords.display_name',
    'citation_normalized_percentile.value',
    'open_access.is_oa',
    'authorships.countries',
]
# Columns that must hold a value: every used column except the three listed below
REQUIRED_COLS = [
    colname
    for colname in USED_COLS
    if colname not in ('doi', 'open_access.is_oa', 'authorships.countries')
]

In [7]:
# Restrict the raw table to the columns listed in USED_COLS (all rows kept)
clean_df = data_df.loc[:, USED_COLS]

In [8]:
# Walk through REQUIRED_COLS one at a time and drop the rows lacking a value there.
# The counts are sequential: each one is taken on the rows left by the previous columns.
for required_col in REQUIRED_COLS:
    has_value = clean_df[required_col].notna()
    n_missing = (~has_value).sum()
    clean_df = clean_df[has_value]
    print(f"Dropped {n_missing} entries without {required_col}.")

Dropped 0 entries without authorships.author.display_name.
Dropped 0 entries without publication_year.
Dropped 0 entries without title.
Dropped 7 entries without primary_location.source.display_name.
Dropped 0 entries without has_fulltext.
Dropped 0 entries without is_retracted.
Dropped 0 entries without primary_location.is_accepted.
Dropped 0 entries without primary_location.is_published.
Dropped 12 entries without abstract.
Dropped 0 entries without keywords.display_name.
Dropped 0 entries without citation_normalized_percentile.value.


## Handle duplicates

In [9]:
# Remove rows that are exact duplicates of an earlier row (compared over all columns).
n_duplicates = clean_df.duplicated().sum()
print(f"Found {n_duplicates} duplicated rows")
# Reassign instead of using `inplace=True`: `clean_df` was produced by filtering another
# frame, and mutating such a result in place can raise pandas' SettingWithCopyWarning.
clean_df = clean_df.drop_duplicates()

Found 1 duplicated rows


## Remove retracted and not reviewed articles

In [10]:
# Drop every article flagged as retracted and report how many were flagged
retracted_mask = clean_df['is_retracted']
n_retracted = retracted_mask.sum()
clean_df = clean_df.loc[~retracted_mask]
print(f"Count of articles that have been retacted: {n_retracted}")

Count of articles that have been retacted: 0


In [11]:
# Keep an article when it is accepted OR published; drop it when it is neither
accepted_or_published = (
    clean_df['primary_location.is_accepted'] | clean_df['primary_location.is_published']
)
n_neither = (~accepted_or_published).sum()
clean_df = clean_df[accepted_or_published]
print(f"Count of articles that have not been published or accepted: {n_neither}")

Count of articles that have not been published or accepted: 12


In [12]:
# Display (rows, columns) of the table after the filtering steps above
clean_df.shape

(65, 14)

## Feature engineering
One-hot encode keywords and countries.

In [13]:
# Each cell holds the article's keywords as one '|'-separated string.
# Expand them into one 0/1 indicator column per distinct keyword and append to the table.
one_hot_keywords = clean_df['keywords.display_name'].str.get_dummies(sep='|')
clean_df = clean_df.join(one_hot_keywords, how='left')
print(f"Added {len(one_hot_keywords.columns)} columns for keywords")

Added 117 columns for keywords


In [14]:
# Display (rows, columns) of the keyword indicator table
one_hot_keywords.shape

(65, 117)

In [15]:
# Each cell holds the authors' countries as one '|'-separated string.
# Expand them into one 0/1 indicator column per distinct country and append to the table.
one_hot_countries = clean_df['authorships.countries'].str.get_dummies(sep='|')
clean_df = clean_df.join(one_hot_countries, how='left')
print(f"Added {len(one_hot_countries.columns)} columns for countries")

Added 34 columns for countries


In [16]:
# Report the size of the table after cleaning and one-hot encoding.
print(f"The dataset now has {len(clean_df)} rows and {len(clean_df.columns)} columns.")

The dataset now has 65 rows and 165 columns.


## Save the cleaned data

In [17]:
def save_dataset(filename, path, df, compression=zf.ZIP_DEFLATED):
    """Write ``df`` as a single CSV member inside a new zip archive.

    Parameters
    ----------
    filename : str
        Name given to the CSV member stored inside the archive.
    path : str or path-like
        Location of the zip archive to create. The archive is opened in
        ``'w'`` mode, so an existing file at this location is overwritten.
    df : pandas.DataFrame
        Table to serialise. It is written with ``df.to_csv()``, i.e. including
        the index.
    compression : int, optional
        ``zipfile`` compression method. Defaults to ``zipfile.ZIP_DEFLATED`` so
        the archive is actually compressed; ``ZipFile``'s own default,
        ``ZIP_STORED``, stores the member uncompressed.
    """
    with zf.ZipFile(path, 'w', compression=compression) as ziparchive:
        ziparchive.writestr(filename, df.to_csv())

In [18]:
# The archive is named after the CSV it contains, with ".zip" appended.
csvname = ACRONYM + "_clean.csv"
path = os.path.join(DATA_PATH, csvname + ".zip")
save_dataset(filename=csvname, path=path, df=clean_df)