# Clean Data

In [None]:
import numpy as np
import pandas as pd
import pyarrow.dataset as ds
import glob

In [31]:
# Directory holding the parquet part files of the updated works table.
works_updated = '../Articles/OA-WorksUpdated/works_updated.parquet'
# glob returns matches in arbitrary order; sort them so the same part files
# are selected on every run and on every machine.
files_wa = sorted(glob.glob(works_updated + '/*.parquet'))
# Only the first two part files are loaded to keep the working set small.
files_wa = files_wa[:2]
if not files_wa:
    # Fail with an explicit message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No parquet part files found under {works_updated!r}")
dataframes_wa = []
for file in files_wa:
    df = pd.read_parquet(file)
    dataframes_wa.append(df)
# Stack the parts into one frame with a fresh 0..n-1 index.
meta = pd.concat(dataframes_wa, ignore_index = True)

In [32]:
# List the columns available in the loaded works table.
meta.columns

Index(['id', 'doi', 'display_name', 'title', 'publication_year',
       'publication_date', 'ids', 'host_venue', 'primary_location',
       'best_oa_location', 'type', 'open_access', 'authorships',
       'cited_by_count', 'biblio', 'is_retracted', 'is_paratext', 'concepts',
       'mesh', 'alternate_host_venues', 'locations', 'referenced_works',
       'related_works', 'counts_by_year', 'cited_by_api_url', 'updated_date',
       'created_date', 'abstract_inverted_index', 'pmid'],
      dtype='object')

In [33]:
def has_pmid(ids_list):
    """Return True when one of the (key, value) pairs in `ids_list` has the key 'pmid'.

    `ids_list` is iterated as a sequence of two-item pairs; the value part of
    each pair is ignored.
    """
    for id_kind, _ in ids_list:
        if id_kind == 'pmid':
            return True
    return False

# Get the `limit` most important concepts from each paper (5 by default)
def extract_display_names(concepts_array, limit = 5):
    """Return the display names of the first `limit` concepts of a paper.

    Parameters
    ----------
    concepts_array : sequence of mappings, or None
        Each entry must provide a 'display_name' key.
        NOTE(review): "most important" relies on the entries already being
        ordered by descending relevance; the order is kept as-is - confirm
        against the data source.
    limit : int or None
        Maximum number of names to return; None returns all of them.

    Returns
    -------
    list
        Display names in their original order; empty when there are no concepts.
    """
    if concepts_array is None:
        return []
    # Slicing works for both Python lists and numpy object arrays.
    selected = concepts_array if limit is None else concepts_array[:limit]
    return [concept['display_name'] for concept in selected]

# Keep only the first author(s) of each paper
def extract_author_names(author_array):
    """Collect the display names of authorships whose position is 'first'.

    Each entry of `author_array` is read through its 'author_position' key and,
    when that equals 'first', through entry['author']['display_name'].
    Returns a list (empty when no entry is marked 'first').
    """
    first_authors = []
    for entry in author_array:
        if entry['author_position'] == 'first':
            first_authors.append(entry['author']['display_name'])
    return first_authors

# Extract the PMID from the url
def extract_pmid_number(ids_list):
    """Return the PMID found in the PubMed URL stored under the 'pmid' key.

    Parameters
    ----------
    ids_list : iterable of (key, value) pairs
        The value of the first pair whose key is 'pmid' is used.

    Returns
    -------
    str or None
        The last path segment of the PubMed URL, or None when there is no
        'pmid' entry, the value is not a PubMed URL string, or the URL carries
        no identifier.
    """
    pmid_url = next((url for key, url in ids_list if key == 'pmid'), None)
    # Non-string values (None, NaN, numbers) cannot hold a URL.
    if not isinstance(pmid_url, str) or 'pubmed.ncbi.nlm.nih.gov/' not in pmid_url:
        return None
    # Look only at what follows the host, and drop surrounding slashes so a
    # trailing '/' does not produce an empty identifier.
    path = pmid_url.split('pubmed.ncbi.nlm.nih.gov/', 1)[1].strip('/')
    pmid = path.split('/')[-1]
    return pmid or None

# Keep only the papers whose ids include a 'pmid' entry, and only the columns
# used in the rest of the notebook. Rows and columns are selected in a single
# .loc call and copied, so the column assignments below write to an independent
# frame instead of a slice of the loaded table (avoids SettingWithCopyWarning).
meta = meta.loc[
    meta['ids'].apply(has_pmid),
    ['title', 'abstract_inverted_index', 'publication_date', 'ids', 'concepts', 'authorships', 'cited_by_count', 'related_works', 'cited_by_api_url'],
].copy()
# Replace the raw nested fields with the flattened values used downstream.
# NOTE: after this runs, 'ids' holds PMID strings, so the cell cannot be re-run
# without reloading `meta` first.
meta['ids'] = meta['ids'].apply(extract_pmid_number)
meta['concepts'] = meta['concepts'].apply(extract_display_names)
meta['authorships'] = meta['authorships'].apply(extract_author_names)

In [34]:
# Give the transformed columns names that describe what they now contain.
meta = meta.rename(
    {
        "abstract_inverted_index": "abstract",
        "ids": 'pmid',
        "authorships": "author",
        "cited_by_count": "num_citations",
    },
    axis="columns",
)
# Preview the first five rows.
meta.head()

Unnamed: 0,title,abstract,publication_date,pmid,concepts,author,num_citations,related_works,cited_by_api_url
0,Treatment of Alcohol Withdrawal Syndrome,Treatment of the alcohol withdrawal syndrome i...,1994-01-01,7912939,"[Alcohol withdrawal syndrome, Medicine, Rehabi...",[Vural Özdemir],18,"[https://openalex.org/W4388336948, https://ope...",https://api.openalex.org/works?filter=cites:W2...
4,Tactile Cutaneous Representation in Cerebellar...,Recent studies of the albino rat revealed exte...,1985-01-01,3843070,"[Opossum, Anatomy, Somatosensory system, Forel...",[Wally Welker],18,"[https://openalex.org/W102752321, https://open...",https://api.openalex.org/works?filter=cites:W2...
7,Lipocalin-2 is Associated With a Good Prognosi...,,2013-03-29,23539193,"[Lipocalin, Pancreatic cancer, Epithelial–mese...",[Bin Xu],18,"[https://openalex.org/W2784579557, https://ope...",https://api.openalex.org/works?filter=cites:W2...
8,Pulmonary embolism in an infant,A case of fatal pulmonary embolism in a 13-mon...,1962-11-01,13951639,"[Medicine, Thrombophlebitis, Pulmonary embolis...",[Seth L. Haber],18,"[https://openalex.org/W2079604409, https://ope...",https://api.openalex.org/works?filter=cites:W2...
12,"Nutritional and other types of oedema, albumin...",The various types of oedema in man are conside...,2015-04-06,25844980,"[Kwashiorkor, Medicine, Starling, Glycocalyx, ...",[Michael Golden],18,"[https://openalex.org/W2994001373, https://ope...",https://api.openalex.org/works?filter=cites:W2...


In [35]:
# Number of papers left after keeping only works whose ids include a 'pmid' entry.
len(meta)

185275

In [36]:
# Order papers from most to least cited. Reassigning (rather than inplace=True)
# keeps the step explicit and chainable; the resulting frame is the same.
meta = meta.sort_values(by='num_citations', ascending=False)

In [37]:
# How many papers have exactly 18 citations?
meta.loc[meta['num_citations'] == 18].shape[0]

13362

In [10]:
# Persist the cleaned metadata table as a parquet file (path is relative to the
# current working directory), written with the pyarrow engine.
# NOTE(review): at this point in the notebook 'author' still holds a list per
# paper; it is only reduced to a single name in a later cell - confirm this is
# the intended on-disk format.
meta.to_parquet('compressed_fulldata.parquet', engine='pyarrow')

In [38]:
def _first_or_none(names):
    """Return the first entry of a collection of names, or None when it is empty."""
    # Values that are already flattened (a plain string or None) pass through
    # untouched, so re-running this cell does not cut names down to their
    # first character.
    if names is None or isinstance(names, str):
        return names
    # len() rather than truthiness: also safe for numpy arrays.
    return names[0] if len(names) > 0 else None

# Collapse the per-paper list of first authors into a single name (or None).
meta['author'] = meta['author'].apply(_first_or_none)


In [39]:
# Spot-check the flattened author value of the first row.
meta['author'].iloc[0]

'Vural Özdemir'

In [40]:
# Papers per first-author name, most frequent first.
# NOTE(review): grouping is on the display name only, so different people who
# share a name are counted together - confirm this is acceptable.
author_counts = (
    meta.groupby('author')
    .size()
    .sort_values(ascending=False)
)
# Ten most frequent names.
top_authors = author_counts.iloc[:10]

In [41]:
# Print the ten most frequent first-author names with their paper counts.
print(top_authors)

author
Wei Li                106
A.M. Api               84
Yong Li                46
Rui Zhang              38
Wei Chen               33
Steven A. Kaplan       26
Xi Chen                25
Wei Zhang              22
Bin Zhang              21
Michiyuki Kawakami     20
dtype: int64


In [45]:
# Papers whose first author is listed as 'Wei Chen'.
filter_df = meta.loc[meta['author'] == 'Wei Chen']
# Newest first. NOTE(review): the previews show 'YYYY-MM-DD' values, for which
# this ordering is chronological - confirm the column's dtype/format.
sorted_df = filter_df.sort_values('publication_date', ascending=False)
# Five most recent papers.
recent_papers = sorted_df.iloc[:5]
recent_papers

Unnamed: 0,title,abstract,publication_date,pmid,concepts,author,num_citations,related_works,cited_by_api_url
208622,Enhancing Hit Discovery in Virtual Screening t...,In the hit identification stage of drug discov...,2023-05-11,37167486,"[Free energy perturbation, Virtual screening, ...",Wei Chen,15,"[https://openalex.org/W4380716331, https://ope...",https://api.openalex.org/works?filter=cites:W4...
353286,Time diffraction-free transverse orbital angul...,The discovery of optical transverse orbital an...,2022-07-11,35821372,"[Physics, Angular momentum, Diffraction, Optic...",Wei Chen,13,"[https://openalex.org/W2102517274, https://ope...",https://api.openalex.org/works?filter=cites:W4...
1108451,Macrodactyly of the foot resulting from planta...,The role of the plantar nerve in the pathogene...,2021-08-01,33384233,"[Medicine, Epineurium, Endoneurium, Plantar fa...",Wei Chen,0,"[https://openalex.org/W2087484234, https://ope...",https://api.openalex.org/works?filter=cites:W3...
349288,The interactions between chiral analytes and c...,The goal of the present study was to disclose ...,2021-08-01,34090134,"[Chemistry, Analyte, Hydrogen bond, Chitosan, ...",Wei Chen,13,"[https://openalex.org/W2149581068, https://ope...",https://api.openalex.org/works?filter=cites:W3...
204618,CircRNA Circ-ITCH Inhibits the Proliferation a...,"Circ-ITCH, a novel circRNA, was generated from...",2021-01-01,33571015,"[Glioma, Downregulation and upregulation, Canc...",Wei Chen,15,"[https://openalex.org/W2054919679, https://ope...",https://api.openalex.org/works?filter=cites:W3...
