In [1]:
import pandas as pd
import os

import sys
sys.path.append('/mnt/0A2AAC152AABFBB7/CGE/paper-fetcher')
from paper_fetcher.clean import EmailDropper, FinalSymbolsDropper, ElectronicAddressDropper
from paper_fetcher.clean import AffiliationSelector, ParenthesisDropper, CountrySelector
from paper_fetcher.clean import USA, Netherlands, UnitedKingdom, China, Australia, Spain, Finland

from paper_fetcher.utils import recover_columns_names

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Directory holding the PubMed publication exports used by this notebook.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
FOLDER = '/mnt/0A2AAC152AABFBB7/data/PubMedPapers/'

# Load the filtered publications table into memory.
publications_path = os.path.join(FOLDER, 'publications_filtered_v6.csv')
df_all = pd.read_csv(publications_path)

In [3]:
# Preview the loaded table; pandas truncates the middle rows of a long frame.
df_all

Unnamed: 0,PubMed ID,Title,Publication Year,Author,Affiliations,Country
0,38853292,"Physical frailty, genetic predisposition, and ...",2024,Yucong Zhang,"Department of Geriatrics, Institute of Geronto...",China.
1,38853292,"Physical frailty, genetic predisposition, and ...",2024,Yucong Zhang,"Key Laboratory of Vascular Aging, Ministry of ...",China.
2,38853292,"Physical frailty, genetic predisposition, and ...",2024,Man Liu,"Department of Geriatrics, Institute of Geronto...",China.
3,38853292,"Physical frailty, genetic predisposition, and ...",2024,Man Liu,"Key Laboratory of Vascular Aging, Ministry of ...",China.
4,38853292,"Physical frailty, genetic predisposition, and ...",2024,Jiajun Li,Division of Cardiothoracic and Vascular Surger...,China.
...,...,...,...,...,...,...
187548,12848262,Polymorphic sequence variants in medicine: a c...,2003,Shirley V Hodgson,"St George's Hospital Medical School, and St Ge...",London. s.hodgson@sghms.ac.uk
187549,12767753,UK Biobank: a project in search of a protocol?,2003,Virginia Barbour,"The Lancet, 32 Jamestown Road, NW1 7BY, London...",UK. virginia.barbour@lancet.com <virginia.barb...
187550,12693011,"[Ethical, legal, and social issues of genome r...",2003,Tohru Masui,"Cellbank (JCRB), Department of Genetics and Mu...",Japan. masui@nihs.go.jp
187551,12049178,Gene-environment interactions--the BioBank UK ...,2002,A F Wright,"MRC Human Genetics Unit, Western General Hospi...",UK. alan.wright@hgu.mrc.ac.uk


In [4]:
# Column dtypes and non-null counts. In the recorded output, Title is the only
# column with missing values (187171 non-null out of 187553 rows).
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187553 entries, 0 to 187552
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   PubMed ID         187553 non-null  int64 
 1   Title             187171 non-null  object
 2   Publication Year  187553 non-null  int64 
 3   Author            187553 non-null  object
 4   Affiliations      187553 non-null  object
 5   Country           187553 non-null  object
dtypes: int64(2), object(4)
memory usage: 8.6+ MB


In [5]:
# Distinct raw affiliation strings. The recorded output shows that some end
# with a trailing period and some carry an e-mail address after the country.
df_all["Affiliations"].unique()

array(['Department of Geriatrics, Institute of Gerontology, Tongji Hospital of Tongji Medical College, Huazhong University of Science and Technology, Wuhan, China.',
       'Key Laboratory of Vascular Aging, Ministry of Education, Tongji Hospital of Tongji Medical College, Huazhong University of Science and Technology, Wuhan, China.',
       'Division of Cardiothoracic and Vascular Surgery, Tongji Hospital, Tongji Medical College, Huazhong University of Science and Technology, Wuhan, China.',
       ...,
       'Cellbank (JCRB), Department of Genetics and Mutagenesis, National Institute of Health Sciences, 1-18-1, Kami-yoga, Setagaya-ku, Tokyo 158-8501, Japan. masui@nihs.go.jp',
       'MRC Human Genetics Unit, Western General Hospital, Edinburgh, UK. alan.wright@hgu.mrc.ac.uk',
       'Department of Pathological Biochemistry, Royal Infirmary, University/NHS Trust, Glasgow, UK.'],
      dtype=object)

In [6]:
# Ordered cleaning stages for the raw "Affiliations" strings. All transformers
# are imported from paper_fetcher.clean; FinalSymbolsDropper appears twice in
# the sequence (as 'last_sym' and again as 'last_sym1').
affi_stages = [
    ('email', EmailDropper()),
    ('last_sym', FinalSymbolsDropper()),
    ('eadd_drop', ElectronicAddressDropper()),
    ('affi_sel', AffiliationSelector()),
    ('par_drop', ParenthesisDropper()),
    ('last_sym1', FinalSymbolsDropper()),
    ('country', CountrySelector()),
]

# Each stage is asked to emit pandas objects before it is placed in the pipeline.
affi_pipe = Pipeline([
    (label, stage.set_output(transform="pandas")) for label, stage in affi_stages
])

# Run the pipeline on the "Affiliations" column only; every other column is
# passed through unchanged.
aff_trans = ColumnTransformer(
    transformers=[('affi_pipe', affi_pipe, ["Affiliations"])],
    remainder="passthrough",
).set_output(transform="pandas")



In [7]:
# Fit the affiliation transformer on the full table and apply it.
df_1 = aff_trans.fit_transform(df_all)

# Swap the transformer-generated column labels for the names produced by the
# project helper recover_columns_names (paper_fetcher.utils).
df_1_names = recover_columns_names(df_1.columns)
df_1.columns = df_1_names

# Display the result; the recorded output shows a new lowercase `country`
# column next to the original `Country` column.
df_1

Unnamed: 0,Affiliations,country,PubMed ID,Title,Publication Year,Author,Country
0,"Department of Geriatrics, Institute of Geronto...",China,38853292,"Physical frailty, genetic predisposition, and ...",2024,Yucong Zhang,China.
1,"Key Laboratory of Vascular Aging, Ministry of ...",China,38853292,"Physical frailty, genetic predisposition, and ...",2024,Yucong Zhang,China.
2,"Department of Geriatrics, Institute of Geronto...",China,38853292,"Physical frailty, genetic predisposition, and ...",2024,Man Liu,China.
3,"Key Laboratory of Vascular Aging, Ministry of ...",China,38853292,"Physical frailty, genetic predisposition, and ...",2024,Man Liu,China.
4,Division of Cardiothoracic and Vascular Surger...,China,38853292,"Physical frailty, genetic predisposition, and ...",2024,Jiajun Li,China.
...,...,...,...,...,...,...,...
187548,"St George's Hospital Medical School, and St Ge...",London,12848262,Polymorphic sequence variants in medicine: a c...,2003,Shirley V Hodgson,London. s.hodgson@sghms.ac.uk
187549,"The Lancet, 32 Jamestown Road, NW1 7BY, London...",UK,12767753,UK Biobank: a project in search of a protocol?,2003,Virginia Barbour,UK. virginia.barbour@lancet.com <virginia.barb...
187550,"Cellbank , Department of Genetics and Mutagene...",Japan,12693011,"[Ethical, legal, and social issues of genome r...",2003,Tohru Masui,Japan. masui@nihs.go.jp
187551,"MRC Human Genetics Unit, Western General Hospi...",UK,12049178,Gene-environment interactions--the BioBank UK ...,2002,A F Wright,UK. alan.wright@hgu.mrc.ac.uk


In [8]:
# Per-country transformers from paper_fetcher.clean, applied in this order to
# the extracted `country` column.
# NOTE(review): presumably each one normalises spelling variants of its
# country; confirm against the paper_fetcher.clean implementations.
count_stages = [
    ('usa', USA()),
    ('net', Netherlands()),
    ('uk', UnitedKingdom()),
    ('china', China()),
    ('aus', Australia()),
    ('fin', Finland()),
    ('esp', Spain()),
]

# Each stage is asked to emit pandas objects before it is placed in the pipeline.
count_pipe = Pipeline([
    (label, stage.set_output(transform="pandas")) for label, stage in count_stages
])

# Run the pipeline on the "country" column only; every other column is passed
# through unchanged.
count_trans = ColumnTransformer(
    transformers=[('count_pipe', count_pipe, ["country"])],
    remainder="passthrough",
).set_output(transform="pandas")

In [9]:
# Fit the country transformer on the affiliation-cleaned table and apply it.
df_2 = count_trans.fit_transform(df_1)

# Swap the transformer-generated column labels for the names produced by the
# project helper recover_columns_names (paper_fetcher.utils).
df_2_names = recover_columns_names(df_2.columns)
df_2.columns = df_2_names

# Display the result; in the recorded output the transformed `country` column
# now comes first, followed by the passed-through columns.
df_2

Unnamed: 0,country,Affiliations,PubMed ID,Title,Publication Year,Author,Country
0,China,"Department of Geriatrics, Institute of Geronto...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Yucong Zhang,China.
1,China,"Key Laboratory of Vascular Aging, Ministry of ...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Yucong Zhang,China.
2,China,"Department of Geriatrics, Institute of Geronto...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Man Liu,China.
3,China,"Key Laboratory of Vascular Aging, Ministry of ...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Man Liu,China.
4,China,Division of Cardiothoracic and Vascular Surger...,38853292,"Physical frailty, genetic predisposition, and ...",2024,Jiajun Li,China.
...,...,...,...,...,...,...,...
187548,London,"St George's Hospital Medical School, and St Ge...",12848262,Polymorphic sequence variants in medicine: a c...,2003,Shirley V Hodgson,London. s.hodgson@sghms.ac.uk
187549,UK,"The Lancet, 32 Jamestown Road, NW1 7BY, London...",12767753,UK Biobank: a project in search of a protocol?,2003,Virginia Barbour,UK. virginia.barbour@lancet.com <virginia.barb...
187550,Japan,"Cellbank , Department of Genetics and Mutagene...",12693011,"[Ethical, legal, and social issues of genome r...",2003,Tohru Masui,Japan. masui@nihs.go.jp
187551,UK,"MRC Human Genetics Unit, Western General Hospi...",12049178,Gene-environment interactions--the BioBank UK ...,2002,A F Wright,UK. alan.wright@hgu.mrc.ac.uk


In [10]:
# Persist the cleaned table next to the input file.
# NOTE(review): to_csv is called with its default arguments, so the row index
# is written as an extra unnamed first column; confirm downstream readers
# expect that before changing it.
cleaned_path = os.path.join(FOLDER, 'publications_cleaned.csv')
df_2.to_csv(cleaned_path)

In [11]:
# Sanity check of the extracted country labels. The recorded output still
# contains non-country residue (e.g. 'No', 'BC', 'P', 'MD', '251 Bayview Blvd',
# 'Fudan University', 'Cambridge UK') and spelling duplicates ('México' and
# 'Mexico'), so the column needs further cleaning before per-country analysis.
df_2['country'].unique().tolist()

['China',
 'UK',
 'Sweden',
 'Chile',
 'Australia',
 'USA',
 'Italy',
 'Germany',
 'Ireland',
 'Greece',
 'Czech Republic',
 'Spain',
 'France',
 'Netherlands',
 'Republic of Korea',
 'Belgium',
 'Finland',
 'Canada',
 'Denmark',
 'Iceland',
 'Austria',
 'Estonia',
 'Cyprus',
 'No',
 'Switzerland',
 'BC',
 'South Africa',
 'Uganda',
 'P',
 'Burkina Faso',
 'Ghana',
 'Kenya',
 'Japan',
 'Indonesia',
 'Hungary',
 'Norway',
 'México',
 'Brazil',
 'Argentina',
 'Colombia',
 'Dominican Republic',
 'Mexico',
 'Peru',
 'Uruguay',
 'Costa Rica',
 '251 Bayview Blvd',
 'Cambridge UK',
 'School of Traditional Chinese Medicine and School of Informatics are two departments of Hunan University of Chinese Medicine',
 'The Second People&apos',
 'Iraq',
 'Fudan University',
 'Department of Public Health & Primary Care University of Cambridge Cambridge UK',
 'Macedonia',
 'Singapore',
 'Philippines',
 'MD',
 'United Arab Emirates',
 'New Zealand',
 'Taiwan',
 'Thailand',
 'Lithuania',
 'Israel',
 'Bahra