In [14]:
import pandas as pd
import os

import sys
sys.path.append('/mnt/0A2AAC152AABFBB7/CGE/paper-fetcher')

from paper_fetcher.affi_cleaner import ChangeUnknown, RemoveTwitter, EmailDropper, ParenthesisDropper, FinalSymbolsDropper
from paper_fetcher.affi_cleaner import FirstNonLetter, ORCIDropper, EndingReplacer, StatesFixerCA, StatesFixerUS, Initials
from paper_fetcher.affi_cleaner import Brackets, ElectronicAddressDropper, WhiteSpaces, WebAddresses, AffiliationSelector
from paper_fetcher.affi_cleaner import ListStrReplace, NumbersDropper, ExcedingWhite, SecondAffiSelector, CountrySelector

from paper_fetcher.utils import recover_columns_names

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [15]:
# Directory holding the PubMed publication tables (input and output of this notebook).
FOLDER = '/mnt/0A2AAC152AABFBB7/data/PubMedPapers/'

# Read the filtered publication table, one row per (paper, author, affiliation)
# as shown in the displayed output.
publications_csv = os.path.join(FOLDER, 'publications_filtered_v6.csv')
df_all = pd.read_csv(publications_csv)

# The text cleaners downstream operate on strings, so force the column to str.
# Note that any missing value becomes the literal string "nan" after this cast.
df_all["Affiliations"] = df_all["Affiliations"].astype(str)
df_all

Unnamed: 0,PubMed ID,Title,Publication Year,Author,Affiliations,Country
0,38853292,"Physical frailty, genetic predisposition, and ...",2024,Yucong Zhang,"Department of Geriatrics, Institute of Geronto...",China.
1,38853292,"Physical frailty, genetic predisposition, and ...",2024,Yucong Zhang,"Key Laboratory of Vascular Aging, Ministry of ...",China.
2,38853292,"Physical frailty, genetic predisposition, and ...",2024,Man Liu,"Department of Geriatrics, Institute of Geronto...",China.
3,38853292,"Physical frailty, genetic predisposition, and ...",2024,Man Liu,"Key Laboratory of Vascular Aging, Ministry of ...",China.
4,38853292,"Physical frailty, genetic predisposition, and ...",2024,Jiajun Li,Division of Cardiothoracic and Vascular Surger...,China.
...,...,...,...,...,...,...
187548,12848262,Polymorphic sequence variants in medicine: a c...,2003,Shirley V Hodgson,"St George's Hospital Medical School, and St Ge...",London. s.hodgson@sghms.ac.uk
187549,12767753,UK Biobank: a project in search of a protocol?,2003,Virginia Barbour,"The Lancet, 32 Jamestown Road, NW1 7BY, London...",UK. virginia.barbour@lancet.com <virginia.barb...
187550,12693011,"[Ethical, legal, and social issues of genome r...",2003,Tohru Masui,"Cellbank (JCRB), Department of Genetics and Mu...",Japan. masui@nihs.go.jp
187551,12049178,Gene-environment interactions--the BioBank UK ...,2002,A F Wright,"MRC Human Genetics Unit, Western General Hospi...",UK. alan.wright@hgu.mrc.ac.uk


In [16]:
# Ordered (step name, cleaner) table for the raw "Affiliations" text.
# The order is significant: every step receives the output of the previous one,
# which is why some cleaners (FinalSymbolsDropper, the " ." / " ," fixers)
# appear more than once.
affi_steps = [
    ('fst', FirstNonLetter()),
    ('brck', Brackets()),
    ('email', EmailDropper()),
    ('elec', ElectronicAddressDropper()),
    ('wsp', WhiteSpaces()),
    ('spcomma1', EndingReplacer(old=" ,", new=",")),
    ('spsem', EndingReplacer(old=' ;', new=';')),
    ('spdot1', EndingReplacer(old=' .', new='.')),
    ('dotcom', EndingReplacer(old='.,', new=',')),
    ('twit', RemoveTwitter()),
    ('orcid', ORCIDropper()),
    ('last_sym', FinalSymbolsDropper()),
    ('par_drop', ParenthesisDropper()),
    ('last_sym1', FinalSymbolsDropper()),
    ('us', StatesFixerUS()),
    ('ca', StatesFixerCA()),
    ('init', Initials()),
    ('spdot2', EndingReplacer(old=' .', new='.')),
    ('spcomma2', EndingReplacer(old=" ,", new=",")),
    ('web', WebAddresses()),
    ('final', FinalSymbolsDropper()),
    ('sel', AffiliationSelector()),
]

# Every step is switched to pandas output so DataFrames (with column names)
# flow through the whole pipeline instead of bare arrays.
affi_pipe = Pipeline([
    (label, cleaner.set_output(transform="pandas"))
    for label, cleaner in affi_steps
])

# Clean only the "Affiliations" column; all other columns pass through untouched.
aff_trans = ColumnTransformer(
    [('affi_pipe', affi_pipe, ["Affiliations"])],
    remainder="passthrough",
).set_output(transform="pandas")

In [17]:
# Stage 1: run the affiliation cleaners over the loaded table.
affi_cleaned = aff_trans.fit_transform(df_all)

# Relabel the columns with whatever recover_columns_names returns for the
# transformer's output labels (the displayed result shows plain, un-prefixed names).
df_1 = affi_cleaned.set_axis(
    recover_columns_names(affi_cleaned.columns), axis="columns"
)
df_1

Unnamed: 0,Affiliations,first_affiliation,PubMed ID,Title,Publication Year,Author,Country
0,"Department of Geriatrics, Institute of Geronto...","Department of Geriatrics, Institute of Geronto...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Yucong Zhang,China.
1,"Key Laboratory of Vascular Aging, Ministry of ...","Key Laboratory of Vascular Aging, Ministry of ...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Yucong Zhang,China.
2,"Department of Geriatrics, Institute of Geronto...","Department of Geriatrics, Institute of Geronto...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Man Liu,China.
3,"Key Laboratory of Vascular Aging, Ministry of ...","Key Laboratory of Vascular Aging, Ministry of ...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Man Liu,China.
4,Division of Cardiothoracic and Vascular Surger...,Division of Cardiothoracic and Vascular Surger...,38853292,"Physical frailty, genetic predisposition, and ...",2024,Jiajun Li,China.
...,...,...,...,...,...,...,...
187548,"St George's Hospital Medical School, and St Ge...","St George's Hospital Medical School, and St Ge...",12848262,Polymorphic sequence variants in medicine: a c...,2003,Shirley V Hodgson,London. s.hodgson@sghms.ac.uk
187549,"The Lancet, 32 Jamestown Road, NW1 7BY, London...","The Lancet, 32 Jamestown Road, NW1 7BY, London...",12767753,UK Biobank: a project in search of a protocol?,2003,Virginia Barbour,UK. virginia.barbour@lancet.com <virginia.barb...
187550,"Cellbank, Department of Genetics and Mutagenes...","Cellbank, Department of Genetics and Mutagenes...",12693011,"[Ethical, legal, and social issues of genome r...",2003,Tohru Masui,Japan. masui@nihs.go.jp
187551,"MRC Human Genetics Unit, Western General Hospi...","MRC Human Genetics Unit, Western General Hospi...",12049178,Gene-environment interactions--the BioBank UK ...,2002,A F Wright,UK. alan.wright@hgu.mrc.ac.uk


In [18]:
# Abbreviations that contain a period. They are passed as the `old` list of the
# 'replacer1' ListStrReplace step, paired position-by-position with new_shorts.
# NOTE(review): presumably the period is removed so it is not mistaken for a
# separator between affiliations — confirm against ListStrReplace/SecondAffiSelector.
old_shorts = [
    "Abt.", "Fabeckstr.", "Hardenbergstr.", "Dpto.", "AKh.", "St.",
    "Co.", "Av.", "Univ.", "Dept.", "Inc.", "No.",
    "Th.", "ul.", "Clin.", "Fac.", "Str.", "Dr.", "Sec.",
    "no.", "Lab.", "str.", "Dep.", "pr.", "Kgs.", "Blvd.",
    "Rd.", "El.", "Sdr.", "Card.", " u.", " na.", "Mt.",
    "Ave.", "krt.", "Rkp.", "rkp.", "Al.", "dr.", "Pol.",
    "Dev.", "Bio.", "Gral.", "Depto.", "Jr.", "Dpt.", "Avda.",
    "ibs.", "Depto.", "Dir.", "Ctra. de", "Wm.", "Pr.",
    "Mfg.", "Sct.", "Ul.", "Dm.", "Kh.", "ave.", "Cra.",
    "Reg.", "Res.", "gt.", "Box.", "Ltd.", "Ss.", " per.",
    "Pvt.", "Vas.", "opg.", "Prof.", "Ac.", "Ntra.", "Sra.", "e. V",
]

# Each replacement is simply the abbreviation with its period(s) stripped,
# so derive the list instead of maintaining a second hand-typed copy.
new_shorts = [abbr.replace(".", "") for abbr in old_shorts]

In [19]:
# Hand-curated phrases (fed to the 'replacer2' ListStrReplace step) in which a
# period appears inside what is a single affiliation. old_longs[i] is replaced
# by new_longs[i], so the two lists must stay aligned position-by-position.
# NOTE(review): "Metabolism, Uppsala" maps to itself (a no-op); the effective
# entry is "Metabolism. Uppsala" at the end of the list.
# NOTE(review): "Finland. Oulu" -> "Finland" drops "Oulu", and "Medicine. No" ->
# "Medicine No" drops the separator entirely — kept as-is, confirm both are intended.
old_longs = [
    "College of Medicine. Department", "Disorders Unit. Neurology Service", "Manchester. England", "Research Centre. School",
    "Glasgow. Glasgow", "Trondheim. Norway", "A Coruña. A Coruña", "Finland. Oulu", "Biostatistics. Boston", "Group. Rue",
    "Diabetes. Helmholtz", "Genetics. IMIM", "Epileptology. Clinic", "Liège. Domaine", "Fisioterapia. Facultad", "Durham. DH",
    "Surgery. University", "Medicine. Belfer", "Health. Boston", "Arrixaca. Murcia", "Epidemiology. Harvard", "Medicine. No",
    "Guanjuato. Campus León. León", "Therapies. Fundación", "California. San ", "Medicine. Durham", "University. Wangsimniro",
    "Metabolism, Uppsala", "Institute. Biomedicum", "Department. of", "Institutes. Keyan", "Center. Im Neuenheimer",
    "São Paulo. São Paulo", "Pública. Instituto", "Manchester. M GX", "Group. A Coruña", "Sistémicas. Universidad",
    "Institute. CEI", "Medicine. Bispebjerg", "Salud. Universidad", "Maule. Talca", "EA. Progression", "Hospital. Division",
    "Gasthuisberg. Cardiovascular", "Medicine. UK", "School. Dundee", "Unit. Hospital", "Mr. De Craen", "Sydney. New South",
    "Bank. Universidad", "BioBank. Ciencias", "Cambridge. Cambridge", "Aging. Leiden", "Practice. Great", "Center. Scientific",
    "Genetics. Department", "Diagnostics. School", "Province. Kunming", "Melbourne. Australia", "Bridgetown. Barbados",
    "Diseases. Virgen", "Hospital. Institute", "Movimiento. Instituto", "Cancer. Cancer", "Granada. ibsGRANADA",
    "Institute. SERGASUVIGO", "Iceland. Faculty", "Enfermería. Universidad", "Program. Catalan", "Metabolism. Uppsala"
]
# Fixed typos that were being written into the cleaned affiliations:
#   "Guanjuato,s Campus León, León" (stray "s"), "Hospital,< Institute" (stray "<"),
#   "Institute,  CEI" (double space).
new_longs = [
    "College of Medicine, Department", "Disorders Unit, Neurology Service", "Manchester, England", "Research Centre, School",
    "Glasgow, Glasgow", "Trondheim, Norway", "A Coruña, A Coruña", "Finland", "Biostatistics, Boston", "Group, Rue",
    "Diabetes, Helmholtz", "Genetics, IMIM", "Epileptology, Clinic", "Liège, Domaine", "Fisioterapia, Facultad", "Durham, DH",
    "Surgery, University", "Medicine, Belfer", "Health, Boston", "Arrixaca, Murcia", "Epidemiology, Harvard", "Medicine No",
    "Guanjuato, Campus León, León", "Therapies, Fundación", "California, San ", "Medicine, Durham", "University, Wangsimniro",
    "Metabolism, Uppsala", "Institute, Biomedicum", "Department of", "Institutes, Keyan", "Center, Im Neuenheimer",
    "São Paulo, São Paulo", "Pública, Instituto", "Manchester, M GX", "Group, A Coruña", "Sistémicas, Universidad",
    "Institute, CEI", "Medicine, Bispebjerg", "Salud, Universidad", "Maule, Talca", "EA, Progression", "Hospital, Division",
    "Gasthuisberg, Cardiovascular", "Medicine, UK", "School, Dundee", "Unit, Hospital", "Mr De Craen", "Sydney, New South",
    "Bank, Universidad", "BioBank, Ciencias", "Cambridge, Cambridge", "Aging, Leiden", "Practice, Great", "Center, Scientific",
    "Genetics, Department", "Diagnostics, School", "Province, Kunming", "Melbourne, Australia", "Bridgetown, Barbados",
    "Diseases, Virgen", "Hospital, Institute", "Movimiento, Instituto", "Cancer, Cancer", "Granada, ibsGRANADA",
    "Institute, SERGASUVIGO", "Iceland, Faculty", "Enfermería, Universidad", "Program, Catalan", "Metabolism, Uppsala"
]

# Guard against the two hand-typed lists drifting out of alignment.
assert len(old_longs) == len(new_longs), "old_longs/new_longs must pair up one-to-one"

In [20]:
# Phrases where two affiliations run together with no period after the country
# (fed to the 'replacer3' ListStrReplace step). need_points_old[i] is replaced by
# need_points_new[i], which inserts ". " after the country name.
need_points_old = [
    "USA Department", "USA Core", "Leicester, UK ", "USA Bureau", "UK CNRS", "UK Cardiac", "UK Center", "USA Broad",
    "UK Department", "USA Analytic", "USA Novo", "LE QP, UK ", "London, UK ", "USA Computer", "USA Program",
    "USA Cardiovascular", "Sweden Department", "USA Stanford Cancer", "USA Melvin", "USA Division", "USA Windland",
    "SwedenClinical", "SwedenDepartment"
]
# Fixes: "SwedenDepartment" used to map to itself (a no-op, unlike its sibling
# "SwedenClinical" -> "Sweden. Clinical"), and "USA.  Windland" had a double space.
need_points_new = [
    "USA. Department", "USA. Core", "Leicester, UK. ", "USA. Bureau", "UK. CNRS", "UK. Cardiac", "UK. Center", "USA. Broad",
    "UK. Department", "USA. Analytic", "USA. Novo", "LE QP, UK. ", "London, UK. ", "USA. Computer", "USA. Program",
    "USA. Cardiovascular", "Sweden. Department", "USA. Stanford Cancer", "USA. Melvin", "USA. Division", "USA. Windland",
    "Sweden. Clinical", "Sweden. Department"
]

# Guard against the two hand-typed lists drifting out of alignment.
assert len(need_points_old) == len(need_points_new), "need_points lists must pair up one-to-one"

In [21]:
# Author-note phrases that show up in the affiliation field but are not
# affiliations. They are fed to the 'replacer4' ListStrReplace step and each
# one is replaced by the empty string.
non_aff_old = [
    "Both authors contributed equally",
    "Contributed equally and are considered to be joint last author",
    "Contributed equally to this article as lead authors and supervised the work",
    "Deceased: Dr Lumeng died on June",
    "Feipeng Cui and Yu Sun contributed equally to this work",
    "Equal firstsenior",
    "G Bergamini and R Pepperkok contributed equally to this article as lead authors and supervised the work",
    "From the Institute of Cardiovascular and Medical Sciences",
    "Lead contact",
    "Group leader of Angiogenesis and Tissue Engineering",
    "Joint first authors",
    "Joint senior authors",
]

# One empty replacement per phrase, keeping the two lists the same length.
non_aff_new = ["" for _ in non_aff_old]

In [22]:
# Ordered (step name, cleaner) table for the "first_affiliation" column.
# Generic character clean-up runs first, then the four hand-curated replacement
# lists, and SecondAffiSelector runs last.
first_affi_steps = [
    ('num', NumbersDropper()),
    ('wsp', WhiteSpaces()),
    ('exc1', ExcedingWhite()),
    ('rep2', EndingReplacer(old=".,", new=",")),
    ('rep3', EndingReplacer(old=",,", new=",")),
    ('rep5', EndingReplacer(old="/", new="")),
    ('hyph', EndingReplacer(old="-", new="")),
    ('exc2', ExcedingWhite()),
    ('replacer1', ListStrReplace(old=old_shorts, new=new_shorts)),
    ('replacer2', ListStrReplace(old=old_longs, new=new_longs)),
    ('replacer3', ListStrReplace(old=need_points_old, new=need_points_new)),
    ('replacer4', ListStrReplace(old=non_aff_old, new=non_aff_new)),
    ('scndaffi', SecondAffiSelector()),
]

# Every step is switched to pandas output so DataFrames flow through the pipeline.
frst_affi_pipe = Pipeline([
    (label, cleaner.set_output(transform="pandas"))
    for label, cleaner in first_affi_steps
])

# Apply the pipeline to "first_affiliation" only; other columns pass through untouched.
fst_aff_trans = ColumnTransformer(
    [('affi_pipe', frst_affi_pipe, ["first_affiliation"])],
    remainder="passthrough",
).set_output(transform="pandas")

In [23]:
# Stage 2: refine "first_affiliation" (the displayed result gains an
# "ult_affiliation" column).
first_affi_cleaned = fst_aff_trans.fit_transform(df_1)

# Relabel the columns with whatever recover_columns_names returns for the
# transformer's output labels.
df_2 = first_affi_cleaned.set_axis(
    recover_columns_names(first_affi_cleaned.columns), axis="columns"
)
df_2

Unnamed: 0,first_affiliation,ult_affiliation,Affiliations,PubMed ID,Title,Publication Year,Author,Country
0,"Department of Geriatrics, Institute of Geronto...","Department of Geriatrics, Institute of Geronto...","Department of Geriatrics, Institute of Geronto...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Yucong Zhang,China.
1,"Key Laboratory of Vascular Aging, Ministry of ...","Key Laboratory of Vascular Aging, Ministry of ...","Key Laboratory of Vascular Aging, Ministry of ...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Yucong Zhang,China.
2,"Department of Geriatrics, Institute of Geronto...","Department of Geriatrics, Institute of Geronto...","Department of Geriatrics, Institute of Geronto...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Man Liu,China.
3,"Key Laboratory of Vascular Aging, Ministry of ...","Key Laboratory of Vascular Aging, Ministry of ...","Key Laboratory of Vascular Aging, Ministry of ...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Man Liu,China.
4,Division of Cardiothoracic and Vascular Surger...,Division of Cardiothoracic and Vascular Surger...,Division of Cardiothoracic and Vascular Surger...,38853292,"Physical frailty, genetic predisposition, and ...",2024,Jiajun Li,China.
...,...,...,...,...,...,...,...,...
187548,"St George's Hospital Medical School, and St Ge...","St George's Hospital Medical School, and St Ge...","St George's Hospital Medical School, and St Ge...",12848262,Polymorphic sequence variants in medicine: a c...,2003,Shirley V Hodgson,London. s.hodgson@sghms.ac.uk
187549,"The Lancet, Jamestown Road, NW BY, London, UK","The Lancet, Jamestown Road, NW BY, London, UK","The Lancet, 32 Jamestown Road, NW1 7BY, London...",12767753,UK Biobank: a project in search of a protocol?,2003,Virginia Barbour,UK. virginia.barbour@lancet.com <virginia.barb...
187550,"Cellbank, Department of Genetics and Mutagenes...","Cellbank, Department of Genetics and Mutagenes...","Cellbank, Department of Genetics and Mutagenes...",12693011,"[Ethical, legal, and social issues of genome r...",2003,Tohru Masui,Japan. masui@nihs.go.jp
187551,"MRC Human Genetics Unit, Western General Hospi...","MRC Human Genetics Unit, Western General Hospi...","MRC Human Genetics Unit, Western General Hospi...",12049178,Gene-environment interactions--the BioBank UK ...,2002,A F Wright,UK. alan.wright@hgu.mrc.ac.uk


In [24]:
# Single-step pipeline: CountrySelector, configured to emit pandas output.
country_step = CountrySelector().set_output(transform="pandas")
last_pipe = Pipeline([('country', country_step)])

# Run it on "ult_affiliation" only; every other column passes through untouched.
country_trans = ColumnTransformer(
    [('ctry_sel', last_pipe, ['ult_affiliation'])],
    remainder="passthrough",
).set_output(transform="pandas")

In [25]:
# Stage 3: run the country selector (the displayed result gains a lowercase
# "country" column next to the original "Country" column).
country_tagged = country_trans.fit_transform(df_2)

# Relabel the columns with whatever recover_columns_names returns for the
# transformer's output labels.
df_3 = country_tagged.set_axis(
    recover_columns_names(country_tagged.columns), axis="columns"
)
df_3

Unnamed: 0,ult_affiliation,country,first_affiliation,Affiliations,PubMed ID,Title,Publication Year,Author,Country
0,"Department of Geriatrics, Institute of Geronto...",China,"Department of Geriatrics, Institute of Geronto...","Department of Geriatrics, Institute of Geronto...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Yucong Zhang,China.
1,"Key Laboratory of Vascular Aging, Ministry of ...",China,"Key Laboratory of Vascular Aging, Ministry of ...","Key Laboratory of Vascular Aging, Ministry of ...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Yucong Zhang,China.
2,"Department of Geriatrics, Institute of Geronto...",China,"Department of Geriatrics, Institute of Geronto...","Department of Geriatrics, Institute of Geronto...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Man Liu,China.
3,"Key Laboratory of Vascular Aging, Ministry of ...",China,"Key Laboratory of Vascular Aging, Ministry of ...","Key Laboratory of Vascular Aging, Ministry of ...",38853292,"Physical frailty, genetic predisposition, and ...",2024,Man Liu,China.
4,Division of Cardiothoracic and Vascular Surger...,China,Division of Cardiothoracic and Vascular Surger...,Division of Cardiothoracic and Vascular Surger...,38853292,"Physical frailty, genetic predisposition, and ...",2024,Jiajun Li,China.
...,...,...,...,...,...,...,...,...,...
187548,"St George's Hospital Medical School, and St Ge...",London,"St George's Hospital Medical School, and St Ge...","St George's Hospital Medical School, and St Ge...",12848262,Polymorphic sequence variants in medicine: a c...,2003,Shirley V Hodgson,London. s.hodgson@sghms.ac.uk
187549,"The Lancet, Jamestown Road, NW BY, London, UK",UK,"The Lancet, Jamestown Road, NW BY, London, UK","The Lancet, 32 Jamestown Road, NW1 7BY, London...",12767753,UK Biobank: a project in search of a protocol?,2003,Virginia Barbour,UK. virginia.barbour@lancet.com <virginia.barb...
187550,"Cellbank, Department of Genetics and Mutagenes...",Tokyo Japan,"Cellbank, Department of Genetics and Mutagenes...","Cellbank, Department of Genetics and Mutagenes...",12693011,"[Ethical, legal, and social issues of genome r...",2003,Tohru Masui,Japan. masui@nihs.go.jp
187551,"MRC Human Genetics Unit, Western General Hospi...",UK,"MRC Human Genetics Unit, Western General Hospi...","MRC Human Genetics Unit, Western General Hospi...",12049178,Gene-environment interactions--the BioBank UK ...,2002,A F Wright,UK. alan.wright@hgu.mrc.ac.uk


In [26]:
# Persist the fully cleaned table next to the input file; the row index is
# not written to the CSV.
cleaned_csv = os.path.join(FOLDER, 'publications_affi_cleaned.csv')
df_3.to_csv(cleaned_csv, index=False)