In [53]:
import sys
import os

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Add the parent directory to sys.path so the local paper_fetcher package can be imported
library_path = os.path.abspath('..')
if library_path not in sys.path:
    sys.path.append(library_path)

from paper_fetcher.affi_cleaner import EmailDropper, FirstNonLetter, StrReplace

from paper_fetcher.utils import recover_columns_names

In [54]:
# Directory that holds the publication CSV files. The PUBMED_PAPERS_DIR
# environment variable overrides it on machines where the data lives
# elsewhere; when it is unset the original location is used.
FOLDER = os.environ.get(
    'PUBMED_PAPERS_DIR',
    '/media/luisggon/LaCie/data1/PubMedPapers',
)

df_all = pd.read_csv(
    os.path.join(FOLDER, 'publications_filtered_v6.csv')
)

# Force the affiliation column to str before the string-based cleaning below.
# NOTE(review): on an object-dtype column astype(str) renders missing values
# as the literal text 'nan' -- confirm the v6 file has no empty affiliations.
df_all["Affiliations"] = df_all["Affiliations"].astype(str)

# Work on an independent copy restricted to the three columns used below.
df_base = df_all[['PubMed ID', 'Author', 'Affiliations']].copy()
df_base

Unnamed: 0,PubMed ID,Author,Affiliations
0,38853292,Yucong Zhang,"Department of Geriatrics, Institute of Geronto..."
1,38853292,Yucong Zhang,"Key Laboratory of Vascular Aging, Ministry of ..."
2,38853292,Man Liu,"Department of Geriatrics, Institute of Geronto..."
3,38853292,Man Liu,"Key Laboratory of Vascular Aging, Ministry of ..."
4,38853292,Jiajun Li,Division of Cardiothoracic and Vascular Surger...
...,...,...,...
187548,12848262,Shirley V Hodgson,"St George's Hospital Medical School, and St Ge..."
187549,12767753,Virginia Barbour,"The Lancet, 32 Jamestown Road, NW1 7BY, London..."
187550,12693011,Tohru Masui,"Cellbank (JCRB), Department of Genetics and Mu..."
187551,12049178,A F Wright,"MRC Human Genetics Unit, Western General Hospi..."


In [55]:
# Exclude every row of PubMed ID 30260755 and renumber the index from 0.
df_base = (
    df_base
    .loc[lambda frame: frame['PubMed ID'].ne(30260755)]
    .reset_index(drop=True)
)

In [56]:
def _emit_pandas(step):
    """Call ``set_output(transform="pandas")`` on ``step`` and return the result."""
    return step.set_output(transform="pandas")


# Literal affiliation rewrites as (step name, old text, new text), turned into
# StrReplace steps in exactly this order.
# Order matters: rep20, rep21, rep24 and rep29 search for text that contains
# ' and', so they are listed ahead of rep3 (old=' and', new=""). rep15 and
# rep19 appear to spell their targets the way they look after rep3 has run.
# NOTE(review): StrReplace lives in paper_fetcher; if it does plain substring
# replacement, short patterns such as ' and' / ' the' would also clip words
# that merely start with those letters -- confirm.
_AFFILIATION_REPLACEMENTS = [
    (
        'rep20',
        "Social, Genetic and Developmental Psychiatry Centre; Institute of Psychiatry, Psychology & Neuroscience, King's College London",
        "Social, Genetic Developmental Psychiatry Centre, Institute of Psychiatry, Psychology & Neuroscience, King's College London",
    ),
    (
        'rep21',
        "Center for Genetic Epidemiology and Genomics (X.-B.M.)",
        "Center for Genetic Epidemiology and Genomics, China",
    ),
    (
        'rep22',
        'National Center of Competence in Research "Kidney.CH"',
        "National Center of Competence in Research Kidney.CH, Switzerland",
    ),
    (
        'rep23',
        "Department of Biomedical Sciences; Department of Internal Medicine, Armed Forces Capital Hospital",
        "Department of Biomedical Sciences, Department of Internal Medicine, Armed Forces Capital Hospital, Gyeonggi-do, Korea.",
    ),
    (
        'rep24',
        "Centre for Innovation and Precision Eye Health; and Department of Ophthalmology",
        "Centre for Innovation and Precision Eye Health, and Department of Ophthalmology",
    ),
    (
        'rep29',
        'Section of Gerontology and Geriatrics; Leiden University ',
        'Section of Gerontology and Geriatrics, Leiden University ',
    ),
    (
        'rep30',
        "Department of Internal Medicine B - Cardiology, Pneumology, Infectious Diseases, Intensive Care Medicine;",
        "Department of Internal Medicine B - Cardiology, Pneumology, Infectious Diseases, Intensive Care Medicine,",
    ),
    ('elec', 'Electronic address:', ""),
    ('rep1', '&#x2019;', "'"),
    ('rep2', '&apos;', "'"),
    ('rep3', ' and', ""),
    ('rep4', ' the', ""),
    ('rep5', 'from ', ""),
    ('rep6', 'From ', ""),
    ('rep7', ';;', ";"),
    ('rep8', '&amp;', ""),
    ('rep9', 'Université; de', "Université de"),
    ('rep10', 'Rehabilitation in Multiple Sclerosis', ""),
    ('rep11', 'E-mail:', ""),
    ('rep12', 'Fax: 82-2-393-6884', ""),
    ('rep13', 'email:', ""),
    ('rep14', 'Email:', ""),
    ('rep15', 'Department of Biostatistics Epidemiology;', ""),
    ('rep16', 'Università;', "Università"),
    ('rep17', '(CIBERONC)', ", Madrid"),
    ('rep18', '(CNIO),', ", Madrid"),
    (
        'rep19',
        "Social, Genetic Developmental Psychiatry Centre; Institute of Psychiatry, Psychology & Neuroscience; King's College London",
        "Social, Genetic Developmental Psychiatry Centre, Institute of Psychiatry, Psychology & Neuroscience, King's College London",
    ),
    ('rep25', 'NIMTLab', "NIMTLab, Switzerland"),
    ('rep26', 'Deputy Scientific Secretary IOIBD;', ""),
    ('rep27', 'Chair, IOIBD;', ""),
    (
        'rep28',
        'Department of Internal Medicine; Kidney Research Institute,',
        "Department of Internal Medicine, Kidney Research Institute,",
    ),
]

# Two fixed steps first (FirstNonLetter, EmailDropper), then one StrReplace
# step per table entry, each configured for pandas output.
affi_pipe = Pipeline(
    [
        ('fst2', _emit_pandas(FirstNonLetter())),
        ('email', _emit_pandas(EmailDropper())),
    ]
    + [
        (step_name, _emit_pandas(StrReplace(old=old_text, new=new_text)))
        for step_name, old_text, new_text in _AFFILIATION_REPLACEMENTS
    ]
)

# Run the pipeline on the "Affiliations" column only; all other columns are
# passed through unchanged.
aff_trans = ColumnTransformer(
    [('affi_pipe', affi_pipe, ["Affiliations"])],
    remainder="passthrough",
)

aff_trans.set_output(transform="pandas")

In [57]:
# Fit and apply the affiliation clean-up transformer to the base frame.
affi_cleaned = aff_trans.fit_transform(df_base)
# recover_columns_names (paper_fetcher.utils) supplies the column labels to
# use -- presumably the original names without the transformer prefixes;
# the displayed result lists Affiliations first, then the passthrough columns.
affi_cleaned.columns = recover_columns_names(affi_cleaned)
df_base = affi_cleaned
df_base

Unnamed: 0,Affiliations,PubMed ID,Author
0,"Department of Geriatrics, Institute of Geronto...",38853292,Yucong Zhang
1,"Key Laboratory of Vascular Aging, Ministry of ...",38853292,Yucong Zhang
2,"Department of Geriatrics, Institute of Geronto...",38853292,Man Liu
3,"Key Laboratory of Vascular Aging, Ministry of ...",38853292,Man Liu
4,"Division of Cardiothoracic Vascular Surgery, T...",38853292,Jiajun Li
...,...,...,...
187469,"St George's Hospital Medical School, St George...",12848262,Shirley V Hodgson
187470,"The Lancet, 32 Jamestown Road, NW1 7BY, London...",12767753,Virginia Barbour
187471,"Cellbank (JCRB), Department of Genetics Mutage...",12693011,Tohru Masui
187472,"MRC Human Genetics Unit, Western General Hospi...",12049178,A F Wright


In [58]:
# Rows whose cleaned affiliation text still contains a ';'. These are handled
# separately from the rest of df_base.
contains_semicolon = df_base['Affiliations'].str.contains(';')
semicolons = df_base.loc[contains_semicolon].reset_index(drop=True)

In [59]:
from paper_fetcher.affi_cleaner import ParenthesisDropper, SubstringDropper, StrReplace

# Second-stage clean-up steps, in execution order.
# NOTE(review): 'sub1' and 'sub2' are configured identically (substring='No.2');
# one of them may have been meant for a different substring -- confirm.
substring_steps = [
    ('par', ParenthesisDropper()),
    ('sub1', SubstringDropper(substring='No.2')),
    ('sub2', SubstringDropper(substring='No.2')),
    ('usa', StrReplace(old='U.S.A.', new='USA')),
    ('prc', StrReplace(old='P.R. China', new='China')),
]

# Every step is switched to pandas output before it enters the pipeline.
substrings = Pipeline([
    (label, step.set_output(transform="pandas"))
    for label, step in substring_steps
])

# Apply the pipeline to "Affiliations" only and pass the other columns through.
subs_trans = ColumnTransformer(
    [('subs_pipe', substrings, ["Affiliations"])],
    remainder="passthrough",
)

subs_trans.set_output(transform="pandas")

In [60]:
# Run the parenthesis/substring clean-up on the multi-affiliation rows, then
# relabel the columns with the names returned by recover_columns_names.
semicolons_subs = subs_trans.fit_transform(semicolons)
semicolons_subs.columns = recover_columns_names(semicolons_subs)
semicolons = semicolons_subs

import re

def remove_punctuation_edges(text: str) -> str:
    """Strip whitespace and punctuation from both ends of ``text``.

    Every leading and trailing character that is not a word character
    (letter, digit or underscore) is removed, so mixed runs such as
    ``". ;"`` or ``" ;. "`` disappear completely. Characters inside the
    string are left untouched.

    Args:
        text: Affiliation string to trim, or ``None``.

    Returns:
        The trimmed string, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None

    # Whitespace has to belong to the stripped class: if only punctuation is
    # matched, an edge like "USA. ;" loses the ';' but keeps its period,
    # because the space in between stops the match.
    return re.sub(r'^\W+|\W+$', '', text)

# Trim the outer edges of every multi-affiliation string.
semicolons['Affiliations'] = semicolons['Affiliations'].apply(remove_punctuation_edges)

In [61]:
# Show the multi-affiliation rows after the substring clean-up and edge trimming.
semicolons

Unnamed: 0,Affiliations,PubMed ID,Author
0,"Mackenzie Wearables Research Hub, Charles Perk...",38852004,Le Wei
1,"Mackenzie Wearables Research Hub, Charles Perk...",38852004,Matthew N Ahmadi
2,"School of Human Movement Nutrition Sciences, T...",38852004,Stewart Trost
3,"Mackenzie Wearables Research Hub, Charles Perk...",38852004,Emmanuel Stamatakis
4,"Department of Clinical Epidemiology, Shengjing...",38849058,Rongchang Guo
...,...,...,...
9063,"Inserm , CESP , U1018, Environmental Epidemiol...",23423446,Pascal Guénel
9064,"Program in Molecular Genetic Epidemiology, Har...",23423446,Peter Kraft
9065,"School of Public Health, Peking University, Xu...",22325671,L Li
9066,"Nuffield Department of Obstetrics Gynaecology,...",25328660,Stephen H Kennedy


In [62]:
# Break each multi-affiliation string on ';' and give every fragment its own
# row; explode repeats the other columns of the row for each fragment.
semicolons['Affiliations'] = semicolons['Affiliations'].str.split(';')
semicolons = semicolons.explode('Affiliations').reset_index(drop=True)

# Fragments that followed a '; ' separator start with a space, and input such
# as 'A; ; B' yields a blank fragment. Trim each fragment and drop the blank
# ones so that every remaining row holds an actual affiliation.
semicolons['Affiliations'] = semicolons['Affiliations'].str.strip()
semicolons = semicolons[semicolons['Affiliations'].ne('')].reset_index(drop=True)
semicolons

Unnamed: 0,Affiliations,PubMed ID,Author
0,"Mackenzie Wearables Research Hub, Charles Perk...",38852004,Le Wei
1,"School of Health Sciences, Faculty of Medicin...",38852004,Le Wei
2,"Mackenzie Wearables Research Hub, Charles Perk...",38852004,Matthew N Ahmadi
3,"School of Health Sciences, Faculty of Medicin...",38852004,Matthew N Ahmadi
4,"School of Human Movement Nutrition Sciences, T...",38852004,Stewart Trost
...,...,...,...
48691,"Chinese Academy of Medical Sciences, Beijing,...",22325671,L Li
48692,"Nuffield Department of Obstetrics Gynaecology,...",25328660,Stephen H Kennedy
48693,"World Endometriosis Research Foundation, Lond...",25328660,Stephen H Kennedy
48694,"Nuffield Department of Obstetrics Gynaecology,...",25328660,Krina T Zondervan


In [63]:
# Complement of the ';' rows: affiliations without any semicolon.
single_entry_mask = ~df_base['Affiliations'].str.contains(';')
no_semicolons = df_base.loc[single_entry_mask].reset_index(drop=True)

# Trim surrounding spaces, then trailing periods, then trailing semicolons.
# The last step cannot match here, since rows containing ';' were excluded.
no_semicolons["Affiliations"] = (
    no_semicolons["Affiliations"]
    .str.strip(' ')
    .str.rstrip('.')
    .str.rstrip(';')
)

In [64]:
# Run the same parenthesis/substring clean-up on the single-affiliation rows,
# then relabel the columns with the names returned by recover_columns_names.
no_semicolons_subs = subs_trans.fit_transform(no_semicolons)
no_semicolons_subs.columns = recover_columns_names(no_semicolons_subs)
no_semicolons = no_semicolons_subs
no_semicolons

Unnamed: 0,Affiliations,PubMed ID,Author
0,"Department of Geriatrics, Institute of Geronto...",38853292,Yucong Zhang
1,"Key Laboratory of Vascular Aging, Ministry of ...",38853292,Yucong Zhang
2,"Department of Geriatrics, Institute of Geronto...",38853292,Man Liu
3,"Key Laboratory of Vascular Aging, Ministry of ...",38853292,Man Liu
4,"Division of Cardiothoracic Vascular Surgery, T...",38853292,Jiajun Li
...,...,...,...
178401,"St George's Hospital Medical School, St George...",12848262,Shirley V Hodgson
178402,"The Lancet, 32 Jamestown Road, NW1 7BY, London...",12767753,Virginia Barbour
178403,"Cellbank , Department of Genetics Mutagenesis,...",12693011,Tohru Masui
178404,"MRC Human Genetics Unit, Western General Hospi...",12049178,A F Wright


In [65]:
# Write both result frames into FOLDER as CSV files without the index column.
v7_outputs = {
    'publications_filtered_v7_semicolons.csv': semicolons,
    'publications_filtered_v7_no_semicolons.csv': no_semicolons,
}
for file_name, frame in v7_outputs.items():
    frame.to_csv(os.path.join(FOLDER, file_name), index=False)