In [None]:
import sys
import os

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Put the parent directory on the import path (presumably where the local
# `paper_fetcher` package lives), unless it is already listed there.
library_path = os.path.abspath(os.pardir)
if sys.path.count(library_path) == 0:
    sys.path.append(library_path)

from paper_fetcher.affi_cleaner import EmailDropper, FirstNonLetter, ElectronicAddressDropper, SubstringDropper

from paper_fetcher.utils import recover_columns_names

In [None]:
# Directory holding the PubMed CSV exports. The default is the original local
# drive location; set the PUBMED_DATA_DIR environment variable to point the
# notebook at another directory without editing this cell.
FOLDER = os.environ.get('PUBMED_DATA_DIR', '/media/luis/LaCie/data1/PubMedPapers/')

# Load the v6 filtered publications table.
df_all = pd.read_csv(
    os.path.join(FOLDER, 'publications_filtered_v6.csv')
)

# Force the affiliation column to str; later cells call string methods on it.
# NOTE(review): with classic object-dtype columns, astype(str) renders missing
# values as the literal text 'nan' — confirm that is acceptable downstream.
df_all["Affiliations"] = df_all["Affiliations"].astype(str)

# Independent copy restricted to the three columns this notebook works with,
# so edits to df_base never touch df_all.
df_base = df_all[['PubMed ID', 'Author', 'Affiliations']].copy()
df_base

In [None]:
# Ordered (step name, transformer) pairs for the first affiliation clean-up.
_affi_steps = [
    ('fst2', FirstNonLetter()),
    ('email', EmailDropper()),
    ('elec', ElectronicAddressDropper()),
]

# Each step is switched to pandas output before being placed in the pipeline.
affi_pipe = Pipeline([
    (label, step.set_output(transform="pandas"))
    for label, step in _affi_steps
])

# Run the pipeline on the "Affiliations" column only; every other column is
# passed through unchanged.
aff_trans = ColumnTransformer(
    transformers=[('affi_pipe', affi_pipe, ["Affiliations"])],
    remainder="passthrough",
)

aff_trans.set_output(transform="pandas")

In [None]:
# Fit and apply the affiliation clean-up transformer to the base frame.
affi_cleaned = aff_trans.fit_transform(df_base)

# Replace the transformer's output column labels with whatever
# recover_columns_names returns for this frame (presumably the original,
# un-prefixed column names — confirm against paper_fetcher.utils).
affi_cleaned.columns = recover_columns_names(affi_cleaned)

df_base = affi_cleaned
df_base

In [None]:
# Keep only the rows whose affiliation text contains a ';', renumbered from 0.
has_semicolon = df_base['Affiliations'].str.contains(';')
semicolons = df_base.loc[has_semicolon].reset_index(drop=True)

# Trim surrounding spaces, then trailing periods, then trailing semicolons
# (in that order).
# NOTE(review): the trailing '.' is removed here, before the later
# StrReplace(old='U.S.A.') step runs, so a value ending in 'U.S.A.' reaches
# that step as 'U.S.A' — confirm the replacement still applies.
semicolons["Affiliations"] = (
    semicolons["Affiliations"]
    .str.strip(' ')
    .str.rstrip('.')
    .str.rstrip(';')
)

In [None]:
from paper_fetcher.affi_cleaner import ParenthesisDropper, SubstringDropper, StrReplace

# Ordered (step name, transformer) pairs for the substring clean-up stage.
# NOTE(review): 'sub1' and 'sub2' are configured identically
# (substring='No.2'); one of them may have been meant to target a different
# substring — confirm before removing or changing either.
_substring_steps = [
    ('par', ParenthesisDropper()),
    ('sub1', SubstringDropper(substring='No.2')),
    ('sub2', SubstringDropper(substring='No.2')),
    ('usa', StrReplace(old='U.S.A.', new='USA')),
    ('prc', StrReplace(old='P.R. China', new='China')),
]

# Each step is switched to pandas output before being placed in the pipeline.
substrings = Pipeline([
    (label, step.set_output(transform="pandas"))
    for label, step in _substring_steps
])

# Run the pipeline on the "Affiliations" column only; every other column is
# passed through unchanged.
subs_trans = ColumnTransformer(
    transformers=[('subs_pipe', substrings, ["Affiliations"])],
    remainder="passthrough",
)

subs_trans.set_output(transform="pandas")

In [None]:
# Fit and apply the substring clean-up transformer to the ';'-containing rows.
semicolons_subbed = subs_trans.fit_transform(semicolons)

# Replace the transformer's output column labels with whatever
# recover_columns_names returns for this frame (presumably the original,
# un-prefixed column names — confirm against paper_fetcher.utils).
semicolons_subbed.columns = recover_columns_names(semicolons_subbed)

semicolons = semicolons_subbed

In [None]:
# Split every affiliation string on ';' and give each piece its own row; the
# other columns are repeated for every piece and the index is renumbered.
# Each piece is then stripped of surrounding whitespace: splitting "A; B" on
# ';' alone would otherwise leave " B" with a leading space, unlike the
# unsplit affiliations, which have their surrounding spaces trimmed.
semicolons = (
    semicolons
    .assign(Affiliations=semicolons['Affiliations'].str.split(';'))
    .explode('Affiliations')
    .assign(Affiliations=lambda frame: frame['Affiliations'].str.strip())
    .reset_index(drop=True)
)
semicolons

In [None]:
# Keep only the rows whose affiliation text has no ';', renumbered from 0.
lacks_semicolon = ~df_base['Affiliations'].str.contains(';')
no_semicolons = df_base.loc[lacks_semicolon].reset_index(drop=True)

# Trim surrounding spaces, then trailing periods, then trailing semicolons
# (in that order).
# NOTE(review): the trailing '.' is removed here, before the later
# StrReplace(old='U.S.A.') step runs, so a value ending in 'U.S.A.' reaches
# that step as 'U.S.A' — confirm the replacement still applies.
no_semicolons["Affiliations"] = (
    no_semicolons["Affiliations"]
    .str.strip(' ')
    .str.rstrip('.')
    .str.rstrip(';')
)

In [None]:
# Fit and apply the substring clean-up transformer to the rows without ';'.
no_semicolons_subbed = subs_trans.fit_transform(no_semicolons)

# Replace the transformer's output column labels with whatever
# recover_columns_names returns for this frame (presumably the original,
# un-prefixed column names — confirm against paper_fetcher.utils).
no_semicolons_subbed.columns = recover_columns_names(no_semicolons_subbed)

no_semicolons = no_semicolons_subbed
no_semicolons

In [None]:
# Write both result frames into FOLDER as CSV files, without the index column.
for frame, filename in (
    (semicolons, 'publications_filtered_v7_semicolons.csv'),
    (no_semicolons, 'publications_filtered_v7_no_semicolons.csv'),
):
    frame.to_csv(os.path.join(FOLDER, filename), index=False)