In [1]:
import pandas as pd
import numpy as np

import time
import pickle
import memory_profiler

%load_ext memory_profiler
from pathlib import Path

In [2]:
import black
import jupyter_black

# Load the jupyter_black extension, configured for a 79-column line length.
jupyter_black.load(line_length=79)

In [3]:
# Enable autoreload (mode 2) so that edits to the project modules imported
# below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from pubmed_landscape_src.data import import_all_files, xml_import

In [4]:
# Output locations for this analysis, relative to the working directory.
results_root = Path("../../results")
variables_path = results_root / "variables" / "2024_baseline"
figures_path = results_root / "figures" / "2024_baseline"

# Shared cluster directory for the processed PubMed tables.
berenslab_data_path = Path("/gpfs01/berens/data/data/pubmed_processed")

We extract the following fields from PubMed's metadata `.xml` files:
- PubMed ID
- title
- abstract
- language
- journal title
- ISSN
- publication date
- first names of the first and last authors
- affiliations of the first and last authors


# Parse data

## 2024 baseline

In [None]:
%%time

# Directory that holds the 2024 baseline files to be parsed.
path = "/gpfs01/berens/data/data/pubmed/2024_baseline/"

# Parse the files under `path` into a single table; the cell output lists
# each .xml file name as it is handled.
# NOTE(review): `order_files=True` presumably makes the helper process the
# files in sorted order -- confirm in pubmed_landscape_src.data.
files_2024_df = import_all_files(path, order_files=True)

# save results
files_2024_df.to_pickle(variables_path / "files_2024_df")

pubmed24n0001.xml
pubmed24n0002.xml
pubmed24n0003.xml
pubmed24n0004.xml
pubmed24n0005.xml
pubmed24n0006.xml
pubmed24n0007.xml
pubmed24n0008.xml
pubmed24n0009.xml
pubmed24n0010.xml
pubmed24n0011.xml
pubmed24n0012.xml
pubmed24n0013.xml
pubmed24n0014.xml
pubmed24n0015.xml
pubmed24n0016.xml
pubmed24n0017.xml
pubmed24n0018.xml
pubmed24n0019.xml
pubmed24n0020.xml
pubmed24n0021.xml
pubmed24n0022.xml
pubmed24n0023.xml
pubmed24n0024.xml
pubmed24n0025.xml
pubmed24n0026.xml
pubmed24n0027.xml
pubmed24n0028.xml
pubmed24n0029.xml
pubmed24n0030.xml
pubmed24n0031.xml
pubmed24n0032.xml
pubmed24n0033.xml
pubmed24n0034.xml
pubmed24n0035.xml
pubmed24n0036.xml
pubmed24n0037.xml
pubmed24n0038.xml
pubmed24n0039.xml
pubmed24n0040.xml
pubmed24n0041.xml
pubmed24n0042.xml
pubmed24n0043.xml
pubmed24n0044.xml
pubmed24n0045.xml
pubmed24n0046.xml
pubmed24n0047.xml
pubmed24n0048.xml
pubmed24n0049.xml
pubmed24n0050.xml
pubmed24n0051.xml
pubmed24n0052.xml
pubmed24n0053.xml
pubmed24n0054.xml
pubmed24n0055.xml
pubmed24n0

In [13]:
# Total number of rows in the parsed table.
print("There are {} papers".format(len(files_2024_df)))

There are 36555430 papers


In [19]:
# Store a copy of the full parsed table in the shared processed-data
# directory. The target is the same file as before, but it is now built from
# the configured `berenslab_data_path` instead of a repeated literal.
files_2024_df.to_pickle(berenslab_data_path / "files_2024_df")

# Explore

In [14]:
# Rows whose language code is anything other than "eng".
is_non_english = files_2024_df["Language"].ne("eng")
files_2024_df_non_eng = files_2024_df.loc[is_non_english]

# Report how many there are.
print("There are {} non-english papers".format(len(files_2024_df_non_eng)))

There are 4845747 non-english papers


In [15]:
# Rows whose abstract is the empty string.
has_empty_abstract = files_2024_df["AbstractText"].eq("")
files_2024_df_empty = files_2024_df.loc[has_empty_abstract]

# Report how many there are.
print("There are {} empty abstracts".format(len(files_2024_df_empty)))

There are 11275594 empty abstracts


In [16]:
# Rows whose abstract is NOT the empty string.
has_abstract = files_2024_df["AbstractText"].ne("")
files_2024_df_not_empty = files_2024_df.loc[has_abstract]

# Report how many there are.
print(
    "There are {} not empty abstracts".format(len(files_2024_df_not_empty))
)

There are 25279836 not empty abstracts


In [17]:
# Rows that have a non-empty abstract AND the language code "eng".
has_abstract = files_2024_df["AbstractText"].ne("")
is_english = files_2024_df["Language"].eq("eng")
files_2024_df_english_not_empty = files_2024_df.loc[has_abstract & is_english]

# Report how many there are.
print(
    "There are {} not empty english abstracts".format(
        len(files_2024_df_english_not_empty)
    )
)

There are 23677538 not empty english abstracts


### Cut off = 4000

In [18]:
cut_off = 4000

print(
    "Before cut off, there are {} papers".format(
        len(files_2024_df_english_not_empty)
    )
)

# Character length of every abstract, as an int64 array in the same row
# order as files_2024_df_english_not_empty.
abstracts = files_2024_df_english_not_empty["AbstractText"].tolist()
len_strings = map(len, abstracts)
len_abstracts = np.fromiter(len_strings, dtype=np.int64, count=len(abstracts))

# Keep only abstracts strictly shorter than the cut-off.
files_2024_df_cutoff = files_2024_df_english_not_empty.loc[
    len_abstracts < cut_off
]
print(
    "After cut off, there are {} papers".format(len(files_2024_df_cutoff))
)

Before cut off, there are 23677538 papers
After cut off, there are 23660437 papers


### Threshold = 250

In [20]:
threshold = 250

# Lengths of the abstracts that passed the cut-off, in the same row order as
# files_2024_df_cutoff.
len_short_abstracts = len_abstracts[len_abstracts < cut_off]

# Keep only abstracts strictly longer than the threshold.
files_2024_df_cutoff_threshold = files_2024_df_cutoff.loc[
    len_short_abstracts > threshold
]
print(
    "After threshold, there are {} papers".format(
        len(files_2024_df_cutoff_threshold)
    )
)

After threshold, there are 23416995 papers


# Filter

Filter out:
- non-English papers
- papers with empty abstracts
- papers whose abstracts have 250 characters or fewer, or 4000 characters or more
- papers with unfinished abstracts (no `.`, `?` or `!` among the last two characters)

In [5]:
# Reload the parsed table that was pickled under `variables_path`.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this pipeline.
files_2024_df = pd.read_pickle(variables_path / "files_2024_df")

In [20]:
# Preview the first rows of the reloaded table.
files_2024_df.head()

Unnamed: 0,PMID,Title,AbstractText,Language,Journal,Date,NameFirstAuthor,NameLastAuthor,ISSN,AffiliationFirstAuthor,AffiliationLastAuthor,filename
0,1,Formate assay in body fluids: application in m...,,eng,Biochemical medicine,1975 Jun,A B,T R,0006-2944,,,pubmed24n0001.xml
1,2,Delineation of the intimate details of the bac...,,eng,Biochemical and biophysical research communica...,1975 Oct 27,K S,R H,1090-2104,,,pubmed24n0001.xml
2,3,Metal substitutions incarbonic anhydrase: a ha...,,eng,Biochemical and biophysical research communica...,1975 Oct 27,R J,R G,0006-291X,,,pubmed24n0001.xml
3,5,Atomic models for the polypeptide backbones of...,,eng,Biochemical and biophysical research communica...,1975 Oct 27,W A,K B,1090-2104,,,pubmed24n0001.xml
4,4,Effect of chloroquine on cultured fibroblasts:...,,eng,Biochemical and biophysical research communica...,1975 Oct 27,U N,N N,1090-2104,,,pubmed24n0001.xml


## Empty abstracts and non-english papers

In [6]:
%%time

print("Before, there were {} papers".format(files_2024_df.shape[0]))

# Eliminate empty abstracts
clean_2024_df = files_2024_df[files_2024_df.AbstractText != ""]

print(
    "After eliminating empty abstracts, there are {} papers".format(
        clean_2024_df.shape[0]
    )
)

# Eliminate non-english papers
clean_2024_df = clean_2024_df[clean_2024_df.Language == "eng"]

# print size
print(
    "After first cleaning, there are {} papers".format(clean_2024_df.shape[0])
)

Before, there were 36555430 papers
After eliminating empty abstracts, there are 25279836 papers
After first cleaning, there are 23677538 papers
CPU times: user 17.4 s, sys: 713 ms, total: 18.2 s
Wall time: 18 s


## Threshold and cut off

### Cut off = 4000

In [7]:
cut_off = 4000

print("Before cut off, there are {} papers".format(len(clean_2024_df)))

# Character length of every abstract, as an int64 array in row order.
abstracts = clean_2024_df["AbstractText"].tolist()
len_strings = map(len, abstracts)
len_abstracts = np.fromiter(len_strings, dtype=np.int64, count=len(abstracts))

# Keep only abstracts strictly shorter than the cut-off.
clean_2024_df = clean_2024_df.loc[len_abstracts < cut_off]
print("After cut off, there are {} papers".format(len(clean_2024_df)))

Before cut off, there are 23677538 papers
After cut off, there are 23660437 papers


### Threshold = 250

In [8]:
threshold = 250

# Measure lengths on the current frame rather than re-slicing the
# `len_abstracts` array left over from the cut-off cell: that array only
# lines up with `clean_2024_df` on the first run, so re-executing this cell
# used to fail with a boolean-index length mismatch.
len_short_abstracts = clean_2024_df["AbstractText"].str.len().to_numpy()

# Keep only abstracts strictly longer than the threshold.
clean_2024_df = clean_2024_df[len_short_abstracts > threshold]
print("After threshold, there are {} papers".format(clean_2024_df.shape[0]))

After threshold, there are 23416995 papers


## Remove the "ABSTRACT TRUNCATED AT" marker (and any text after it) from abstracts


In [9]:
# The abstract column as a Series and as a plain Python list.
# NOTE(review): neither name is read before both are rebound in the
# "Remove unfinished abstracts" section -- candidate for removal.
abstracts = clean_2024_df["AbstractText"]
abstracts_list = abstracts.tolist()

In [10]:
%%time
# BUENO
clean_2024_df.AbstractText = list(
    map(
        lambda x, y: x[: y - 1] if y != -1 else x,
        clean_2024_df.AbstractText,
        clean_2024_df.AbstractText.str.find("ABSTRACT TRUNCATED AT"),
    )
)

CPU times: user 30.6 s, sys: 841 ms, total: 31.5 s
Wall time: 31.4 s


In [11]:
# Row and column count after the marker removal.
clean_2024_df.shape

(23416995, 12)

## Remove unfinished abstracts

In [12]:
# Take the current abstract column, both as a Series and as a Python list,
# and report how many entries it has.
abstracts = clean_2024_df["AbstractText"]
abstracts_list = abstracts.tolist()
print(len(abstracts_list))

23416995


In [13]:
%%time
end_abstracts = [x[-2:] for x in abstracts_list]

CPU times: user 4.58 s, sys: 691 ms, total: 5.27 s
Wall time: 5.27 s


In [14]:
%%time
point_index = np.array([x.find(".") for x in end_abstracts])
question_index = np.array([x.find("?") for x in end_abstracts])
exclamation_index = np.array([x.find("!") for x in end_abstracts])

CPU times: user 9.85 s, sys: 104 ms, total: 9.96 s
Wall time: 9.95 s


In [15]:
%%time

print("Before cleaning, there are {} papers".format(clean_2024_df.shape[0]))

# Eliminate unfinished abstracts
clean_2024_df = clean_2024_df[
    (point_index != -1) | (question_index != -1) | (exclamation_index != -1)
]

# print size
print("After cleaning, there are {} papers".format(clean_2024_df.shape[0]))

Before cleaning, there are 23416995 papers
After cleaning, there are 23389083 papers
CPU times: user 7.63 s, sys: 399 ms, total: 8.03 s
Wall time: 7.98 s


In [16]:
# Row and column count after dropping unfinished abstracts.
clean_2024_df.shape

(23389083, 12)

In [18]:
# Save the cleaned table to the shared processed-data directory. The target is
# the same file as before, but it is now built from the configured
# `berenslab_data_path` instead of a repeated literal.
# NOTE(review): nothing is written under `variables_path` in this cell.
clean_2024_df.to_pickle(berenslab_data_path / "clean_2024_df")

In [23]:
# Reload clean_2024_df from the shared processed-data directory, which is
# where this notebook writes it. The earlier read from `variables_path` could
# pick up a stale or missing file on a fresh run, because no cell writes the
# cleaned table there.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this pipeline.
clean_2024_df = pd.read_pickle(berenslab_data_path / "clean_2024_df")

In [24]:
# Preview the first rows of the reloaded cleaned table.
clean_2024_df.head()

Unnamed: 0,PMID,Title,AbstractText,Language,Journal,Date,NameFirstAuthor,NameLastAuthor,ISSN,AffiliationFirstAuthor,AffiliationLastAuthor,filename
21,24,Influence of a new virostatic compound on the ...,"The virostatic compound N,N-diethyl-4-[2-(2-ox...",eng,Arzneimittel-Forschung,1975 Sep,H,G,0004-4172,,,pubmed24n0001.xml
22,23,Effect of etafenone on total and regional myoc...,The distribution of blood flow to the subendoc...,eng,Arzneimittel-Forschung,1975 Sep,H,W,0004-4172,,,pubmed24n0001.xml
24,25,Pharmacological properties of new neuroleptic ...,"RMI 61 140, RMI 61 144 and RMI 61 280 are newl...",eng,Arzneimittel-Forschung,1975 Sep,L,A,0004-4172,,,pubmed24n0001.xml
29,30,Lysosomal hydrolases of the epidermis. I. Glyc...,Seven distinct glycosidases (EC 3.2) have been...,eng,The British journal of dermatology,1975 Jul,P D,J J,0007-0963,,,pubmed24n0001.xml
31,32,A serum haemagglutinating property dependent u...,A serum agglutinin reactive with red cells in ...,eng,British journal of haematology,1975 Jan,M L,W L,0007-1048,,,pubmed24n0001.xml
