In [1]:
'''
 * @ Author: Yohei Ohto
 * @ Create Time: 2024-11-26 17:36:55
 * @ Modified time: 2024-11-26 18:33:40
 * @ Description: ftpでDLしたPubMedのデータを.dbに加工する
 '''

'\n * @ Author: Yohei Ohto\n * @ Create Time: 2024-11-26 17:36:55\n * @ Modified time: 2024-11-26 18:33:40\n * @ Description: ftpでDLしたPubMedのデータを.dbに加工する\n '

In [2]:
import csv
import datetime
import glob
import sqlite3
import xml.etree.ElementTree as ET

import ftlangdetect
from tqdm import tqdm

In [3]:
# Parse the first PubMed file as a sample and keep both the tree and its
# root element for the exploratory cells below.
sample_xml_path = "/workspace/HDD_ohto/01-DATA/pubmed_22/PubMed/pubmed22n0001.xml"
tree = ET.parse(sample_xml_path)
root = tree.getroot()

# 階層構造を可視化する
何が入っているかを確認し、エレメントの詳細を確認する  
各エレメントの紹介は以下のページに存在  
https://wayback.archive-it.org/org-350/20240220194809/https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html

## other abstは一旦含まない。

## TRUNCATEDのデータは、省略を示す文言のみを削除して使用
ABSTRACT TRUNCATED AT 250 WORDS  
ABSTRACT TRUNCATED AT 400 WORDS  
ABSTRACT TRUNCATED (このメッセージは、1996 年に最大長が 4,096 文字に引き上げられてから、まれに表示されました。)  
2000 年以降に作成されたレコードの要約の最大長は 10,000 文字です  
→ 古い記事なので、学習に使用しないという選択肢もあり

## AbstTextが複数ある場合 (Abstが章立てされているなどの場合)は一つの文字列にする

# PMIDはおそらくすべての記事(Abstのない記事も含む)に存在している → 全てのPMIDを取ってきて、Abstractは欠損値を許可する
abstが英語であるかの判定を入れる　→ 参考　https://qiita.com/yuichi0625/items/c5ffd6f45a7cf30c9477

In [8]:
import ftlangdetect

def identify_eng_by_fasttext(data: str) -> int:
    """Flag whether ``data`` is detected as English by fastText.

    Args:
        data: Text to classify.

    Returns:
        1 when the detected language code is ``"en"``, otherwise 0.
    """
    # fastText predicts one line at a time and rejects embedded newlines,
    # so fold line breaks into spaces before calling the detector.
    single_line = data.replace("\r", " ").replace("\n", " ")
    detected = ftlangdetect.detect(single_line, low_memory=False)
    return 1 if detected["lang"] == "en" else 0

In [36]:
# Collect the PMID, the abstract text and an English flag for every record
# under `root`. The three lists stay index-aligned.

# Notices attached to truncated abstracts; they are not part of the abstract
# itself, so they are stripped before the text is stored.
truncation_notices = (
    "(ABSTRACT TRUNCATED AT 250 WORDS)",
    "(ABSTRACT TRUNCATED AT 400 WORDS)",
    "(ABSTRACT TRUNCATED)",
)

pmids = []
absts = []
abst_eng = []

for article in tqdm(root):
    # Look the PMID up per record: a record without one is skipped instead of
    # silently reusing the PMID left over from the previous iteration.
    pmid_node = article.find("./*/PMID")
    if pmid_node is None or not (pmid_node.text or "").strip():
        continue
    pmid = int(pmid_node.text)

    sections = []
    for section in article.iterfind("./*/Article/Abstract/AbstractText"):
        # itertext() also returns text nested in or following inline child
        # elements; `.text` alone stops at the first child and is None for an
        # empty element (which str() would turn into the literal "None").
        text = "".join(section.itertext())
        for notice in truncation_notices:
            # str.replace returns a new string, so the result has to be kept.
            text = text.replace(notice, "")
        sections.append(text)

    # Merge multi-section abstracts into one single-line string; collapsing
    # whitespace also removes line breaks, which the language detector rejects.
    abst_text = " ".join(" ".join(sections).split())

    pmids.append(pmid)
    absts.append(abst_text)
    # Records without an abstract are kept and flagged as non-English (0).
    abst_eng.append(identify_eng_by_fasttext(abst_text) if abst_text else 0)

  0%|          | 141/30000 [00:00<00:21, 1409.33it/s]

100%|██████████| 30000/30000 [00:02<00:00, 10364.02it/s]


# その他の必要そうな情報もまとめておく
著者名は人数のばらつきが大きそうで、欠損値が多くなりそうなので一旦なし  
other abstも一旦なし  
  
基本的にjournalの情報がメインになっている

In [32]:
import datetime
from tqdm import tqdm

In [34]:
# Collect journal-level metadata for every record under `root`.
# Missing values are stored as "" (text fields) or 0 (year / month).
issns = []
title_as = []
title_js = []
years = []
months = []

for article in tqdm(root):
    issn = ""
    title_j = ""
    year = 0
    month = 0
    title_a = ""

    article_node = article.find("./*/Article")
    if article_node is not None:
        title_node = article_node.find("ArticleTitle")
        if title_node is not None:
            # itertext() keeps text nested in or following inline child
            # elements; `.text` alone would cut the title at the first one.
            title_a = "".join(title_node.itertext())

        journal = article_node.find("Journal")
        if journal is not None:
            issn = (journal.findtext("ISSN") or "").strip()
            title_j = (journal.findtext("Title") or "").strip()

            # NOTE(review): only <Year>/<Month> are read; a PubDate expressed
            # through other child elements leaves year/month at 0 - confirm
            # that this is acceptable for downstream use.
            pub_date = journal.find("JournalIssue/PubDate")
            if pub_date is not None:
                year_text = (pub_date.findtext("Year") or "").strip()
                if year_text:
                    year = int(year_text)

                month_text = (pub_date.findtext("Month") or "").strip()
                if month_text:
                    try:
                        month = int(month_text)
                    except ValueError:
                        # Non-numeric months are parsed as abbreviated month
                        # names ("%b"); anything else still raises so that
                        # unexpected values are not hidden.
                        month = datetime.datetime.strptime(month_text, "%b").month

    issns.append(issn)
    title_as.append(title_a)
    title_js.append(title_j)
    years.append(year)
    months.append(month)

  0%|          | 0/30000 [00:00<?, ?it/s]

100%|██████████| 30000/30000 [00:00<00:00, 33041.06it/s]


# 上の二つをまとめ、すべてのファイルで実行する

In [40]:
import glob

In [42]:
# glob.glob returns matches in a filesystem-dependent order; sort them so the
# files are always processed in the same ascending order.
paths = sorted(glob.glob("/workspace/HDD_ohto/01-DATA/pubmed_22/PubMed/*.xml"))
print(len(paths))

1114


In [None]:
# Notices attached to truncated abstracts; they are not part of the abstract
# itself, so they are stripped before the text is stored.
TRUNCATION_NOTICES = (
    "(ABSTRACT TRUNCATED AT 250 WORDS)",
    "(ABSTRACT TRUNCATED AT 400 WORDS)",
    "(ABSTRACT TRUNCATED)",
)


def _element_text(node):
    """Return all text inside ``node``, including text nested in or following
    child elements; return "" when ``node`` is None."""
    if node is None:
        return ""
    return "".join(node.itertext())


def _parse_month(raw):
    """Convert a <Month> value to an int; return 0 when it is empty."""
    raw = (raw or "").strip()
    if not raw:
        return 0
    try:
        return int(raw)
    except ValueError:
        # Non-numeric months are parsed as abbreviated month names ("%b");
        # anything else still raises so unexpected values are not hidden.
        return datetime.datetime.strptime(raw, "%b").month


def _extract_record(article):
    """Extract one output row from a direct child element of a file's root.

    Returns:
        A tuple ``(pmid, article_title, abstract, journal_title, issn, year,
        month)``, or None when the element carries no PMID. Missing text
        fields are "" and missing year / month are 0.
    """
    pmid_node = article.find("./*/PMID")
    if pmid_node is None or not (pmid_node.text or "").strip():
        return None
    pmid = int(pmid_node.text)

    title_a = ""
    title_j = ""
    issn = ""
    year = 0
    month = 0
    sections = []

    article_node = article.find("./*/Article")
    if article_node is not None:
        title_a = _element_text(article_node.find("ArticleTitle"))

        for section in article_node.iterfind("Abstract/AbstractText"):
            text = _element_text(section)
            for notice in TRUNCATION_NOTICES:
                # str.replace returns a new string, so the result has to be kept.
                text = text.replace(notice, "")
            sections.append(text)

        journal = article_node.find("Journal")
        if journal is not None:
            issn = (journal.findtext("ISSN") or "").strip()
            title_j = (journal.findtext("Title") or "").strip()
            pub_date = journal.find("JournalIssue/PubDate")
            if pub_date is not None:
                year_text = (pub_date.findtext("Year") or "").strip()
                if year_text:
                    year = int(year_text)
                month = _parse_month(pub_date.findtext("Month"))

    # Merge multi-section abstracts into one single-line string. Collapsing
    # whitespace turns line breaks into spaces instead of gluing the words on
    # either side together.
    abst_text = " ".join(" ".join(sections).split())
    return pmid, title_a, abst_text, title_j, issn, year, month


# Column lists for the output table; all eight stay index-aligned.
issns = []
title_as = []
title_js = []
years = []
months = []

pmids = []
absts = []
abst_eng = []


for path in tqdm(paths):
    tree = ET.parse(path)
    root = tree.getroot()

    for article in root:
        record = _extract_record(article)
        if record is None:
            # No PMID: skip the element rather than reuse the previous PMID.
            continue
        pmid, title_a, abst_text, title_j, issn, year, month = record

        pmids.append(pmid)
        absts.append(abst_text)
        # Records without an abstract are kept and flagged as non-English (0).
        abst_eng.append(identify_eng_by_fasttext(abst_text) if abst_text else 0)
        issns.append(issn)
        title_as.append(title_a)
        title_js.append(title_j)
        years.append(year)
        months.append(month)

  1%|          | 8/1114 [04:44<10:26:59, 34.01s/it]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7ce11ad7c130>>
Traceback (most recent call last):
  File "/opt/pip-env/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
100%|██████████| 1114/1114 [21:44:03<00:00, 70.24s/it]   


In [57]:
# Save the extracted records to a TSV file for now.
import csv
import os

output_file = "data/processed/241127_pubbmed_extracted.tsv"
header = ["PMID", "TITLE", "ABST", "ABST_ENG", "JOURNAL", "ISSN", "PUB_YEAR", "PUB_MONTH"]

# Create the target directory on first use so that open() does not fail.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Write the header only when the file is new or empty: re-running this cell
# must not append a second header line after existing data.
if not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
    with open(output_file, mode="a", newline="", encoding="utf-8") as file:
        writer = csv.writer(file, delimiter="\t")
        writer.writerow(header)

In [60]:
# All column lists must be index-aligned; fail early instead of writing
# rows that mix values from different records.
columns = [pmids, title_as, absts, abst_eng, title_js, issns, years, months]
assert len({len(column) for column in columns}) == 1, "column lists differ in length"

# Open the file once and stream every row through a single writer; reopening
# the file for each record costs one open/close per row.
with open(output_file, mode="a", newline="", encoding="utf-8") as file:
    writer = csv.writer(file, delimiter="\t")
    writer.writerows(tqdm(zip(*columns), total=len(pmids)))

100%|██████████| 33405863/33405863 [51:53<00:00, 10727.85it/s] 


pmid 20029614のように一つの論文に対して複数のAbstが存在する場合には、最終版を使用する。