<a href="https://colab.research.google.com/github/angirov/pubmed_crawler/blob/main/save_bydate_saver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!python3 --version
!pip install Bio

In [2]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [32]:
import os

dir = "/content/drive/MyDrive/dsr/pubmed_data/"
os.chdir(dir)
os.getcwd()

'/content/drive/MyDrive/dsr/pubmed_data'

In [33]:
import os
from tqdm import tqdm
import pandas as pd
from Bio import Entrez
Entrez.email = "example@example.com"
from urllib.error import HTTPError
from pathlib import Path

In [37]:
def get_pmids_from_one_day(topic, date):
    handle = Entrez.esearch(db="pubmed", 
                            term= topic + "[MeSH Terms]", 
                            retmax=10000,
                            datetype="pdat", 
                            mindate=date, 
                            maxdate=date, )
    record = Entrez.read(handle)
    ids = record["IdList"]
    return ids


class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'
# print(color.BOLD + 'Hello, World!' + color.END)


def print_err(text):
    err_text = color.BOLD + color.RED + text + color.END
    print(err_text)
    return err_text


def get_pmids_from_period(topic, span):
    handle = Entrez.esearch(db="pubmed", 
                            term= topic + "[MeSH Terms]",
                            retmax=1000,
                            datetype="pdat",
                            mindate=span[0],
                            maxdate=span[1], )
    record = Entrez.read(handle)
    ids = record["IdList"]
    return ids


import time
def get_xml(id):
    attempt = 1
    while attempt <= 5:
        try:
            handle = Entrez.efetch(db="pubmed", 
                            id=id, 
                            retmode="xml", 
                            rettype="abstract")
            xml = handle.read()
            return xml
        except HTTPError as err:
            if 500 <= err.code <= 599:
                print_err(f">>>>> Attempt {attempt}: Received error from server {err}")
                attempt += 1
                time.sleep(15)
            else:
                attempt += 1
                time.sleep(15)
    print_err(f">>>>> Failed to download {id}. Error: {err}")
    return ""

    # pp = pprint.PrettyPrinter(indent=4)
    # pp.pprint(json.dumps(xmltodict.parse(xml)))


def get_xmls_ids(ids):
    pairs = []
    for id in tqdm(ids):
        xml = get_xml(id)
        pairs += [(id, xml)]
    return pairs


def save_pair(csv_writer, id, xml):
    row = id, xml
    csv_writer.writerow(row)
    # print(f"Saved paper {id}.")


def save_xmls_ids(file_name, date, ids):
    with open(file_name,'w+') as out:
        csv_writer=csv.writer(out, dialect='unix')
        for id in tqdm(ids):
            xml = get_xml(id)
            save_pair(csv_writer, id, xml)
    print(f"Saved {len(ids)} papers published on {date}.")
    return file_name


import csv
def save_span(span, list_of_pairs):
    file_name = span[0].replace("/", "_") + "__" + span[1].replace("/", "_") + ".csv"
    with open(file_name,'w+') as out:
        csv_out=csv.writer(out, dialect='unix')
        # csv_out.writerow(['name','num'])
        # for row in list_of_pairs:
        #     csv_out.writerow(row)
        csv_out.writerows(list_of_pairs)
    return file_name


def save_date(date, list_of_pairs):
    file_name = date2filename(date)
    with open(file_name,'w+') as out:
        csv_out=csv.writer(out, dialect='unix')
        # csv_out.writerow(['name','num'])
        # for row in list_of_pairs:
        #     csv_out.writerow(row)
        csv_out.writerows(list_of_pairs)
        # print(f"Saved {len(list_of_pairs)} papers published on {date}.")
    return file_name


def ymd2date(year, month, day):
    return f'{year}/{month:02}/{day:02}'


def date2ymd(date):
    year, month, day = str.split(date, sep="/")
    return year, month, day


def date2filename(dir, date):
    year, month, _ = date2ymd(date)
    filename = year + "/" + month + "/" + date.replace("/", "_") + ".csv"
    print(f"File name: {filename}")
    return filename


In [35]:
year = "2022"
month = '09'


for day in range(1, 32):
    date = ymd2date(year, month, day)

    ids = get_pmids_from_one_day("breast cancer", date)
    print(f"Number of papers published on  {date}': {len(ids)}")
    if len(ids) == 0:
        break

    file_name = Path(date2filename(dir, date))
    path = Path(dir + "/" + year + "/" + month + "/")
    path.mkdir(parents=True, exist_ok=True)

    if file_name.is_file():
        df = pd.read_csv(file_name, header=None, names=['id', 'xml'])
        ids_done = df.id.to_list()
        ids = [id for id in ids if id not in ids_done]
        print(f"Number of papers already saved': {len(ids_done)}. To save: {len(ids)}")

    save_xmls_ids(file_name, date, ids)


Number of papers published on  2022/09/01': 699
File name: 2022/09/2022_09_01.csv


100%|██████████| 699/699 [06:31<00:00,  1.79it/s]


Saved 699 papers published on 2022/09/01.
Number of papers published on  2022/09/02': 47
File name: 2022/09/2022_09_02.csv


100%|██████████| 47/47 [00:39<00:00,  1.19it/s]


Saved 47 papers published on 2022/09/02.
Number of papers published on  2022/09/03': 14
File name: 2022/09/2022_09_03.csv


100%|██████████| 14/14 [00:05<00:00,  2.75it/s]


Saved 14 papers published on 2022/09/03.
Number of papers published on  2022/09/04': 5
File name: 2022/09/2022_09_04.csv


100%|██████████| 5/5 [00:01<00:00,  2.85it/s]


Saved 5 papers published on 2022/09/04.
Number of papers published on  2022/09/05': 34
File name: 2022/09/2022_09_05.csv


100%|██████████| 34/34 [00:12<00:00,  2.72it/s]


Saved 34 papers published on 2022/09/05.
Number of papers published on  2022/09/06': 43
File name: 2022/09/2022_09_06.csv


100%|██████████| 43/43 [00:15<00:00,  2.72it/s]


Saved 43 papers published on 2022/09/06.
Number of papers published on  2022/09/07': 39
File name: 2022/09/2022_09_07.csv


100%|██████████| 39/39 [00:17<00:00,  2.21it/s]


Saved 39 papers published on 2022/09/07.
Number of papers published on  2022/09/08': 33
File name: 2022/09/2022_09_08.csv


100%|██████████| 33/33 [00:12<00:00,  2.74it/s]


Saved 33 papers published on 2022/09/08.
Number of papers published on  2022/09/09': 32
File name: 2022/09/2022_09_09.csv


100%|██████████| 32/32 [00:11<00:00,  2.73it/s]


Saved 32 papers published on 2022/09/09.
Number of papers published on  2022/09/10': 27
File name: 2022/09/2022_09_10.csv


100%|██████████| 27/27 [00:25<00:00,  1.08it/s]


Saved 27 papers published on 2022/09/10.
Number of papers published on  2022/09/11': 7
File name: 2022/09/2022_09_11.csv


100%|██████████| 7/7 [00:02<00:00,  2.85it/s]


Saved 7 papers published on 2022/09/11.
Number of papers published on  2022/09/12': 36
File name: 2022/09/2022_09_12.csv


100%|██████████| 36/36 [00:28<00:00,  1.26it/s]


Saved 36 papers published on 2022/09/12.
Number of papers published on  2022/09/13': 28
File name: 2022/09/2022_09_13.csv


100%|██████████| 28/28 [00:10<00:00,  2.72it/s]


Saved 28 papers published on 2022/09/13.
Number of papers published on  2022/09/14': 28
File name: 2022/09/2022_09_14.csv


100%|██████████| 28/28 [00:10<00:00,  2.75it/s]


Saved 28 papers published on 2022/09/14.
Number of papers published on  2022/09/15': 54
File name: 2022/09/2022_09_15.csv


100%|██████████| 54/54 [00:42<00:00,  1.29it/s]


Saved 54 papers published on 2022/09/15.
Number of papers published on  2022/09/16': 32
File name: 2022/09/2022_09_16.csv


100%|██████████| 32/32 [00:17<00:00,  1.85it/s]


Saved 32 papers published on 2022/09/16.
Number of papers published on  2022/09/17': 21
File name: 2022/09/2022_09_17.csv


100%|██████████| 21/21 [00:07<00:00,  2.80it/s]


Saved 21 papers published on 2022/09/17.
Number of papers published on  2022/09/18': 9
File name: 2022/09/2022_09_18.csv


100%|██████████| 9/9 [00:03<00:00,  2.39it/s]


Saved 9 papers published on 2022/09/18.
Number of papers published on  2022/09/19': 27
File name: 2022/09/2022_09_19.csv


100%|██████████| 27/27 [00:25<00:00,  1.08it/s]


Saved 27 papers published on 2022/09/19.
Number of papers published on  2022/09/20': 49
File name: 2022/09/2022_09_20.csv


100%|██████████| 49/49 [00:45<00:00,  1.08it/s]


Saved 49 papers published on 2022/09/20.
Number of papers published on  2022/09/21': 34
File name: 2022/09/2022_09_21.csv


100%|██████████| 34/34 [00:29<00:00,  1.16it/s]


Saved 34 papers published on 2022/09/21.
Number of papers published on  2022/09/22': 32
File name: 2022/09/2022_09_22.csv


100%|██████████| 32/32 [00:11<00:00,  2.74it/s]


Saved 32 papers published on 2022/09/22.
Number of papers published on  2022/09/23': 25
File name: 2022/09/2022_09_23.csv


100%|██████████| 25/25 [00:09<00:00,  2.73it/s]


Saved 25 papers published on 2022/09/23.
Number of papers published on  2022/09/24': 28
File name: 2022/09/2022_09_24.csv


100%|██████████| 28/28 [00:10<00:00,  2.69it/s]


Saved 28 papers published on 2022/09/24.
Number of papers published on  2022/09/25': 15
File name: 2022/09/2022_09_25.csv


100%|██████████| 15/15 [00:06<00:00,  2.18it/s]


Saved 15 papers published on 2022/09/25.
Number of papers published on  2022/09/26': 38
File name: 2022/09/2022_09_26.csv


100%|██████████| 38/38 [00:27<00:00,  1.38it/s]


Saved 38 papers published on 2022/09/26.
Number of papers published on  2022/09/27': 31
File name: 2022/09/2022_09_27.csv


100%|██████████| 31/31 [00:18<00:00,  1.70it/s]


Saved 31 papers published on 2022/09/27.
Number of papers published on  2022/09/28': 52
File name: 2022/09/2022_09_28.csv


100%|██████████| 52/52 [01:17<00:00,  1.49s/it]


Saved 52 papers published on 2022/09/28.
Number of papers published on  2022/09/29': 46
File name: 2022/09/2022_09_29.csv


100%|██████████| 46/46 [00:32<00:00,  1.41it/s]


Saved 46 papers published on 2022/09/29.
Number of papers published on  2022/09/30': 31
File name: 2022/09/2022_09_30.csv


100%|██████████| 31/31 [00:12<00:00,  2.51it/s]


Saved 31 papers published on 2022/09/30.
Number of papers published on  2022/09/31': 0


In [None]:
year = 2022

for month in range(10, 13):
    for day in range(1, 32):
        date = ymd2date(year, month, day)

        ids = get_pmids_from_one_day("breast cancer", date)
        print(f"Number of papers published on  {date}': {len(ids)}")
        if len(ids) == 0:
            break

        file_name = Path(date2filename(dir, date))
        path = Path(dir + "/" + f'{year}' + "/" + f"{month:02}" + "/")
        path.mkdir(parents=True, exist_ok=True)

        if file_name.is_file():
            df = pd.read_csv(file_name, header=None, names=['id', 'xml'])
            ids_done = df.id.to_list()
            ids = [id for id in ids if id not in ids_done]
            print(f"Number of papers already saved': {len(ids_done)}. To save: {len(ids)}")

        save_xmls_ids(file_name, date, ids)


Number of papers published on  2022/10/01': 756
File name: 2022/10/2022_10_01.csv


100%|██████████| 756/756 [06:23<00:00,  1.97it/s]


Saved 756 papers published on 2022/10/01.
Number of papers published on  2022/10/02': 7
File name: 2022/10/2022_10_02.csv


100%|██████████| 7/7 [00:02<00:00,  2.77it/s]


Saved 7 papers published on 2022/10/02.
Number of papers published on  2022/10/03': 40
File name: 2022/10/2022_10_03.csv


100%|██████████| 40/40 [00:31<00:00,  1.27it/s]


Saved 40 papers published on 2022/10/03.
Number of papers published on  2022/10/04': 44
File name: 2022/10/2022_10_04.csv


100%|██████████| 44/44 [00:17<00:00,  2.52it/s]


Saved 44 papers published on 2022/10/04.
Number of papers published on  2022/10/05': 41
File name: 2022/10/2022_10_05.csv


100%|██████████| 41/41 [00:14<00:00,  2.74it/s]


Saved 41 papers published on 2022/10/05.
Number of papers published on  2022/10/06': 42
File name: 2022/10/2022_10_06.csv


100%|██████████| 42/42 [00:15<00:00,  2.70it/s]


Saved 42 papers published on 2022/10/06.
Number of papers published on  2022/10/07': 32
File name: 2022/10/2022_10_07.csv


100%|██████████| 32/32 [00:13<00:00,  2.34it/s]


Saved 32 papers published on 2022/10/07.
Number of papers published on  2022/10/08': 20
File name: 2022/10/2022_10_08.csv


100%|██████████| 20/20 [00:07<00:00,  2.59it/s]


Saved 20 papers published on 2022/10/08.
Number of papers published on  2022/10/09': 5
File name: 2022/10/2022_10_09.csv


100%|██████████| 5/5 [00:01<00:00,  3.15it/s]


Saved 5 papers published on 2022/10/09.
Number of papers published on  2022/10/10': 35
File name: 2022/10/2022_10_10.csv


100%|██████████| 35/35 [00:27<00:00,  1.25it/s]


Saved 35 papers published on 2022/10/10.
Number of papers published on  2022/10/11': 21
File name: 2022/10/2022_10_11.csv


100%|██████████| 21/21 [00:08<00:00,  2.55it/s]


Saved 21 papers published on 2022/10/11.
Number of papers published on  2022/10/12': 37
File name: 2022/10/2022_10_12.csv


100%|██████████| 37/37 [00:13<00:00,  2.67it/s]


Saved 37 papers published on 2022/10/12.
Number of papers published on  2022/10/13': 42
File name: 2022/10/2022_10_13.csv


100%|██████████| 42/42 [00:18<00:00,  2.30it/s]


Saved 42 papers published on 2022/10/13.
Number of papers published on  2022/10/14': 34
File name: 2022/10/2022_10_14.csv


100%|██████████| 34/34 [00:12<00:00,  2.76it/s]


Saved 34 papers published on 2022/10/14.
Number of papers published on  2022/10/15': 23
File name: 2022/10/2022_10_15.csv


100%|██████████| 23/23 [00:08<00:00,  2.79it/s]


Saved 23 papers published on 2022/10/15.
Number of papers published on  2022/10/16': 5
File name: 2022/10/2022_10_16.csv


100%|██████████| 5/5 [00:01<00:00,  3.03it/s]


Saved 5 papers published on 2022/10/16.
Number of papers published on  2022/10/17': 58
File name: 2022/10/2022_10_17.csv


100%|██████████| 58/58 [00:36<00:00,  1.57it/s]


Saved 58 papers published on 2022/10/17.
Number of papers published on  2022/10/18': 43
File name: 2022/10/2022_10_18.csv


100%|██████████| 43/43 [00:15<00:00,  2.72it/s]


Saved 43 papers published on 2022/10/18.
Number of papers published on  2022/10/19': 31
File name: 2022/10/2022_10_19.csv


100%|██████████| 31/31 [00:11<00:00,  2.73it/s]


Saved 31 papers published on 2022/10/19.
Number of papers published on  2022/10/20': 43
File name: 2022/10/2022_10_20.csv


100%|██████████| 43/43 [00:16<00:00,  2.68it/s]


Saved 43 papers published on 2022/10/20.
Number of papers published on  2022/10/21': 33
File name: 2022/10/2022_10_21.csv


100%|██████████| 33/33 [00:12<00:00,  2.69it/s]


Saved 33 papers published on 2022/10/21.
Number of papers published on  2022/10/22': 16
File name: 2022/10/2022_10_22.csv


100%|██████████| 16/16 [00:06<00:00,  2.55it/s]


Saved 16 papers published on 2022/10/22.
Number of papers published on  2022/10/23': 9
File name: 2022/10/2022_10_23.csv


100%|██████████| 9/9 [00:03<00:00,  2.78it/s]


Saved 9 papers published on 2022/10/23.
Number of papers published on  2022/10/24': 18
File name: 2022/10/2022_10_24.csv


100%|██████████| 18/18 [00:16<00:00,  1.10it/s]


Saved 18 papers published on 2022/10/24.
Number of papers published on  2022/10/25': 37
File name: 2022/10/2022_10_25.csv


100%|██████████| 37/37 [00:13<00:00,  2.72it/s]


Saved 37 papers published on 2022/10/25.
Number of papers published on  2022/10/26': 31
File name: 2022/10/2022_10_26.csv


100%|██████████| 31/31 [00:11<00:00,  2.72it/s]


Saved 31 papers published on 2022/10/26.
Number of papers published on  2022/10/27': 32
File name: 2022/10/2022_10_27.csv


100%|██████████| 32/32 [00:11<00:00,  2.77it/s]


Saved 32 papers published on 2022/10/27.
Number of papers published on  2022/10/28': 35
File name: 2022/10/2022_10_28.csv


100%|██████████| 35/35 [00:12<00:00,  2.72it/s]


Saved 35 papers published on 2022/10/28.
Number of papers published on  2022/10/29': 18
File name: 2022/10/2022_10_29.csv


100%|██████████| 18/18 [00:06<00:00,  2.74it/s]


Saved 18 papers published on 2022/10/29.
Number of papers published on  2022/10/30': 8
File name: 2022/10/2022_10_30.csv


100%|██████████| 8/8 [00:02<00:00,  2.80it/s]


Saved 8 papers published on 2022/10/30.
Number of papers published on  2022/10/31': 22
File name: 2022/10/2022_10_31.csv


100%|██████████| 22/22 [00:08<00:00,  2.74it/s]


Saved 22 papers published on 2022/10/31.
Number of papers published on  2022/11/01': 703
File name: 2022/11/2022_11_01.csv


100%|██████████| 703/703 [08:43<00:00,  1.34it/s]


Saved 703 papers published on 2022/11/01.
Number of papers published on  2022/11/02': 46
File name: 2022/11/2022_11_02.csv


100%|██████████| 46/46 [00:35<00:00,  1.31it/s]


Saved 46 papers published on 2022/11/02.
Number of papers published on  2022/11/03': 40
File name: 2022/11/2022_11_03.csv


100%|██████████| 40/40 [00:33<00:00,  1.20it/s]


Saved 40 papers published on 2022/11/03.
Number of papers published on  2022/11/04': 21
File name: 2022/11/2022_11_04.csv


100%|██████████| 21/21 [00:25<00:00,  1.19s/it]


Saved 21 papers published on 2022/11/04.
Number of papers published on  2022/11/05': 24
File name: 2022/11/2022_11_05.csv


100%|██████████| 24/24 [00:30<00:00,  1.26s/it]


Saved 24 papers published on 2022/11/05.
Number of papers published on  2022/11/06': 9
File name: 2022/11/2022_11_06.csv


100%|██████████| 9/9 [00:05<00:00,  1.80it/s]


Saved 9 papers published on 2022/11/06.
Number of papers published on  2022/11/07': 35
File name: 2022/11/2022_11_07.csv


100%|██████████| 35/35 [00:28<00:00,  1.21it/s]


Saved 35 papers published on 2022/11/07.
Number of papers published on  2022/11/08': 32
File name: 2022/11/2022_11_08.csv


100%|██████████| 32/32 [00:11<00:00,  2.67it/s]


Saved 32 papers published on 2022/11/08.
Number of papers published on  2022/11/09': 36
File name: 2022/11/2022_11_09.csv


100%|██████████| 36/36 [00:31<00:00,  1.15it/s]


Saved 36 papers published on 2022/11/09.
Number of papers published on  2022/11/10': 26
File name: 2022/11/2022_11_10.csv


100%|██████████| 26/26 [00:12<00:00,  2.03it/s]


Saved 26 papers published on 2022/11/10.
Number of papers published on  2022/11/11': 36
File name: 2022/11/2022_11_11.csv


100%|██████████| 36/36 [00:33<00:00,  1.08it/s]


Saved 36 papers published on 2022/11/11.
Number of papers published on  2022/11/12': 21
File name: 2022/11/2022_11_12.csv


100%|██████████| 21/21 [00:09<00:00,  2.23it/s]


Saved 21 papers published on 2022/11/12.
Number of papers published on  2022/11/13': 13
File name: 2022/11/2022_11_13.csv


100%|██████████| 13/13 [00:06<00:00,  2.00it/s]


Saved 13 papers published on 2022/11/13.
Number of papers published on  2022/11/14': 35
File name: 2022/11/2022_11_14.csv


100%|██████████| 35/35 [00:13<00:00,  2.68it/s]


Saved 35 papers published on 2022/11/14.
Number of papers published on  2022/11/15': 49
File name: 2022/11/2022_11_15.csv


100%|██████████| 49/49 [00:20<00:00,  2.45it/s]


Saved 49 papers published on 2022/11/15.
Number of papers published on  2022/11/16': 30
File name: 2022/11/2022_11_16.csv


100%|██████████| 30/30 [00:11<00:00,  2.71it/s]


Saved 30 papers published on 2022/11/16.
Number of papers published on  2022/11/17': 37
File name: 2022/11/2022_11_17.csv


100%|██████████| 37/37 [00:36<00:00,  1.01it/s]


Saved 37 papers published on 2022/11/17.
Number of papers published on  2022/11/18': 27
File name: 2022/11/2022_11_18.csv


100%|██████████| 27/27 [00:29<00:00,  1.09s/it]


Saved 27 papers published on 2022/11/18.
Number of papers published on  2022/11/19': 21
File name: 2022/11/2022_11_19.csv


100%|██████████| 21/21 [00:07<00:00,  2.66it/s]


Saved 21 papers published on 2022/11/19.
Number of papers published on  2022/11/20': 15
File name: 2022/11/2022_11_20.csv


100%|██████████| 15/15 [00:20<00:00,  1.37s/it]


Saved 15 papers published on 2022/11/20.
Number of papers published on  2022/11/21': 36
File name: 2022/11/2022_11_21.csv


100%|██████████| 36/36 [00:28<00:00,  1.26it/s]


Saved 36 papers published on 2022/11/21.
Number of papers published on  2022/11/22': 36
File name: 2022/11/2022_11_22.csv


100%|██████████| 36/36 [00:29<00:00,  1.24it/s]


Saved 36 papers published on 2022/11/22.
Number of papers published on  2022/11/23': 33
File name: 2022/11/2022_11_23.csv


100%|██████████| 33/33 [00:21<00:00,  1.55it/s]


Saved 33 papers published on 2022/11/23.
Number of papers published on  2022/11/24': 25
File name: 2022/11/2022_11_24.csv


100%|██████████| 25/25 [00:25<00:00,  1.00s/it]


Saved 25 papers published on 2022/11/24.
Number of papers published on  2022/11/25': 28
File name: 2022/11/2022_11_25.csv


100%|██████████| 28/28 [00:26<00:00,  1.07it/s]


Saved 28 papers published on 2022/11/25.
Number of papers published on  2022/11/26': 15
File name: 2022/11/2022_11_26.csv


100%|██████████| 15/15 [00:37<00:00,  2.49s/it]


Saved 15 papers published on 2022/11/26.
Number of papers published on  2022/11/27': 8
File name: 2022/11/2022_11_27.csv


100%|██████████| 8/8 [00:18<00:00,  2.32s/it]


Saved 8 papers published on 2022/11/27.
Number of papers published on  2022/11/28': 35
File name: 2022/11/2022_11_28.csv


 40%|████      | 14/35 [00:05<00:07,  2.75it/s]

In [None]:
# from pathlib import Path

# year = "2022"
# month = '04'
# day = '01'

# date = ymd2date(year, month, day)
# file_name = Path(date2filename)
# if file_name.is_file():
#     df = pd.read_csv(file_name, header=None, names=['id', 'xml'])
#     ids_done = df.id.to_list()
#     ids = 
