In [1]:
from Bio import Entrez
import pandas as pd
from tqdm.notebook import tqdm

Entrez.email = "singhmohit.1889@gmail.com"
Entrez.api_key = "1c22910bfa9f49693b451d3dfec15c1dbb09"

def extract_pubmed_id(reference):
    try:
        for article_id in reference['ArticleIdList']:
            if article_id.attributes['IdType'] == 'pubmed':
                return str(article_id)
    except:
        return float('NaN')

def fetch_paper_details(pmid):
    handle = Entrez.efetch(db="pubmed", id=pmid, rettype="xml", retmode="text")
    records = Entrez.read(handle)
    handle.close()
    
    paper_title = pmid
#     paper_title = records['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']
    authors = records['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList']
    paper_authors = [f"{author['ForeName']} {author['LastName']}" for author in authors]
    journal = records['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['Title']
    year = records['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year']   
    return paper_title, paper_authors, journal, year

def fetch_citations(pmid):
    handle = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pubmed_citedin")
    records = Entrez.read(handle)
    handle.close()
    
    citing_paper_titles = []
    citing_paper_references = []
    details = []
    if records[0]["LinkSetDb"]:
        for link in records[0]["LinkSetDb"][0]["Link"]:
            citing_pmid = link["Id"]
            handle = Entrez.efetch(db="pubmed", id=citing_pmid, rettype="xml", retmode="text")
            records = Entrez.read(handle)
            handle.close()
            
            paper_title = citing_pmid
#             paper_title = records['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']
            references = records['PubmedArticle'][0]['PubmedData']['ReferenceList'][0]['Reference']
            reference_list = [extract_pubmed_id(reference) for reference in references]
            details.append((paper_title,reference_list))
    
    return details

In [2]:
df = pd.read_csv("savedrecs imp csv.csv",encoding='latin-1')
PubmedId = df[~df['Pubmed Id'].isna()]['Pubmed Id'].values

In [3]:
len(PubmedId)

97

In [4]:
PubmedId = PubmedId.astype(int)
PubmedId = PubmedId.astype(str)

In [5]:
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Function to fetch details and citations
def fetch_details_and_citations(pmid):
    try:
        paper_title, paper_authors, journal, year = fetch_paper_details(pmid)
        citing_papers = fetch_citations(pmid)
        return (pmid, paper_title, paper_authors, journal, year, citing_papers)
    except Exception as e:
        return (pmid, None, None, None, None, None)

# Initialize DataFrame
df = pd.DataFrame([], columns=['Paper_Title', 'Paper_Authors', 'Journal', 'Year', 'Citing_Paper_Title', 'Citing_Paper_References'])

# List of PubMed IDs
# PubmedId = PubmedId

# Use ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor(max_workers=8) as executor:
    futures = {executor.submit(fetch_details_and_citations, pmid): pmid for pmid in PubmedId}
    
    for future in tqdm(as_completed(futures), total=len(PubmedId)):
        pmid, paper_title, paper_authors, journal, year, citing_papers = future.result()
        if paper_title and paper_authors and journal and year and citing_papers:
            for title, ref in citing_papers:
                df.loc[len(df)] = [paper_title, paper_authors, journal, year, title, ref]
        else:
            print(f"Failed to fetch details for PMID: {pmid}")

# Save the DataFrame to a CSV file or any other desired format
df.to_csv('outputPMID.csv', index=False)


  3%|██▍                                                                             | 3/97 [00:07<03:10,  2.03s/it]

Failed to fetch details for PMID: 34910933


 24%|██████████████████▋                                                            | 23/97 [05:16<15:20, 12.44s/it]

Failed to fetch details for PMID: 32783919


 38%|██████████████████████████████▏                                                | 37/97 [15:05<30:51, 30.87s/it]

Failed to fetch details for PMID: 20457754


 55%|███████████████████████████████████████████▏                                   | 53/97 [25:20<15:24, 21.01s/it]

Failed to fetch details for PMID: 25233993


 70%|███████████████████████████████████████████████████████▍                       | 68/97 [27:08<02:24,  4.97s/it]

Failed to fetch details for PMID: 7546706


 75%|███████████████████████████████████████████████████████████▍                   | 73/97 [27:24<01:42,  4.28s/it]

Failed to fetch details for PMID: 32998157


100%|███████████████████████████████████████████████████████████████████████████████| 97/97 [42:00<00:00, 25.99s/it]


In [11]:
df

Unnamed: 0,Paper_Title,Paper_Authors,Journal,Year,Citing_Paper_Title,Citing_Paper_References
0,37020354,"[Jennifer Cable, Barney S Graham, Richard A Ko...",Annals of the New York Academy of Sciences,2023,38125026,"[31869338, 26229114, 27153120, 25121750, 21987..."
1,37020354,"[Jennifer Cable, Barney S Graham, Richard A Ko...",Annals of the New York Academy of Sciences,2023,37084166,"[nan, None, 35454874, nan, 35089462, None, 366..."
2,37371829,"[Owen Daly, Azita Josefine Mahiny, Sara Majesk...",Biomedicines,2023,38400169,"[35821637, 37705354, 36311701, 37595605, 37454..."
3,37371829,"[Owen Daly, Azita Josefine Mahiny, Sara Majesk...",Biomedicines,2023,38254150,"[33049211, 33942880, 27498188, 28762175, 34706..."
4,35536311,"[Ronit Nir, Thomas Philipp Hoernes, Hiromi Mur...",Nucleic acids research,2022,38406265,"[8559254, 8674114, 8797828, 9106664, 37492704,..."
...,...,...,...,...,...,...
7200,16111635,"[Katalin Karikó, Michael Buckstein, Houping Ni...",Immunity,2005,17063184,"[7511686, 10200542, 11094420, 12511877, 107196..."
7201,16111635,"[Katalin Karikó, Michael Buckstein, Houping Ni...",Immunity,2005,16638933,"[nan, nan, 9561647, 10227322, 10763707, 115745..."
7202,16111635,"[Katalin Karikó, Michael Buckstein, Houping Ni...",Immunity,2005,16481219,"[75545, 75546, 3855537, 1849259, 10806995, 114..."
7203,16111635,"[Katalin Karikó, Michael Buckstein, Houping Ni...",Immunity,2005,16446382,"[11861602, 15229469, 15207506, 11861616, 15879..."


In [None]:
ids = ['34910933',
'7546706',
'32998157',
'26264835',
'25233993',
'18797453',
'20457754',
'22334017',
'29739835',
'33414215',
'32783919']

In [None]:
# Initialize DataFrame
df = pd.DataFrame([], columns=['Paper_Title', 'Paper_Authors', 'Journal', 'Year', 'Citing_Paper_Title', 'Citing_Paper_References'])

# List of PubMed IDs
PubmedId = ids

# Use ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {executor.submit(fetch_details_and_citations, pmid): pmid for pmid in PubmedId}
    
    for future in tqdm(as_completed(futures), total=len(PubmedId)):
        pmid, paper_title, paper_authors, journal, year, citing_papers = future.result()
        if paper_title and paper_authors and journal and year and citing_papers:
            for title, ref in citing_papers:
                df.loc[len(df)] = [paper_title, paper_authors, journal, year, title, ref]
        else:
            print(f"Failed to fetch details for PMID: {pmid}")

# Save the DataFrame to a CSV file or any other desired format
df.to_csv('output1.csv', index=False)

In [None]:
# eval(a[:1]['Citing_Paper_References'][0])[0]

In [None]:
import pandas as pd

In [None]:
pd.read_csv('output1.csv')