In [22]:
from concurrent.futures import ThreadPoolExecutor
from Bio.Entrez import efetch, read
from Bio import Entrez
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
import gender_guesser.detector as gender

# pip install biopython
# pip install gender-guesser

d = gender.Detector()
def get_gender(name):
    first_name = name.split()[0]
    gender = d.get_gender(first_name)
    return gender

# enter ur email and pubmed api key
Entrez.email = ...
Entrez.api_key = ...

pmids = pd.read_csv('pmid.txt')['pmid']
    
# 24459728

# pmids = [17139226,
# 23753526,
# 21782045,
# 21626934,
# 23954563,
# 14550805,
# 15187116,
# 12808441,
# 16210359,
# 14581154,
# 22253368,
# 18480896,
# 25291577,
# 21987784,
# 23441200,
# 15207317]

df = pd.DataFrame([], columns=['PMID', 'Title', 'Authors'])

def fetch_abstract(pmid):
    try:
        handle = efetch(db='pubmed', id=pmid, retmode='xml')
        xml_data = read(handle)

        auth = [i['ForeName'] + ' '+ i['LastName'] for i in xml_data['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList'] if 'ForeName' in i and 'LastName' in i]
        title = xml_data['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']
        
        return pmid, title, auth
    except Exception as e:
        print(f"pmid-{pmid}",e)
        return None

def process_pmids(pmids):
    results = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(fetch_abstract, pmid) for pmid in pmids]
        for future in tqdm(futures, total=len(futures), desc="Fetching Abstracts"):
            result = future.result()
            if result:
                results.append(result)
    return results

results = process_pmids(pmids)

for result in results:
    df.loc[len(df)] = result

Fetching Abstracts:   0%|          | 0/32671 [00:00<?, ?it/s]

pmid-17139226 HTTP Error 429: Too Many Requests
pmid-23753526 HTTP Error 429: Too Many Requests
pmid-21782045 list index out of range
pmid-21626934 HTTP Error 429: Too Many Requests
pmid-23954563 HTTP Error 429: Too Many Requests
pmid-14550805 Failed to find tag 'pubmed' in the DTD. To skip all tags that are not represented in the DTD, please call Bio.Entrez.read or Bio.Entrez.parse with validate=False.
pmid-15187116 Failed to find tag 'pubmed' in the DTD. To skip all tags that are not represented in the DTD, please call Bio.Entrez.read or Bio.Entrez.parse with validate=False.
pmid-12808441 Failed to find tag 'pubmed' in the DTD. To skip all tags that are not represented in the DTD, please call Bio.Entrez.read or Bio.Entrez.parse with validate=False.
pmid-16210359 HTTP Error 429: Too Many Requests
pmid-14581154 HTTP Error 429: Too Many Requests
pmid-22253368 HTTP Error 429: Too Many Requests
pmid-18480896 HTTP Error 429: Too Many Requests
pmid-25291577 HTTP Error 429: Too Many Requests

In [40]:
df['Gender'] = df['Authors'].progress_apply(lambda author_names: [get_gender(name) for name in author_names])

  0%|          | 0/32670 [00:00<?, ?it/s]

In [47]:
df['male'] = df['Gender'].progress_apply(lambda x: x.count('male'))
df['female'] = df['Gender'].progress_apply(lambda x: x.count('female'))
df['unknown'] = df['Gender'].progress_apply(lambda x: x.count('unknown'))

  0%|          | 0/32670 [00:00<?, ?it/s]

  0%|          | 0/32670 [00:00<?, ?it/s]

  0%|          | 0/32670 [00:00<?, ?it/s]

In [50]:
df

Unnamed: 0,PMID,Title,Authors,Gender,male,female,unknown
0,19456161,"Macrocyclic design strategies for small, stabl...","[Felix Freire, Samuel H Gellman]","[male, male]",2,0,0
1,19580264,An alpha/beta-peptide helix bundle with a pure...,"[Michael W Giuliano, W Seth Horne, Samuel H Ge...","[male, unknown, male]",2,0,1
2,19644993,In situ monitoring of backbone thioester excha...,"[William C Pomerantz, Erik B Hadley, Charles G...","[male, male, male, male]",4,0,0
3,19967682,Detection and analysis of chimeric tertiary st...,"[Joshua L Price, Erik B Hadley, Jay D Steinkru...","[male, male, male, male]",4,0,0
4,20465308,Side-chain pairing preferences in the parallel...,"[Jay D Steinkruger, Derek N Woolfson, Samuel H...","[male, male, male]",3,0,0
...,...,...,...,...,...,...,...
32665,18480896,Determination of glucose using a coupled-enzym...,"[Hisham S M Abd-Rabboh, Mark E Meyerhoff]","[male, male]",2,0,0
32666,25291577,Serological responses to an avian influenza A/...,"[Mark J Mulligan, David I Bernstein, Patricia ...","[male, male, female, male, male, female, femal...",6,7,1
32667,21987784,Hydroxyurea induces de novo copy number varian...,"[Martin F Arlt, Alev Cagla Ozdemir, Shanda R B...","[male, female, female, male, male]",3,2,0
32668,23441200,Diaphanous homolog 3 (Diap3) overexpression ca...,"[Cynthia J Schoen, Margit Burmeister, Marci M ...","[female, female, female]",0,3,0


In [51]:
df.to_csv('pdmid.csv',index=False)

In [52]:
df

Unnamed: 0,PMID,Title,Authors,Gender,male,female,unknown
0,19456161,"Macrocyclic design strategies for small, stabl...","[Felix Freire, Samuel H Gellman]","[male, male]",2,0,0
1,19580264,An alpha/beta-peptide helix bundle with a pure...,"[Michael W Giuliano, W Seth Horne, Samuel H Ge...","[male, unknown, male]",2,0,1
2,19644993,In situ monitoring of backbone thioester excha...,"[William C Pomerantz, Erik B Hadley, Charles G...","[male, male, male, male]",4,0,0
3,19967682,Detection and analysis of chimeric tertiary st...,"[Joshua L Price, Erik B Hadley, Jay D Steinkru...","[male, male, male, male]",4,0,0
4,20465308,Side-chain pairing preferences in the parallel...,"[Jay D Steinkruger, Derek N Woolfson, Samuel H...","[male, male, male]",3,0,0
...,...,...,...,...,...,...,...
32665,18480896,Determination of glucose using a coupled-enzym...,"[Hisham S M Abd-Rabboh, Mark E Meyerhoff]","[male, male]",2,0,0
32666,25291577,Serological responses to an avian influenza A/...,"[Mark J Mulligan, David I Bernstein, Patricia ...","[male, male, female, male, male, female, femal...",6,7,1
32667,21987784,Hydroxyurea induces de novo copy number varian...,"[Martin F Arlt, Alev Cagla Ozdemir, Shanda R B...","[male, female, female, male, male]",3,2,0
32668,23441200,Diaphanous homolog 3 (Diap3) overexpression ca...,"[Cynthia J Schoen, Margit Burmeister, Marci M ...","[female, female, female]",0,3,0
