In [1]:
import argparse
from Bio import Entrez

import os 
import pandas as pd
from datetime import date
from datetime import datetime
import numpy as np
import re
from metapub import FindIt
from metapub.convert import pmid2doi
from metapub.convert import doi2pmid
from metapub import PubMedFetcher

import ssl
# WARNING: this replaces the ssl module's default HTTPS context factory with the
# unverified one, so HTTPS requests that rely on that default in this kernel are made
# without certificate verification.
# NOTE(review): presumably a workaround for local certificate errors when contacting
# NCBI -- confirm, and prefer fixing the CA bundle over disabling verification.
ssl._create_default_https_context = ssl._create_unverified_context



## Query the NCBI database

In [2]:
# use one of the following search queries/filters
filters = "HiChIP[All Fields] AND \"gse\"[Filter]" # the whole notebook will take around 7 min to run
# filters = "HiChIP[All Fields] AND (\"gse\"[Filter] AND (\"Homo sapiens\"[Organism] OR \"Mus musculus\"[Organism]))"
# filters = "HiChIP[All Fields] AND (\"Homo sapiens\"[Organism] AND \"gse\"[Filter])"
# filters = "HiChIP[All Fields] AND (\"Homo sapiens\"[Organism] AND \"published last year\"[Filter]) AND \"gse\"[Filter]" # 2 min

# contact e-mail attached to the Entrez requests made below
Entrez.email = "zjiang@lji.org"

# query the NCBI database; retmax caps the number of returned IDs at 10000
search_result = Entrez.esearch(db="gds", retmax=10000, term=filters)
try:
    result = Entrez.read(search_result)
finally:
    # release the HTTP connection even if the response cannot be parsed
    search_result.close()

# creating a regex to extract pubmed IDs (first run of ASCII digits)
PubMedIds_pattern = re.compile('[0123456789]+')

In [3]:
# parse the results into a dataframe: one esummary request per GEO DataSets ID
dictionary_lst = []
for Id in result["IdList"]:
    handle = Entrez.esummary(db="gds", id=Id, retmode="xml") # get summary of this entry on GEO datasets
    try:
        # consume the parser completely before the handle is closed
        dictionary_lst.extend(Entrez.parse(handle))
    finally:
        handle.close()
df = pd.DataFrame(dictionary_lst)

In [4]:
# preview the first rows of the unprocessed esummary records
df.head()

Unnamed: 0,Item,Id,Accession,GDS,title,summary,GPL,GSE,taxon,entryType,...,ExtRelations,n_samples,SeriesTitle,PlatformTitle,PlatformTaxa,SamplesTaxa,PubMedIds,Projects,FTPLink,GEO2R
0,[],200180198,GSE180198,,EWSR1-ATF1 dependent 3D connectivity regulates...,This SuperSeries is composed of the SubSeries ...,20301,180198,Homo sapiens,GSE,...,[],147,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE180nn...,no
1,[],200180194,GSE180194,,EWSR1-ATF1 dependent 3D connectivity regulates...,Chimeric proteins resulting from chromosomal t...,20301,180194,Homo sapiens,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP3...",4,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE180nn...,no
2,[],200173872,GSE173872,,EBF1 nuclear repositioning instructs chromatin...,This SuperSeries is composed of the SubSeries ...,24676;18573,173872,Homo sapiens,GSE,...,[],233,,,,,"[IntegerElement(35182476, attributes={})]",[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE173nn...,yes
3,[],200173871,GSE173871,,EBF1 nuclear repositioning instructs chromatin...,Purpose: To investigate the mechanisms of 3D g...,18573,173871,Homo sapiens,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP3...",4,,,,,"[IntegerElement(35182476, attributes={})]",[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE173nn...,yes
4,[],200173843,GSE173843,,EBF1 nuclear repositioning instructs chromatin...,Purpose: To investigate the mechanisms of 3D g...,18573,173843,Homo sapiens,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP3...",2,,,,,"[IntegerElement(35182476, attributes={})]",[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE173nn...,yes


## Cleaning the Paper Names and Converting from PMID to DOI

In [5]:
# A trailing tag of one or two words wrapped in [] or (), e.g. ".[HiChIP]", ". [Hi ChIP]",
# "(HiChIP)" or "[Bead Array]", together with the periods/whitespace right before it.
_TRAILING_TAG = re.compile(
    r'[\s.]*(?:\[[^\[\]\s]+(?:\s+[^\[\]\s]+)?\]|\([^()\s]+(?:\s+[^()\s]+)?\))\s*$')


def clean_paper_title(paper_title):
    """Return the title without its trailing bracketed tag and final period.

    Parameters
    ----------
    paper_title : str
        Title as reported by the GEO esummary record.

    Returns
    -------
    str
        The title with no-break spaces turned into plain spaces, a trailing
        one- or two-word ``[...]`` / ``(...)`` tag removed, surrounding
        whitespace stripped and at most one final period removed. Periods
        inside the title are kept.
    """
    title = paper_title.replace("\xa0", " ")  # remove no-break space
    title = _TRAILING_TAG.sub("", title).strip()
    if title.endswith('.'):  # remove the last period in paper title
        title = title[:-1]
    return title


dictionary_lst = []
for Id in result["IdList"]:

    handle = Entrez.esummary(db="gds", id=Id, retmode="xml")
    try:
        # read every record before the handle is closed
        records = list(Entrez.parse(handle))
    finally:
        handle.close()

    for column in records:
        column['title'] = clean_paper_title(column['title'])

        # convert PMID to DOI, some have no PMID so empty string;
        # the first run of digits in the stringified field is used as the PMID
        PubMedIds = PubMedIds_pattern.findall(str(column['PubMedIds']))
        if PubMedIds:
            # fall back to "" when the lookup yields nothing
            column['PubMedIds'] = pmid2doi(PubMedIds[0]) or ""
        else:
            column['PubMedIds'] = ""
        dictionary_lst.append(column)

df = pd.DataFrame(dictionary_lst)

## Reformat the Columns for Google Sheet Compatibility

In [6]:
# GEO summary fields that have no counterpart in the Google Sheet
drop_cols = [
    'Item', 'Id', 'GDS', 'GPL', 'GSE', 'entryType', 'ptechType',
    'valType', 'SSInfo', 'subsetInfo', 'suppFile', 'Relations', 'ExtRelations',
    'n_samples', 'SeriesTitle', 'PlatformTitle', 'PlatformTaxa', 'SamplesTaxa',
    'Projects', 'FTPLink', 'GEO2R',
]

# GEO field name -> Google Sheet column header
rename_cols = {
    "Accession": "GEO / Data link",
    "PubMedIds": "DOI",
    "title": "Paper Title",
    "taxon": "Organism",
    "gdsType": "Any other information",
    "PDAT": "Year",
    "Samples": "Other matched data",
}

# sheet columns that start out blank
empty_cols = ["Journal", "Authors", "Tissue/Cell Line", "Presenter", "Potential HiChIP"]

# Rebuild the frame from the cleaned records, then: drop, rename, keep only the
# first four characters of the date, add a running index, blank out missing
# values and finally append the empty sheet columns.
df = (
    pd.DataFrame(dictionary_lst)
    .drop(columns=drop_cols)
    .rename(columns=rename_cols)
    .assign(
        Year=lambda frame: frame["Year"].str[:4],
        index=lambda frame: np.arange(len(frame)),
    )
    .fillna("")
    .assign(**{name: "" for name in empty_cols})
)

## Assigning Potential HiChIP Samples

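For each series, keep only the samples whose title mentions HiChIP and mark the series "Yes"; if no sample title mentions HiChIP, keep all of its samples and mark the series "Maybe".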

In [7]:
def _mentions_hichip(sample_title):
    """Return True when the sample title contains "HiChIP" or "Hi-ChIP" (case-insensitive)."""
    lowered = sample_title.casefold()
    return "hichip" in lowered or "hi-chip" in lowered


sample_summaries = []
hichip_labels = []
for samples in df["Other matched data"]:
    # samples whose title names the assay explicitly
    hichip_samples = [sample for sample in samples if _mentions_hichip(sample["Title"])]

    if hichip_samples:
        # keep samples with HiChIP and mark "Yes"
        kept, label = hichip_samples, "Yes"
    else:
        # otherwise, keep all GSM samples and mark "Maybe"
        kept, label = samples, "Maybe"

    sample_summaries.append(
        "\n".join(sample["Accession"] + ": " + sample["Title"] for sample in kept))
    hichip_labels.append(label)

# Both lists are in row order, so they are assigned positionally and do not
# depend on the index labels of df.
df["Other matched data"] = sample_summaries
df["Potential HiChIP"] = hichip_labels

## Merge Rows with the Same Paper Title

In [8]:
def _join_lines(values):
    """Concatenate the values of one group, one per line."""
    return '\n'.join(values)


# How each column is collapsed when several GEO series share a paper title.
aggregation_functions = {
    'Paper Title': 'first',
    'DOI': 'max',
    'Journal': 'first',
    'Authors': 'first',
    'Year': 'first',
    'GEO / Data link': _join_lines,
    'Any other information': _join_lines,
    'Organism': 'first',
    'Tissue/Cell Line': 'first',
    'Potential HiChIP': 'first',
    'Other matched data': _join_lines,
    'Presenter': 'first',
}
title_groups = df.groupby(df['Paper Title'])
df_grouped = title_groups.agg(aggregation_functions)

## Fetch Journal Name and First Author of Each Paper and Add https to DOI

In [9]:
journal_lst = []
authors_lst = []
fetch = PubMedFetcher()
for DOI in df_grouped['DOI']:
    journal, first_author = "", ""
    if DOI:  # an empty DOI means no linked publication, so there is nothing to look up
        try:
            PMID = doi2pmid(DOI)
            article = fetch.article_by_pmid(PMID)
            journal = article.journal or ""
            if article.authors:
                # first whitespace-separated token of the first author entry
                first_author = article.authors[0].split()[0] + " et al."
        except Exception as err:
            # best effort: leave both fields blank for this paper and carry on
            print("Could not fetch article details for {}: {}".format(DOI, err))
            journal, first_author = "", ""
    # exactly one entry per row is appended to each list, so both stay aligned with df_grouped
    journal_lst.append(journal)
    authors_lst.append(first_author)
df_grouped['Journal'] = journal_lst
df_grouped['Authors'] = authors_lst

In [10]:
# add https address to DOI (for easy access)
doi_prefix = 'https://doi.org/'
# skip empty DOIs and DOIs that already carry the prefix, so re-running this cell is harmless
needs_prefix = (df_grouped['DOI'] != '') & ~df_grouped['DOI'].str.startswith(doi_prefix, na=False)
df_grouped.loc[needs_prefix, 'DOI'] = doi_prefix + df_grouped.loc[needs_prefix, 'DOI']

## Note the Date Added, Set Output Path, and Save the Final Output File

In [11]:
# directory that receives the exported spreadsheet; created only when missing
outdir = 'results/hichip_db/'
if not os.path.isdir(outdir):
    os.makedirs(outdir, exist_ok=True)

In [12]:
# take a single timestamp so the date and time parts always belong to the same instant
now = datetime.now()
today = now.date()

# current year, month and day
date_str = today.strftime("%Y_%m_%d")

# current hour and minute
time_str = now.strftime("%H_%M")

# setting the output filename (extension is added when the file is written)
output = os.path.join(outdir, "GEO_Query.{}_{}".format(date_str, time_str))

In [13]:
# final column order of the exported sheet
# NOTE(review): 'Potential HiChIP' is not listed here, so that column is dropped
# from the output -- confirm this is intended.
reorder = [
    'Paper Title', 'Journal', 'Authors', 'Year', 'DOI', 'GEO / Data link',
    'Any other information', 'Organism', 'Tissue/Cell Line', 'Other matched data',
    'Presenter', 'Date Added',
]

# stamp every row with the run date (dashes instead of underscores), then reorder
date_added = date_str.replace('_', '-')
df_grouped = df_grouped.assign(**{'Date Added': date_added})[reorder]

In [14]:
# write the merged table to an .xlsx file, leaving out the index column
excel_path = "{}.xlsx".format(output)
df_grouped.to_excel(excel_path, index=False)

In [15]:
# display the merged, reordered table
df_grouped

Unnamed: 0_level_0,Paper Title,Journal,Authors,Year,DOI,GEO / Data link,Any other information,Organism,Tissue/Cell Line,Other matched data,Presenter,Date Added
Paper Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3D genome landscapes of primary effusion lymphoma cell lines link super-enhancers to dependency factors,3D genome landscapes of primary effusion lymph...,Nat Commun,Wang et al.,2020,https://doi.org/10.1038/s41467-020-20136-w,GSE136090,Genome binding/occupancy profiling by high thr...,Homo sapiens,,GSM4040942: BCBL1_HiChIP (2 replicates)\nGSM40...,,2022-03-09
A Foxp3 mutation drives Th2 effector function in regulatory T cells,A Foxp3 mutation drives Th2 effector function ...,,,2018,,GSE112176,Expression profiling by high throughput sequen...,Mus musculus,,GSM3059350: Treg_WT HiChIP H3K27ac biological ...,,2022-03-09
A primed immune transcriptional program is activated in oligodendroglia in multiple sclerosis,A primed immune transcriptional program is act...,Neuron,Meijer et al.,2022,https://doi.org/10.1016/j.neuron.2021.12.034,GSE166177\nGSE166179,Other\nGenome binding/occupancy profiling by h...,Mus musculus,,GSM5065199: H3K27ac_HiChIP_IFN_mOPC_rep1\nGSM5...,,2022-03-09
A small set of accessible enhancers enables breast cancer cell response to physiological progestin concentrations,A small set of accessible enhancers enables br...,Nucleic Acids Res,Zaurin et al.,2021,https://doi.org/10.1093/nar/gkab1125,GSE179666,Expression profiling by high throughput sequen...,Homo sapiens,,GSM5425946: aHiChIP PR T0\nGSM5425945: aHiChIP...,,2022-03-09
A topological atlas reveals layers of genome reorganization in colorectal cancer,A topological atlas reveals layers of genome r...,Cell,Johnstone et al.,2020,https://doi.org/10.1016/j.cell.2020.07.030,GSE133928,Genome binding/occupancy profiling by high thr...,Homo sapiens,,GSM3930281: MGH1904_HiChIP\nGSM3930275: BRD316...,,2022-03-09
...,...,...,...,...,...,...,...,...,...,...,...,...
Widespread Long-range Cis-Regulatory Elements in the Maize Genome,Widespread Long-range Cis-Regulatory Elements ...,Nat Plants,Ricci et al.,2019,https://doi.org/10.1038/s41477-019-0547-0,GSE120304,Expression profiling by high throughput sequen...,Zea mays,,GSM4284450: HiChIP B73 leaf H3K4me3\nGSM428445...,,2022-03-09
YY1 is a structural regulator of enhancer-promoter loops,YY1 is a structural regulator of enhancer-prom...,Cell,Weintraub et al.,2017,https://doi.org/10.1016/j.cell.2017.11.008,GSE99521\nGSE99519,Expression profiling by high throughput sequen...,Homo sapiens; Mus musculus,,GSM2774003: HiChIP_mES_C3_UT_H3K27ac_rep1\nGSM...,,2022-03-09
YY1 regulates DNA-DNA interaction,YY1 regulates DNA-DNA interaction,,,2020,,GSE128106,Genome binding/occupancy profiling by high thr...,Homo sapiens,,GSM3664990: HiChIP YY1 rep1\nGSM3664993: HiChI...,,2022-03-09
cLoops2: a full-stack comprehensive analytical tool for chromatin interactions,cLoops2: a full-stack comprehensive analytical...,Nucleic Acids Res,Cao et al.,2021,https://doi.org/10.1093/nar/gkab1233,GSE179010,Other,Homo sapiens,,GSM5403564: ChIC_GM12878_IgG_Rep1\nGSM5403558:...,,2022-03-09
