In [1]:
import os 
import argparse
import GEOparse
import pandas as pd
import numpy as np
import pysradb
import subprocess as sp

import ssl
# WARNING: this swaps the default HTTPS context factory for the unverified one,
# so HTTPS requests that rely on the default context skip certificate
# verification for the rest of the session.
# NOTE(review): presumably a workaround for certificate errors during the
# downloads below -- confirm it is still required before reusing this notebook.
ssl._create_default_https_context = ssl._create_unverified_context

# GEO series accession that every later cell queries and uses in file names
gse_id = 'GSE101498'

  from tqdm.autonotebook import tqdm


## Setting the output path

In [2]:
# directory that receives the files produced by this notebook (created if missing)
outdir = 'results/hichip_db/gse/'
os.makedirs(outdir, exist_ok=True)

# path of the query TSV for the current GSE ID
output = os.path.join(outdir, f"GSE_Query.{gse_id}.tsv")

In [3]:
# report the destination path
# NOTE(review): no cell visible here actually writes to `output` -- confirm
print(f'Writing the output to: "{output}".')

Writing the output to: "results/hichip_db/gse/GSE_Query.GSE101498.tsv".


## Query GEO for GSE metadata

In [4]:
# download (into outdir) and parse the GEO record for the current GSE ID
geo_query = GEOparse.get_GEO(
    geo=gse_id,
    destdir=outdir,
    include_data=True,
    silent=True,
)

In [5]:
# Build one row per (GSM sample, SRA link) pair from the GEO series metadata.
gsm_columns = ['geo_id', 'gsm_id', 'title', 'organism', 'source', 'description', 'srx_link']
gsm_records = []
for gsm_id, gsm in geo_query.gsms.items():

    # each metadata field is a list of strings; collapse it into a single string
    title = '; '.join(gsm.metadata['title'])
    organism = ', '.join(gsm.metadata['organism_ch1'])
    source = ', '.join(gsm.metadata['source_name_ch1'])
    # a sample without a description yields an empty string instead of a KeyError
    description = '; '.join(gsm.metadata.get('description', []))

    # a sample without an SRA relation contributes no rows instead of raising KeyError
    for sra_link in gsm.relations.get('SRA', []):
        gsm_records.append([gse_id, gsm_id, title, organism, source, description, sra_link])

# passing the column names to the constructor keeps the schema even with zero rows
# (assigning .columns afterwards raises a length-mismatch error on an empty frame)
gsm_data = pd.DataFrame(gsm_records, columns=gsm_columns)

# extract the SRX accession from each link; expand=False always returns a Series,
# whereas extract(...).squeeze() collapses to a scalar when there is a single row
sra_ids = gsm_data['srx_link'].str.extract(r'(SRX[0-9]+)', expand=False)
gsm_data['srx_id'] = sra_ids

In [6]:
# client used to query the SRA
sra_querytool = pysradb.sraweb.SRAweb()

# fetch SRA metadata for every SRX accession listed in gsm_data
sra_query = sra_querytool.sra_metadata(
    gsm_data['srx_id'].tolist(),
    expand_sample_attributes=True,
)

In [7]:
# join the GEO sample rows to the SRA records on the SRX accession;
# columns present on both sides get the '_geo' / '_sra' suffixes
meta = gsm_data.merge(
    sra_query,
    left_on='srx_id',
    right_on='experiment_accession',
    suffixes=('_geo', '_sra'),
)

# number of reads = total spots x 2
# NOTE(review): the factor 2 assumes every run is paired-end -- confirm
meta['num_reads'] = meta['total_spots'].astype(int) * 2

## Save the merged dataframe with all fields

In [8]:
# write the full merged table (every column) to an Excel file in outdir
meta_fn = os.path.join(outdir, f'{gse_id}.meta.all_columns.xlsx')
meta.to_excel(meta_fn, index=False)

## Save the merged dataframe with most important columns

In [9]:
# Columns of the merged table that are left out of the final output because they
# are not needed, empty, or redundant (noted per entry where known). They are not
# removed with an explicit drop call; the final table is built by selecting only
# the columns of interest. This list exists purely for bookkeeping, so each
# column is listed exactly once.
drop_cols = ['sample_title', # empty
             'sample_organism', # redundant with organism
             'organism_taxid',
             'library_name',
             'instrument',
             'instrument_model',
             'instrument_model_desc',
             'srx_link',
             'srx_id',
             'sample_accession',
             'study_accession',
             'study_title',
             'experiment_accession',
             'experiment_title',
             'experiment_desc',
             'library_strategy',
             'library_source',
             'library_selection',
             'total_size',
             'run_total_spots',
             'run_total_bases',
             'total_spots',
             'library_layout'] # not needed since all HiC data has to be paired-end

In [10]:
# columns kept in the condensed table, in output order
final_cols = [
    'geo_id',
    'gsm_id',
    'run_accession',
    'title',
    'source',
    'description',
    'organism',
    'num_reads',
]

# write the selected columns, still under their original names, to Excel
orig_cols_fn = os.path.join(outdir, f'{gse_id}.meta.major_columns.original.xlsx')
meta.loc[:, final_cols].to_excel(orig_cols_fn, index=False)

In [11]:
# mapping from original column names to the names used in the final table
final_renames = {
    'run_accession': 'srr_id',
    'title': 'geo_title',
    'source': 'geo_source',
    'description': 'geo_description',
}

In [12]:
# select the final columns and apply the new column names
final_df = meta.loc[:, final_cols].rename(columns=final_renames)

# write the renamed table to its own Excel file in outdir
renamed_cols_fn = os.path.join(outdir, f'{gse_id}.meta.major_columns.renamed.xlsx')
final_df.to_excel(renamed_cols_fn, index=False)

In [13]:
# display the final renamed table as the cell output
final_df

Unnamed: 0,geo_id,gsm_id,srr_id,geo_title,geo_source,geo_description,organism,num_reads
0,GSE101498,GSM2705031,SRR5831479,mES HiChIP H3K27ac 25m biological replicate 1 ...,v6.5 embryonic stem cell line,Protein-enriched long-range contact; mES_HiChI...,Mus,373303396
1,GSE101498,GSM2705032,SRR5831480,mES HiChIP H3K27ac 25m biological replicate 1 ...,v6.5 embryonic stem cell line,Protein-enriched long-range contact; mES_HiChI...,Mus,269130644
2,GSE101498,GSM2705033,SRR5831481,mES HiChIP H3K27ac 25m biological replicate 2 ...,v6.5 embryonic stem cell line,Protein-enriched long-range contact; mES_HiChI...,Mus,306801366
3,GSE101498,GSM2705034,SRR5831482,mES HiChIP H3K27ac 25m biological replicate 2 ...,v6.5 embryonic stem cell line,Protein-enriched long-range contact; mES_HiChI...,Mus,359787238
4,GSE101498,GSM2705035,SRR5831483,mES HiChIP H3K27ac 500k biological replicate 1,v6.5 embryonic stem cell line,Protein-enriched long-range contact; mES_HiChI...,Mus,249255802
5,GSE101498,GSM2705036,SRR5831484,mES HiChIP H3K27ac 500k biological replicate 2,v6.5 embryonic stem cell line,Protein-enriched long-range contact; mES_HiChI...,Mus,288127916
6,GSE101498,GSM2705037,SRR5831485,mES HiChIP H3K27ac 100k biological replicate 1,v6.5 embryonic stem cell line,Protein-enriched long-range contact; mES_HiChI...,Mus,497373606
7,GSE101498,GSM2705038,SRR5831486,mES HiChIP H3K27ac 100k biological replicate 2,v6.5 embryonic stem cell line,Protein-enriched long-range contact; mES_HiChI...,Mus,115650484
8,GSE101498,GSM2705039,SRR5831487,mES HiChIP H3K27ac 50k biological replicate 1,v6.5 embryonic stem cell line,Protein-enriched long-range contact; mES_HiChI...,Mus,230922500
9,GSE101498,GSM2705040,SRR5831488,mES HiChIP H3K27ac 50k biological replicate 2,v6.5 embryonic stem cell line,Protein-enriched long-range contact; mES_HiChI...,Mus,716004720
