In [1]:
import os
import glob
import sys
import pandas as pd
import tracker
# last entry of tracker.processing_dates (presumably the most recent run -- confirm in tracker)
latest_date = tracker.processing_dates[-1]

# every relative path used below is resolved against this project directory
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/')

# widen the pandas display limits so long/wide samplesheets are not truncated;
# option names are written out in full because abbreviated keys are resolved by
# pattern matching and can become ambiguous when pandas adds new options
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_colwidth', 400)

In [2]:
# display the processing date that is used below to build the samplesheet path
latest_date

'2022.07.03.17.47'

## Check Google Samplesheet at SRR Level

In [3]:
# pick the samplesheet path: the first command-line argument when run as a
# script, or the path derived from latest_date when run inside a Jupyter kernel
running_in_notebook = 'ipykernel_launcher.py' in sys.argv[0]
if not running_in_notebook:
    input_fn = sys.argv[1]
    #output_prefix = sys.argv[2]
else:
    input_fn = 'results/samplesheets/fastq/{}.fastq.google-samplesheet.tsv'.format(latest_date)
    #output_prefix = 'results/samplesheets/fastq/{}.hicpro.samplesheet'.format(latest_date)

In [4]:
# read the tab-separated samplesheet; it lists both downloaded and
# not-yet-downloaded entries
df = pd.read_csv(input_fn, sep='\t')

In [5]:
# extract those samples which are downloaded according to google tracker
# (rows whose 'Download Status' equals 1)
is_downloaded = df['Download Status'] == 1

# extract only the columns needed
major_cols = ['Sample Name (as used in the server)',
 'GSE ID',
 'GSM ID',
 'SRR ID',
 'Organism',
 'Biological Replicate Serial No',
 'Technical Replicate Serial No',
 'ChIP-seq Pull Down',
 'Restriction Enzyme']

# the explicit .copy() makes downloaded_df an independent frame, so the column
# assignments performed on it in later cells are not treated by pandas as
# writes to a selection of another frame
downloaded_df = df.loc[is_downloaded].reset_index(drop=True)[major_cols].copy()

In [6]:
# capitalizes organism
def parse_organism(string):
    """Return `string` with each whitespace-separated word capitalized and the
    words joined by underscores (e.g. 'homo sapiens' -> 'Homo_Sapiens')."""
    return '_'.join(word.capitalize() for word in string.split())

# rewrite every organism label through parse_organism
organism_labels = downloaded_df.loc[:, 'Organism']
downloaded_df.loc[:, 'Organism'] = organism_labels.apply(parse_organism)

In [7]:
# getting the sample names
# Each name has the form <sample>.<gse>.<organism>.<antibody_target>.b<bio_rep>.
# The fields are looked up by column label instead of by integer position:
# integer keys on a label-indexed Series are deprecated in recent pandas, and
# labels stay correct if the column order ever changes.
name_cols = ['Sample Name (as used in the server)',
             'GSE ID',
             'Organism',
             'ChIP-seq Pull Down',
             'Biological Replicate Serial No']
name_template = '{}.{}.{}.{}.b{}'

# walk the five columns in parallel, producing one formatted name per row
sample_names = [name_template.format(*fields)
                for fields in zip(*(downloaded_df[col] for col in name_cols))]
downloaded_df.loc[:, 'sample_name'] = sample_names

In [8]:
# renaming the columns for easy computational use
# NOTE: the assignment is positional, so the list must stay in the current
# column order; the raw sample-name column becomes 'sample_name' and the
# composite name appended last becomes 'std_sample_name'
computational_cols = [
    'sample_name',
    'gse_id',
    'gsm_id',
    'srr_id',
    'organism',
    'bio_rep',
    'tech_rep',
    'antibody_target',
    'restriction_enzyme',
    'std_sample_name',
]
downloaded_df.columns = computational_cols

In [9]:
# reorder the columns so the standardized sample name comes first and the
# raw sample name last
reorder = ['std_sample_name', 'gse_id', 'gsm_id', 'srr_id', 'organism',
           'bio_rep', 'tech_rep', 'antibody_target', 'restriction_enzyme',
           'sample_name']
downloaded_df = downloaded_df.loc[:, reorder]
print("full google samplesheet: all of following SRRs were downloaded")
downloaded_df

full google samplesheet: all of following SRRs were downloaded


Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name
0,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b1,GSE101498,GSM2705041,SRR5831489,Homo_Sapiens,1,1,H3K27ac,MboI,GM12878
1,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,GSM2705042,SRR5831490,Homo_Sapiens,2,1,H3K27ac,MboI,GM12878
2,K562.GSE101498.Homo_Sapiens.H3K27ac.b1,GSE101498,GSM2705043,SRR5831491,Homo_Sapiens,1,1,H3K27ac,MboI,K562
3,K562.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,GSM2705044,SRR5831492,Homo_Sapiens,2,1,H3K27ac,MboI,K562
4,K562.GSE101498.Homo_Sapiens.H3K27ac.b3,GSE101498,GSM2705045,SRR5831493,Homo_Sapiens,3,1,H3K27ac,MboI,K562
5,MyLa.GSE101498.Homo_Sapiens.H3K27ac.b1,GSE101498,GSM2705046,SRR5831494,Homo_Sapiens,1,1,H3K27ac,MboI,MyLa
6,MyLa.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,GSM2705047,SRR5831495,Homo_Sapiens,2,1,H3K27ac,MboI,MyLa
7,Naive_Tcells.GSE101498.Homo_Sapiens.H3K27ac.b1,GSE101498,GSM2705048,SRR5831496,Homo_Sapiens,1,1,H3K27ac,MboI,Naive_Tcells
8,Naive_Tcells.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,GSM2705049,SRR5831497,Homo_Sapiens,2,1,H3K27ac,MboI,Naive_Tcells
9,Naive_Tcells.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,GSM2705050,SRR5831498,Homo_Sapiens,2,2,H3K27ac,MboI,Naive_Tcells


In [10]:
# one row per SRR; the file count is twice the SRR count (two files per SRR)
expected_srr_count = len(downloaded_df)
print("number of srr's expected downloaded:", expected_srr_count)
print("nubmer of srr files expected:", expected_srr_count*2)
print("duplicate srrs in google samplesheet:")
# rows whose srr_id already appeared earlier in the sheet
is_repeated_srr = downloaded_df['srr_id'].duplicated()
print(downloaded_df.loc[is_repeated_srr, 'srr_id'])

number of srr's expected downloaded: 474
nubmer of srr files expected: 948
duplicate srrs in google samplesheet:
Series([], Name: srr_id, dtype: object)


## Check Google Samplesheet at Sample/Bio Rep Level

In [11]:
# keep one row per (sample, GSE, GSM, SRR) combination and discard the
# technical-replicate column
dedup_keys = ['std_sample_name', 'gse_id', 'gsm_id', 'srr_id']
bio_df = (
    downloaded_df
    .drop_duplicates(subset=dedup_keys)
    .reset_index(drop=True)
    .drop(columns=['tech_rep'])
)

In [12]:
# columns kept at the sample / biological-replicate level, in display order
reorder = ['std_sample_name', 'gse_id', 'organism', 'bio_rep',
           'antibody_target', 'restriction_enzyme', 'sample_name']

In [13]:
# select the bio-rep level columns, then keep the first row seen for each
# standardized sample name
bio_df = (
    bio_df[reorder]
    .drop_duplicates(subset='std_sample_name')
    .reset_index(drop=True)
)
bio_df

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name
0,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b1,GSE101498,Homo_Sapiens,1,H3K27ac,MboI,GM12878
1,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,Homo_Sapiens,2,H3K27ac,MboI,GM12878
2,K562.GSE101498.Homo_Sapiens.H3K27ac.b1,GSE101498,Homo_Sapiens,1,H3K27ac,MboI,K562
3,K562.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,Homo_Sapiens,2,H3K27ac,MboI,K562
4,K562.GSE101498.Homo_Sapiens.H3K27ac.b3,GSE101498,Homo_Sapiens,3,H3K27ac,MboI,K562
5,MyLa.GSE101498.Homo_Sapiens.H3K27ac.b1,GSE101498,Homo_Sapiens,1,H3K27ac,MboI,MyLa
6,MyLa.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,Homo_Sapiens,2,H3K27ac,MboI,MyLa
7,Naive_Tcells.GSE101498.Homo_Sapiens.H3K27ac.b1,GSE101498,Homo_Sapiens,1,H3K27ac,MboI,Naive_Tcells
8,Naive_Tcells.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,Homo_Sapiens,2,H3K27ac,MboI,Naive_Tcells
9,Naive_Tcells.GSE101498.Homo_Sapiens.H3K27ac.b3,GSE101498,Homo_Sapiens,3,H3K27ac,MboI,Naive_Tcells


In [14]:
# one folder is expected per row of bio_df
print("number of sample folders expected:", bio_df.shape[0])

number of sample folders expected: 338


## Check Actual Downloads in Directory

In [15]:
# Compare the sample folders found on disk with the sample names expected from
# the samplesheet.
organism = 'Homo_Sapiens'

# Building the frame from a dict keeps the 'std_sample_name' column even when
# no folder matches (a frame built from an empty list has no columns to rename).
actual_bio_df = pd.DataFrame({
    'std_sample_name': [sample for sample in os.listdir('results/fastqs/raw/') if organism in sample]
})
print("number of sample folders actual:", len(actual_bio_df))
# NOTE(review): the folder listing is filtered by organism but bio_df is not --
# confirm the samplesheet only contains this organism.
if len(bio_df) == len(actual_bio_df):
    print("number of bio reps in google matches number of bio reps in dir:", len(actual_bio_df))
else:
    print("number of bio reps in google does not match number of bio reps in dir")

actual_bio_df = actual_bio_df.sort_values(by=['std_sample_name'], ascending=True).reset_index(drop=True)
actual_names = actual_bio_df['std_sample_name'].to_list()
bio_df = bio_df.sort_values(by=['std_sample_name'], ascending=True).reset_index(drop=True)
expected_names = bio_df['std_sample_name'].to_list()

# Compare the two name collections as sets. Pairing the sorted lists by index
# fails with IndexError when folders are missing, never looks at surplus
# folders, and reports every later name as a mismatch after a single gap.
missing_names = sorted(set(expected_names) - set(actual_names))
unexpected_names = sorted(set(actual_names) - set(expected_names))

for name in missing_names:
    print("name mismatch, expected", name, "but no such folder exists")
for name in unexpected_names:
    print("name mismatch, folder", name, "is not expected by the samplesheet")


number of sample folders actual: 338
number of bio reps in google matches number of bio reps in dir: 338


In [16]:
# check that all folders have expected SRRs in them
def find_srr_problems(sample, srr_files, expected_srrs):
    """Return the warning messages for one sample folder.

    Parameters
    ----------
    sample : str
        Folder path, used only to label the messages.
    srr_files : list of str
        Names of the SRR files found in the folder (file names, not paths).
    expected_srrs : list of str
        SRR IDs the samplesheet lists for this sample.

    Returns
    -------
    list of str
        One message per detected problem; empty when the folder looks right.
    """
    problems = []
    srr_1_files = [srr_file for srr_file in srr_files if '_1' in srr_file]
    srr_2_files = [srr_file for srr_file in srr_files if '_2' in srr_file]

    # The SRR ID is the part of the file name before the first underscore.
    # Compare as sets: list.sort() sorts in place and returns None, so
    # comparing the results of two .sort() calls can never report a difference.
    found_srrs = {srr_file.split('_')[0] for srr_file in srr_files}
    if set(expected_srrs) != found_srrs:
        problems.append('{} {}'.format("this sample has missing or extra SRR IDs downloaded:", sample))

    if len(srr_files) % 2 != 0:
        problems.append('{} {}'.format('this sample has an odd number of SRR files:', sample))
    if len(srr_1_files) != len(srr_2_files):
        problems.append('{} {}'.format('this sample does not have an equal number of R1/R2 files:', sample))
    # one message per offending file
    for srr_file in srr_files:
        if '_1' not in srr_file and '_2' not in srr_file:
            problems.append('{} {}'.format('this sample has a srr file without R1 or R2 designation:', sample))
    return problems


samples = glob.glob('results/fastqs/raw/*Homo_Sapiens*/')

for sample in samples:
    # list the folder directly instead of chdir-ing into it, so the working
    # directory is never left inside a sample folder if something fails
    srr_files = sorted(name for name in os.listdir(sample) if name.startswith('SRR'))

    # folder name without the trailing slash, independent of the path depth
    folder_name = os.path.basename(os.path.normpath(sample))
    expected = downloaded_df.loc[downloaded_df['std_sample_name'] == folder_name]
    expected_srrs = expected['srr_id'].to_list()

    for problem in find_srr_problems(sample, srr_files, expected_srrs):
        print(problem)

    

In [17]:
# Compare the total number of SRR IDs / SRR files on disk with the samplesheet.
srrs = glob.glob('results/fastqs/raw/*Homo_Sapiens*/SRR*')
# keep only the file name, independent of how deep the path is
srrs = [os.path.basename(srr) for srr in srrs]
# the SRR ID is the part of the file name before the first underscore
srr_ids = [srr.split('_')[0] for srr in srrs]
# de-duplicate while preserving first-seen order
uniq_srr_ids = list(dict.fromkeys(srr_ids))

if len(uniq_srr_ids) == len(downloaded_df):
    print("number of SRR ids present matches:", len(uniq_srr_ids))
else:
    # report both counts, in the same form as the file-count message below
    print("number of SRR ids present does not match:", len(uniq_srr_ids), "vs.", len(downloaded_df))

# two files are expected per SRR in the samplesheet
if len(srr_ids) == len(downloaded_df)*2:
    print("number of SRR files present matches:", len(srr_ids))
else:
    print("number of SRR files present does not match:", len(srr_ids), "vs.",  len(downloaded_df)*2)

number of SRR ids present matches: 474
number of SRR files present matches: 948
