# Process metadata

This notebook checks each experiment id is associated with gene expression data, via the run id, and returns a clean list of experiment ids that have gene expression data.  

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import glob
import pandas as pd
import numpy as np
import random

import warnings
warnings.filterwarnings(action='ignore')

sys.path.append("../")

from numpy.random import seed
randomState = 123
seed(randomState)

In [2]:
# User parameters
dataset_name = "Human_analysis"

In [3]:
# Input files

# base dir on repo
base_dir = os.path.abspath(os.path.join(os.getcwd(),"../.."))  

mapping_file = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "metadata",
    "recount2_metadata.tsv")

normalized_data_file = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "input",
    "recount2_gene_normalized_data.tsv.xz")

In [4]:
# Output file
experiment_id_file = os.path.join(
    base_dir, 
    dataset_name,
    "data",
    "metadata", 
    "recount2_experiment_ids.txt")

### Get experiment ids

In [5]:
# Read in metadata
metadata = pd.read_table(
    mapping_file, 
    header=0, 
    sep='\t', 
    index_col=0)

metadata.head()

Unnamed: 0_level_0,sample,experiment,run,read_count_as_reported_by_sra,reads_downloaded,proportion_of_reads_reported_by_sra_downloaded,paired_end,sra_misreported_paired_end,mapped_read_count,auc,sharq_beta_tissue,sharq_beta_cell_type,biosample_submission_date,biosample_publication_date,biosample_update_date,avg_read_length,geo_accession,bigwig_file,title,characteristics
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
DRP000366,DRS000577,DRX000547,DRR000897,22635328.0,22635328.0,1.0,False,False,22355123.0,824970300.0,,,2013-01-18T09:05:14.983,2013-01-07T00:00:00.000,2014-11-12T03:28:52.000,37.0,,DRR000897.bw,,
DRP000425,DRS000730,DRX000770,DRR001173,26183592.0,26183592.0,1.0,False,False,23921095.0,856110500.0,,,2012-07-19T09:34:23.143,2012-06-30T00:00:00.000,2014-11-12T03:28:52.000,36.0,,DRR001173.bw,,
DRP000425,DRS000731,DRX000771,DRR001174,44305116.0,44305116.0,1.0,False,False,37455503.0,1413731000.0,,,2012-07-19T09:34:23.186,2012-06-30T00:00:00.000,2014-11-12T03:28:52.000,38.0,,DRR001174.bw,,
DRP000425,DRS000732,DRX000772,DRR001175,33547222.0,33547222.0,1.0,False,False,30752624.0,1163607000.0,,,2012-07-19T09:34:22.996,2012-06-30T00:00:00.000,2014-11-12T03:28:52.000,38.0,,DRR001175.bw,,
DRP000425,DRS000733,DRX000773,DRR001176,29691234.0,29691234.0,1.0,False,False,24349633.0,863415200.0,,,2012-07-19T09:34:23.233,2012-06-30T00:00:00.000,2014-11-12T03:28:52.000,36.0,,DRR001176.bw,,


In [6]:
map_experiment_sample = metadata[['run']]
map_experiment_sample.head()

Unnamed: 0_level_0,run
project,Unnamed: 1_level_1
DRP000366,DRR000897
DRP000425,DRR001173
DRP000425,DRR001174
DRP000425,DRR001175
DRP000425,DRR001176


In [7]:
experiment_ids = np.unique(np.array(map_experiment_sample.index)).tolist()
print("There are {} experiments in the compendium".format(len(experiment_ids)))

There are 3219 experiments in the compendium


### Get sample ids from gene expression data

In [8]:
normalized_data = pd.read_table(
    normalized_data_file,
    header=0,
    sep='\t',
    index_col=0).T

normalized_data.head()

Unnamed: 0,ENSG00000000003.14,ENSG00000000005.5,ENSG00000000419.12,ENSG00000000457.13,ENSG00000000460.16,ENSG00000000938.12,ENSG00000000971.15,ENSG00000001036.13,ENSG00000001084.10,ENSG00000001167.14,...,ENSG00000283690.1,ENSG00000283691.1,ENSG00000283692.1,ENSG00000283693.1,ENSG00000283694.1,ENSG00000283695.1,ENSG00000283696.1,ENSG00000283697.1,ENSG00000283698.1,ENSG00000283699.1
SRR592745,0.818386,0.002542,0.035576,0.011681,0.007931,5.1e-05,0.345671,0.071517,0.051778,0.015726,...,0.0,0.001461,0.000881,0.0,0.0,0.0,0.002207,0.012247,0.0,0.0
SRR592746,0.428429,0.00137,0.049366,0.005159,0.00646,0.000436,0.220094,0.032957,0.038891,0.018589,...,0.0,0.0,0.0,0.0,0.009428,0.0,0.0,0.008735,0.0,0.0
SRR592747,0.527943,0.001268,0.103888,0.005841,0.006026,3.6e-05,0.275965,0.055772,0.057488,0.022814,...,0.0,0.002325,0.0022,0.002824,0.0,0.0,0.001086,0.015065,0.009785,0.0
SRR592748,0.183543,0.049132,0.048419,0.01045,0.012363,7.4e-05,0.130054,0.077064,0.018981,0.043705,...,0.0,0.00252,0.0,0.0,0.0,0.0,0.002586,0.019263,0.0,0.0
SRR592749,0.254788,0.389071,0.065743,0.009432,0.0145,3.5e-05,0.087228,0.081598,0.015481,0.048301,...,0.0,0.000193,0.0,0.0,0.0,0.0,0.001194,0.015296,0.0,0.0


In [9]:
sample_ids_with_gene_expression = list(normalized_data.index)

### Get samples belonging to selected experiment

In [10]:
experiment_ids_with_gene_expression = []

for experiment_id in experiment_ids:
    
    # Some project id values are descriptions
    # We will skip these
    if len(experiment_id) == 9:
        print(experiment_id)
        selected_metadata = metadata.loc[experiment_id]

        #print("There are {} samples in experiment {}".format(selected_metadata.shape[0], experiment_id))
        sample_ids = list(selected_metadata['run'])

        if any(x in sample_ids_with_gene_expression for x in sample_ids):
            experiment_ids_with_gene_expression.append(experiment_id)
        
print('There are {} experiments with gene expression data'.format(len(experiment_ids_with_gene_expression)))

DRP000366
DRP000425
DRP000464
DRP000499
DRP000527
DRP000622
DRP000665
DRP000929
DRP000987
DRP001048
DRP001055
DRP001149
DRP001150
DRP001194
DRP001219
DRP001220
DRP001280
DRP001358
DRP001797
DRP001919
DRP002380
DRP002435
DRP002586
DRP002623
DRP002625
DRP002667
DRP002672
DRP002712
DRP002721
DRP002835
DRP002851
DRP002860
ERP000546
ERP000573
ERP000619
ERP000710
ERP000787
ERP000799
ERP000959
ERP000992
ERP001115
ERP001304
ERP001317
ERP001344
ERP001458
ERP001574
ERP001782
ERP001828
ERP001895
ERP001908
ERP001942
ERP001948
ERP001971
ERP002021
ERP002045
ERP002049
ERP002063
ERP002075
ERP002232
ERP002414
ERP002588
ERP003259
ERP003460
ERP003467
ERP003471
ERP003495
ERP003536
ERP003613
ERP003617
ERP003731
ERP003789
ERP003791
ERP003815
ERP003917
ERP003933
ERP003984
ERP004006
ERP004043
ERP004062
ERP004078
ERP004094
ERP004151
ERP004209
ERP004211
ERP004219
ERP004269
ERP004270
ERP004298
ERP004352
ERP004375
ERP004399
ERP004402
ERP004573
ERP004578
ERP004592
ERP004617
ERP004682
ERP004683
ERP004684
ERP004697


SRP035554
SRP035599
SRP035617
SRP035634
SRP035638
SRP035641
SRP035665
SRP035670
SRP035679
SRP035862
SRP035864
SRP035883
SRP035930
SRP035934
SRP035988
SRP036029
SRP036035
SRP036053
SRP036133
SRP036136
SRP036145
SRP036595
SRP036769
SRP036790
SRP036821
SRP036840
SRP036843
SRP036848
SRP037550
SRP037553
SRP037579
SRP037718
SRP037719
SRP037722
SRP037735
SRP037762
SRP037775
SRP037778
SRP037971
SRP037982
SRP038006
SRP038101
SRP038143
SRP038695
SRP038702
SRP038726
SRP038759
SRP038761
SRP038767
SRP038863
SRP038919
SRP038921
SRP038925
SRP038963
SRP038964
SRP038969
SRP038987
SRP039039
SRP039077
SRP039085
SRP039338
SRP039346
SRP039348
SRP039354
SRP039357
SRP039359
SRP039361
SRP039397
SRP039399
SRP039460
SRP039552
SRP039559
SRP039591
SRP039598
SRP039694
SRP039909
SRP040014
SRP040070
SRP040110
SRP040117
SRP040136
SRP040145
SRP040236
SRP040243
SRP040275
SRP040278
SRP040288
SRP040292
SRP040300
SRP040309
SRP040327
SRP040328
SRP040418
SRP040421
SRP040442
SRP040454
SRP040472
SRP040505
SRP040525
SRP040547


SRP063840
SRP063860
SRP063867
SRP063889
SRP063948
SRP063978
SRP063980
SRP064131
SRP064138
SRP064142
SRP064143
SRP064149
SRP064259
SRP064264
SRP064270
SRP064316
SRP064317
SRP064321
SRP064323
SRP064378
SRP064410
SRP064454
SRP064457
SRP064458
SRP064464
SRP064475
SRP064481
SRP064538
SRP064547
SRP064561
SRP064562
SRP064624
SRP064625
SRP064652
SRP064661
SRP064671
SRP064735
SRP064781
SRP064783
SRP064803
SRP064820
SRP064863
SRP064894
SRP064956
SRP064967
SRP065022
SRP065114
SRP065120
SRP065127
SRP065146
SRP065153
SRP065196
SRP065202
SRP065219
SRP065258
SRP065282
SRP065330
SRP065445
SRP065451
SRP065468
SRP065491
SRP065500
SRP065559
SRP065763
SRP065774
SRP065812
SRP065848
SRP065849
SRP065988
SRP066008
SRP066009
SRP066150
SRP066151
SRP066152
SRP066356
SRP066363
SRP066371
SRP066394
SRP066424
SRP066449
SRP066482
SRP066484
SRP066488
SRP066596
SRP066729
SRP066834
SRP066889
SRP066895
SRP066912
SRP066917
SRP066934
SRP066956
SRP066959
SRP066994
SRP067173
SRP067214
SRP067221
SRP067312
SRP067318
SRP067469


In [11]:
experiment_ids_with_gene_expression_df = pd.DataFrame(experiment_ids_with_gene_expression, columns=['experiment_id'])
experiment_ids_with_gene_expression_df.head()

Unnamed: 0,experiment_id
0,DRP001149
1,ERP005953
2,ERP008992
3,SRP000762
4,SRP003902


In [12]:
# Save simulated data
experiment_ids_with_gene_expression_df.to_csv(experiment_id_file, sep='\t')