In [1]:
# Define the sample HT-Seq datafiles
datadir = '/data/BIDS-HPC/private/projects/dmi2/data/all_gene_expression_files_in_target/links'
file_fpkm = 'fffee315-9aa3-44d2-8c89-78a2c1d107e7.FPKM.txt'
file_fpkm_uq = 'fffee315-9aa3-44d2-8c89-78a2c1d107e7.FPKM-UQ.txt'
file_counts = 'fffee315-9aa3-44d2-8c89-78a2c1d107e7.htseq_counts.txt'

# Define the reference datafiles
gdc_tsv_file = '/data/BIDS-HPC/private/projects/dmi2/data/gencode.gene.info.v22.tsv'
gencode_gtf_file = '/data/BIDS-HPC/private/projects/dmi2/data/gencode.v22.annotation.gtf'

In [2]:
# Read in all the datafiles into Pandas dataframes
import pandas as pd
import os
df_fpkm = pd.read_csv(os.path.join(datadir, file_fpkm), sep='\t', names=['id','intensity'])
df_fpkm_uq = pd.read_csv(os.path.join(datadir, file_fpkm_uq), sep='\t', names=['id','intensity'])
df_count = pd.read_csv(os.path.join(datadir, file_counts), sep='\t', skipfooter=5, names=['id','intensity'])
df_samples = [df_count, df_fpkm, df_fpkm_uq]
df_gdc = pd.read_csv(gdc_tsv_file, sep='\t')
df_gencode = pd.read_csv(gencode_gtf_file, sep='\t', skiprows=5, header=None)
df_gencode_genes = df_gencode[df_gencode[2]=='gene'].reset_index(drop=True)
df_gencode_exons = df_gencode[df_gencode[2]=='exon'].reset_index(drop=True)

In [3]:
# Format the sample dataframes for consistency
for idf, df in enumerate(df_samples):
    df = df.set_index('id')
    df_samples[idf] = df.sort_index()

In [4]:
# Format the df_gencode_genes dataframe for consistency
df_gencode_genes['id'] = df_gencode_genes.apply(lambda x: x[8].split()[1].split('\"')[1], axis=1)
df_gencode_genes['type'] = df_gencode_genes.apply(lambda x: x[8].split()[3].split('\"')[1], axis=1)
df_gencode_genes['name'] = df_gencode_genes.apply(lambda x: x[8].split()[7].split('\"')[1], axis=1)
df_gencode_genes = df_gencode_genes.rename({3: 'start', 4: 'end', 6: 'strand', 0: 'seqname'}, axis='columns')
df_gencode_genes = df_gencode_genes.set_index('id')
df_gencode_genes = df_gencode_genes.sort_index()

In [5]:
# Format the df_gencode_exons dataframe for consistency
df_gencode_exons['id'] = df_gencode_exons.apply(lambda x: x[8].split()[1].split('\"')[1], axis=1)
df_gencode_exons['type'] = df_gencode_exons.apply(lambda x: x[8].split()[3].split('\"')[1], axis=1)
df_gencode_exons['name'] = df_gencode_exons.apply(lambda x: x[8].split()[7].split('\"')[1], axis=1)
df_gencode_exons = df_gencode_exons.rename({3: 'start', 4: 'end', 6: 'strand', 0: 'seqname'}, axis='columns')
df_gencode_exons = df_gencode_exons.set_index('id')
df_gencode_exons = df_gencode_exons.sort_index()

In [6]:
# Format the df_gdc dataframe for consistency
df_gdc = df_gdc.rename({'gene_id': 'id', 'gene_name': 'name', 'gene_type': 'type'}, axis='columns')
df_gdc = df_gdc.set_index('id')
df_gdc = df_gdc.sort_index()

In [7]:
# Check for column equality between the two reference datafiles
for colname in ['name', 'seqname', 'start', 'end', 'strand', 'type']:
    print(df_gdc[colname].equals(df_gencode_genes[colname]))

True
True
True
True
True
True


In [8]:
# Check that the ID columns of all five dataframes are exactly the same
dfs = df_samples + [df_gdc, df_gencode_genes]
ndfs = len(dfs)
import numpy as np
for idf1 in range(ndfs-1):
    for idf2 in np.array(range(ndfs-1-idf1)) + idf1+1:
        df1 = dfs[idf1]
        df2 = dfs[idf2]
        print(idf1, idf2, df1.index.equals(df2.index))

0 1 True
0 2 True
0 3 True
0 4 True
1 2 True
1 3 True
1 4 True
2 3 True
2 4 True
3 4 True


In [9]:
# Calculate the exon length of each gene (corresponding to its non-overlapping exons) and add this as a column to the df_gencode_genes dataframe
# Takes about 10 minutes

# Import relevant library
import numpy as np

# Set the number of steps to output so we can evaluate progress
nsteps = 100

# Set the step size in units of the size of the df_gencode_exons dataframe
unit_len = int(len(df_gencode_exons) / nsteps)

# Initialize some values
istep = 0 # the step that we're on
exon_lengths = [] # the array holding the final exon gene lengths (non-overlapping union of exon base pairs)
prev_idx = '' # set the previous index to null

# For every index in the ordered-by-index exons dataframe...
for iidx, idx in enumerate(df_gencode_exons.index):

    # Get the current row of data in the dataframe
    curr_row = df_gencode_exons.iloc[iidx,:]
    
    # Output progress if the time is right
    if (iidx%unit_len) == 0:
        print('{}/{} complete...'.format(istep,nsteps))
        istep = istep + 1

    # If the current index is not equal to the previous index...
    if idx != prev_idx:

        # If the previous index is not null (i.e., if this isn't the very first loop iteration and therefore base_pairs has been initialized below), calculate and store the number of unique base pairs for the current unique idx
        if prev_idx != '':
            exon_lengths.append(len(set(np.concatenate(base_pairs))))

        # Initialize the base_pairs holder (which will ultimately be a list of lists of base pairs)
        base_pairs = []

    # Always append the current set of base pairs corresponding to curr_row to the base_pairs list
    base_pairs.append(np.arange(curr_row['start'], curr_row['end']+1))

    # Set the previous index to the current index
    prev_idx = idx

# Calculate and store the number of unique base pairs for the final unique idx
exon_lengths.append(len(set(np.concatenate(base_pairs))))

# Add a column of exon gene length to the genes dataframe
df_gencode_genes['exon_length'] = exon_lengths

0/100 complete...
1/100 complete...
2/100 complete...
3/100 complete...
4/100 complete...
5/100 complete...
6/100 complete...
7/100 complete...
8/100 complete...
9/100 complete...
10/100 complete...
11/100 complete...
12/100 complete...
13/100 complete...
14/100 complete...
15/100 complete...
16/100 complete...
17/100 complete...
18/100 complete...
19/100 complete...
20/100 complete...
21/100 complete...
22/100 complete...
23/100 complete...
24/100 complete...
25/100 complete...
26/100 complete...
27/100 complete...
28/100 complete...
29/100 complete...
30/100 complete...
31/100 complete...
32/100 complete...
33/100 complete...
34/100 complete...
35/100 complete...
36/100 complete...
37/100 complete...
38/100 complete...
39/100 complete...
40/100 complete...
41/100 complete...
42/100 complete...
43/100 complete...
44/100 complete...
45/100 complete...
46/100 complete...
47/100 complete...
48/100 complete...
49/100 complete...
50/100 complete...
51/100 complete...
52/100 complete...
53/

In [22]:
# Show that we've reproduced what GDC calls the "exon_length" and what I'm assuming is probably the "aggregate_length" as well
df_gencode_genes = df_gencode_genes.rename({'aggregate_length': 'exon_length'}, axis='columns')
print(df_gencode_genes['exon_length'].equals(df_gdc['exon_length']))

# Show that using these exon lengths we have achieved adjusted counts that are proportional to the FPKM values
tmp = df_samples[0]['intensity'] / df_gencode_genes['exon_length'] / df_samples[1]['intensity']
tmp = tmp[tmp.notnull()]
print(tmp.std()/tmp.mean()*100, (tmp-tmp.mean()).abs().max()/tmp.mean()*100)

True
0.09319213127634611 3.132997043405492


In [27]:
tol = 1e-6
((df_fpkm['fpkm']/df_fpkm['fpkm'].sum() - df_fpkm_uq['fpkm-uq']/df_fpkm_uq['fpkm-uq'].sum()).abs() < tol).sum() # This line shows that the FPKM and FPKM-UQ files have the same values up to a fixed normalization
((df_counts['count']/df_counts['count'].sum() - df_fpkm_uq['fpkm-uq']/df_fpkm_uq['fpkm-uq'].sum()).abs() < tol).sum() # this line shows that this is not true of the counts

43200

In [30]:
#set(df_fpkm['id']) - set(df_gdc['gene_id'])
set(df_fpkm['id']) == set(df_gdc['gene_id'])

True

In [68]:
import numpy as np
arr = df_fpkm['fpkm'].to_numpy()
quantiles = np.quantile(arr, [0,0.25,0.5,0.75,1])
#arr[quantiles[0]<arr and arr<quantiles[3]]
print(arr[(quantiles[3]<arr) & (arr<quantiles[4])].sum())
1/(df_fpkm['fpkm'].max()/df_fpkm_uq['fpkm-uq'].max())

331163.3667


26613.247395329272

In [45]:
arr.max()/2

3865.25345

In [29]:
pd.qcut(df_fpkm['fpkm'], 4)

ValueError: Bin edges must be unique: array([0.0000000e+00, 0.0000000e+00, 4.7900000e-02, 8.8105000e-01,
       7.7305069e+03]).
You can drop duplicate edges by setting the 'duplicates' kwarg

In [9]:
df_counts

Unnamed: 0,id,count
0,ENSG00000000003.13,11205
1,ENSG00000000005.5,334
2,ENSG00000000419.11,4022
3,ENSG00000000457.12,539
4,ENSG00000000460.15,898
...,...,...
60478,ENSGR0000275287.3,0
60479,ENSGR0000276543.3,0
60480,ENSGR0000277120.3,0
60481,ENSGR0000280767.1,0


In [83]:
#df_gdc.index.equals(df_gencode.index)
#df_gdc['name'].equals(df_gencode['name'])
#df_gdc['seqname'].equals(df_gencode['seqname'])
#df_gdc['start'].equals(df_gencode['start'])
#df_gdc['end'].equals(df_gencode['end'])
#df_gdc['strand'].equals(df_gencode['strand'])
#df_gdc['type'].equals(df_gencode['type'])

KeyError: 'id'

In [70]:
df_gdc

Unnamed: 0_level_0,name,seqname,start,end,strand,type,gene_status,havana_gene,full_length,exon_length,exon_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000000003.13,TSPAN6,chrX,100627109,100639991,-,protein_coding,KNOWN,OTTHUMG00000022002.1,12883,4535,34
ENSG00000000005.5,TNMD,chrX,100584802,100599885,+,protein_coding,KNOWN,OTTHUMG00000022001.1,15084,1610,10
ENSG00000000419.11,DPM1,chr20,50934867,50958555,-,protein_coding,KNOWN,OTTHUMG00000032742.2,23689,1207,53
ENSG00000000457.12,SCYL3,chr1,169849631,169894267,-,protein_coding,KNOWN,OTTHUMG00000035941.4,44637,6883,63
ENSG00000000460.15,C1orf112,chr1,169662007,169854080,+,protein_coding,KNOWN,OTTHUMG00000035821.7,192074,5967,151
...,...,...,...,...,...,...,...,...,...,...,...
ENSGR0000275287.3,Metazoa_SRP,chrY,388100,388389,-,misc_RNA,NOVEL,,290,290,1
ENSGR0000276543.3,AJ271736.1,chrY,57209151,57209218,+,miRNA,NOVEL,,68,68,1
ENSGR0000277120.3,MIR6089,chrY,2609191,2609254,+,miRNA,KNOWN,,64,64,1
ENSGR0000280767.1,RP13-465B17.5,chrY,419157,421980,+,lincRNA,NOVEL,OTTHUMG00000189993.1,2824,515,2


In [71]:
df_gencode

Unnamed: 0_level_0,seqname,1,2,start,end,5,strand,7,8,type,name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000000003.13,chrX,HAVANA,gene,100627109,100639991,.,-,.,"gene_id ""ENSG00000000003.13""; gene_type ""prote...",protein_coding,TSPAN6
ENSG00000000005.5,chrX,HAVANA,gene,100584802,100599885,.,+,.,"gene_id ""ENSG00000000005.5""; gene_type ""protei...",protein_coding,TNMD
ENSG00000000419.11,chr20,HAVANA,gene,50934867,50958555,.,-,.,"gene_id ""ENSG00000000419.11""; gene_type ""prote...",protein_coding,DPM1
ENSG00000000457.12,chr1,HAVANA,gene,169849631,169894267,.,-,.,"gene_id ""ENSG00000000457.12""; gene_type ""prote...",protein_coding,SCYL3
ENSG00000000460.15,chr1,HAVANA,gene,169662007,169854080,.,+,.,"gene_id ""ENSG00000000460.15""; gene_type ""prote...",protein_coding,C1orf112
...,...,...,...,...,...,...,...,...,...,...,...
ENSGR0000275287.3,chrY,ENSEMBL,gene,388100,388389,.,-,.,"gene_id ""ENSGR0000275287.3""; gene_type ""misc_R...",misc_RNA,Metazoa_SRP
ENSGR0000276543.3,chrY,ENSEMBL,gene,57209151,57209218,.,+,.,"gene_id ""ENSGR0000276543.3""; gene_type ""miRNA""...",miRNA,AJ271736.1
ENSGR0000277120.3,chrY,ENSEMBL,gene,2609191,2609254,.,+,.,"gene_id ""ENSGR0000277120.3""; gene_type ""miRNA""...",miRNA,MIR6089
ENSGR0000280767.1,chrY,HAVANA,gene,419157,421980,.,+,.,"gene_id ""ENSGR0000280767.1""; gene_type ""lincRN...",lincRNA,RP13-465B17.5


In [17]:
df1 = pd.DataFrame({1: [10,11], 2: [20,21]}, index=['a','b'])
df2 = pd.DataFrame({1: [11,10], 2: [21,20]}, index=['b','a'])
df2.sort_index().equals(df1)

True

In [15]:
df2.sort_index()

Unnamed: 0,1,2
a,10,20
b,11,21


In [47]:
df_gencode['id'] = df_gencode.apply(lambda x: x[8].split()[1].split('\"')[1], axis=1)
df_gencode['type'] = df_gencode.apply(lambda x: x[8].split()[3].split('\"')[1], axis=1)
df_gencode['name'] = df_gencode.apply(lambda x: x[8].split()[7].split('\"')[1], axis=1)
df_gencode = df_gencode.rename({3: 'start', 4: 'stop'}, axis='columns')
df_gencode = df_gencode.set_index('id')
df_gencode = df_gencode.sort_index()

In [45]:
x = df_samples[0]['intensity'] / df_gdc['exon_length'] / df_samples[1]['intensity']
#x = df_samples[2]['intensity'] / df_samples[1]['intensity']
x[x.notnull()].std() / x[x.notnull()].mean() * 100

0.09319213127634611

In [15]:
df_gdc

Unnamed: 0_level_0,name,seqname,start,end,strand,type,gene_status,havana_gene,full_length,exon_length,exon_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000000003.13,TSPAN6,chrX,100627109,100639991,-,protein_coding,KNOWN,OTTHUMG00000022002.1,12883,4535,34
ENSG00000000005.5,TNMD,chrX,100584802,100599885,+,protein_coding,KNOWN,OTTHUMG00000022001.1,15084,1610,10
ENSG00000000419.11,DPM1,chr20,50934867,50958555,-,protein_coding,KNOWN,OTTHUMG00000032742.2,23689,1207,53
ENSG00000000457.12,SCYL3,chr1,169849631,169894267,-,protein_coding,KNOWN,OTTHUMG00000035941.4,44637,6883,63
ENSG00000000460.15,C1orf112,chr1,169662007,169854080,+,protein_coding,KNOWN,OTTHUMG00000035821.7,192074,5967,151
...,...,...,...,...,...,...,...,...,...,...,...
ENSGR0000275287.3,Metazoa_SRP,chrY,388100,388389,-,misc_RNA,NOVEL,,290,290,1
ENSGR0000276543.3,AJ271736.1,chrY,57209151,57209218,+,miRNA,NOVEL,,68,68,1
ENSGR0000277120.3,MIR6089,chrY,2609191,2609254,+,miRNA,KNOWN,,64,64,1
ENSGR0000280767.1,RP13-465B17.5,chrY,419157,421980,+,lincRNA,NOVEL,OTTHUMG00000189993.1,2824,515,2


In [13]:
df_fpkm

Unnamed: 0,id,intensity
0,ENSG00000000003.13,48.2789
1,ENSG00000000005.5,4.0536
2,ENSG00000000419.11,65.1115
3,ENSG00000000457.12,1.5301
4,ENSG00000000460.15,2.9406
...,...,...
60478,ENSGR0000275287.3,0.0000
60479,ENSGR0000276543.3,0.0000
60480,ENSGR0000277120.3,0.0000
60481,ENSGR0000280767.1,0.0000


In [48]:
df_gencode_full

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,chr1,HAVANA,gene,11869,14409,.,+,.,"gene_id ""ENSG00000223972.5""; gene_type ""transc..."
1,chr1,HAVANA,transcript,11869,14409,.,+,.,"gene_id ""ENSG00000223972.5""; transcript_id ""EN..."
2,chr1,HAVANA,exon,11869,12227,.,+,.,"gene_id ""ENSG00000223972.5""; transcript_id ""EN..."
3,chr1,HAVANA,exon,12613,12721,.,+,.,"gene_id ""ENSG00000223972.5""; transcript_id ""EN..."
4,chr1,HAVANA,exon,13221,14409,.,+,.,"gene_id ""ENSG00000223972.5""; transcript_id ""EN..."
...,...,...,...,...,...,...,...,...,...
2563666,chrM,ENSEMBL,transcript,15888,15953,.,+,.,"gene_id ""ENSG00000210195.2""; transcript_id ""EN..."
2563667,chrM,ENSEMBL,exon,15888,15953,.,+,.,"gene_id ""ENSG00000210195.2""; transcript_id ""EN..."
2563668,chrM,ENSEMBL,gene,15956,16023,.,-,.,"gene_id ""ENSG00000210196.2""; gene_type ""Mt_tRN..."
2563669,chrM,ENSEMBL,transcript,15956,16023,.,-,.,"gene_id ""ENSG00000210196.2""; transcript_id ""EN..."


In [15]:
#df_gencode_full.loc[df_gencode_full[2]=='gene',:].reindex()
#df_gencode_full[df_gencode_full[2]=='gene'].equals(pd.read_csv(gencode_gtf_file, sep='\t', skiprows=5, header=None))
df_gencode_full[df_gencode_full[2]=='gene'].reset_index(drop=True).equals(pd.read_csv(gencode_gtf_file, sep='\t', skiprows=5, header=None))

True

In [13]:
pd.read_csv(gencode_gtf_file, sep='\t', skiprows=5, header=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,chr1,HAVANA,gene,11869,14409,.,+,.,"gene_id ""ENSG00000223972.5""; gene_type ""transc..."
1,chr1,HAVANA,gene,14404,29570,.,-,.,"gene_id ""ENSG00000227232.5""; gene_type ""unproc..."
2,chr1,ENSEMBL,gene,17369,17436,.,-,.,"gene_id ""ENSG00000278267.1""; gene_type ""miRNA""..."
3,chr1,HAVANA,gene,29554,31109,.,+,.,"gene_id ""ENSG00000243485.3""; gene_type ""lincRN..."
4,chr1,ENSEMBL,gene,30366,30503,.,+,.,"gene_id ""ENSG00000274890.1""; gene_type ""miRNA""..."
...,...,...,...,...,...,...,...,...,...
60478,chrM,ENSEMBL,gene,14149,14673,.,-,.,"gene_id ""ENSG00000198695.2""; gene_type ""protei..."
60479,chrM,ENSEMBL,gene,14674,14742,.,-,.,"gene_id ""ENSG00000210194.1""; gene_type ""Mt_tRN..."
60480,chrM,ENSEMBL,gene,14747,15887,.,+,.,"gene_id ""ENSG00000198727.2""; gene_type ""protei..."
60481,chrM,ENSEMBL,gene,15888,15953,.,+,.,"gene_id ""ENSG00000210195.2""; gene_type ""Mt_tRN..."


In [20]:
# for id in df_gencode_genes[:5].index:
#     print(id)
df_gencode_exons.index[:100]

Index(['ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000005.5', 'ENSG00000000005.5',
       'ENSG00000000005.5', 'ENSG00000000005.5', 'ENSG00000000005.5',
       'ENSG00000000005.5', 'ENSG00000000005.5', 'ENSG00

In [25]:
aggregate_lengths = []
prev_idx = ''
for iidx, idx in enumerate(df_gencode_exons.index[:100]):
    row = df_gencode_exons.iloc[iidx,:]
    curr_len = row['end'] - row['start'] + 1
    print(row)
    if idx != prev_idx:
        if prev_idx != '':
            aggregate_lengths.append(tmp_len)
        tmp_len = 0
    tmp_len = tmp_len + curr_len
    prev_idx = idx
aggregate_lengths.append(tmp_len)

                                              KNOWN
Name: ENSG00000000419.11, dtype: object
seqname                                                chr20
1                                                     HAVANA
2                                                       exon
start                                               50945847
end                                                 50945923
5                                                          .
strand                                                     -
7                                                          .
8          gene_id "ENSG00000000419.11"; transcript_id "E...
type                                       ENST00000371582.7
name                                                   KNOWN
Name: ENSG00000000419.11, dtype: object
seqname                                                chr20
1                                                     HAVANA
2                                                       exon
start         

In [38]:
df_gencode_exons.index[:100]
tmp = df_gencode_exons[df_gencode_exons.index=='ENSG00000000457.12']
(tmp['end'] - tmp['start'] + 1).sum()

15985

In [12]:
mylist = [5,5,2,2,2,9,6,6,6]

In [17]:
aggregate_lengths = []
prev_num = -1
for num in mylist:
    if num != prev_num:
        if prev_num != -1:
            aggregate_lengths.append(tmp_len)
        tmp_len = 0
    tmp_len = tmp_len + num
    prev_num = num
aggregate_lengths.append(tmp_len)

In [159]:
df_gencode_genes[:10]

Unnamed: 0_level_0,seqname,1,2,start,end,5,strand,7,8,type,name,aggregate_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000000003.13,chrX,HAVANA,gene,100627109,100639991,.,-,.,"gene_id ""ENSG00000000003.13""; gene_type ""prote...",protein_coding,TSPAN6,8747
ENSG00000000005.5,chrX,HAVANA,gene,100584802,100599885,.,+,.,"gene_id ""ENSG00000000005.5""; gene_type ""protei...",protein_coding,TNMD,1881
ENSG00000000419.11,chr20,HAVANA,gene,50934867,50958555,.,-,.,"gene_id ""ENSG00000000419.11""; gene_type ""prote...",protein_coding,DPM1,5867
ENSG00000000457.12,chr1,HAVANA,gene,169849631,169894267,.,-,.,"gene_id ""ENSG00000000457.12""; gene_type ""prote...",protein_coding,SCYL3,15985
ENSG00000000460.15,chr1,HAVANA,gene,169662007,169854080,.,+,.,"gene_id ""ENSG00000000460.15""; gene_type ""prote...",protein_coding,C1orf112,21871
ENSG00000000938.11,chr1,HAVANA,gene,27612064,27635277,.,-,.,"gene_id ""ENSG00000000938.11""; gene_type ""prote...",protein_coding,FGR,12147
ENSG00000000971.14,chr1,HAVANA,gene,196651878,196747504,.,+,.,"gene_id ""ENSG00000000971.14""; gene_type ""prote...",protein_coding,CFH,15527
ENSG00000001036.12,chr6,HAVANA,gene,143494811,143511690,.,-,.,"gene_id ""ENSG00000001036.12""; gene_type ""prote...",protein_coding,FUCA2,3739
ENSG00000001084.9,chr6,HAVANA,gene,53497341,53616970,.,-,.,"gene_id ""ENSG00000001084.9""; gene_type ""protei...",protein_coding,GCLC,20982
ENSG00000001167.13,chr6,HAVANA,gene,41072945,41099976,.,+,.,"gene_id ""ENSG00000001167.13""; gene_type ""prote...",protein_coding,NFYA,5471


In [158]:
df_gdc[:10]

Unnamed: 0_level_0,name,seqname,start,end,strand,type,gene_status,havana_gene,full_length,exon_length,exon_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000000003.13,TSPAN6,chrX,100627109,100639991,-,protein_coding,KNOWN,OTTHUMG00000022002.1,12883,4535,34
ENSG00000000005.5,TNMD,chrX,100584802,100599885,+,protein_coding,KNOWN,OTTHUMG00000022001.1,15084,1610,10
ENSG00000000419.11,DPM1,chr20,50934867,50958555,-,protein_coding,KNOWN,OTTHUMG00000032742.2,23689,1207,53
ENSG00000000457.12,SCYL3,chr1,169849631,169894267,-,protein_coding,KNOWN,OTTHUMG00000035941.4,44637,6883,63
ENSG00000000460.15,C1orf112,chr1,169662007,169854080,+,protein_coding,KNOWN,OTTHUMG00000035821.7,192074,5967,151
ENSG00000000938.11,FGR,chr1,27612064,27635277,-,protein_coding,KNOWN,OTTHUMG00000003516.1,23214,3474,65
ENSG00000000971.14,CFH,chr1,196651878,196747504,+,protein_coding,KNOWN,OTTHUMG00000035607.5,95627,8145,62
ENSG00000001036.12,FUCA2,chr6,143494811,143511690,-,protein_coding,KNOWN,OTTHUMG00000015728.2,16880,2793,12
ENSG00000001084.9,GCLC,chr6,53497341,53616970,-,protein_coding,KNOWN,OTTHUMG00000160220.5,119630,8463,86
ENSG00000001167.13,NFYA,chr6,41072945,41099976,+,protein_coding,KNOWN,OTTHUMG00000014669.1,27032,3811,19


In [54]:
df_gencode_exons.index[:100]

Index(['ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000003.13', 'ENSG00000000003.13',
       'ENSG00000000003.13', 'ENSG00000000005.5', 'ENSG00000000005.5',
       'ENSG00000000005.5', 'ENSG00000000005.5', 'ENSG00000000005.5',
       'ENSG00000000005.5', 'ENSG00000000005.5', 'ENSG00

In [169]:
df_gencode_exons[df_gencode_exons.index=='ENSG00000000005.5']
#df_gencode_exons[df_gencode_exons.index=='ENSG00000001036.12']

Unnamed: 0_level_0,seqname,1,2,start,end,5,strand,7,8,type,name,exon_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000000005.5,chrX,HAVANA,exon,100594261,100594362,.,+,.,"gene_id ""ENSG00000000005.5""; transcript_id ""EN...",ENST00000485971.1,KNOWN,102
ENSG00000000005.5,chrX,HAVANA,exon,100584802,100585066,.,+,.,"gene_id ""ENSG00000000005.5""; transcript_id ""EN...",ENST00000373031.4,KNOWN,265
ENSG00000000005.5,chrX,HAVANA,exon,100585231,100585362,.,+,.,"gene_id ""ENSG00000000005.5""; transcript_id ""EN...",ENST00000373031.4,KNOWN,132
ENSG00000000005.5,chrX,HAVANA,exon,100593895,100594035,.,+,.,"gene_id ""ENSG00000000005.5""; transcript_id ""EN...",ENST00000373031.4,KNOWN,141
ENSG00000000005.5,chrX,HAVANA,exon,100594261,100594362,.,+,.,"gene_id ""ENSG00000000005.5""; transcript_id ""EN...",ENST00000373031.4,KNOWN,102
ENSG00000000005.5,chrX,HAVANA,exon,100597504,100597657,.,+,.,"gene_id ""ENSG00000000005.5""; transcript_id ""EN...",ENST00000373031.4,KNOWN,154
ENSG00000000005.5,chrX,HAVANA,exon,100599016,100599182,.,+,.,"gene_id ""ENSG00000000005.5""; transcript_id ""EN...",ENST00000373031.4,KNOWN,167
ENSG00000000005.5,chrX,HAVANA,exon,100599508,100599885,.,+,.,"gene_id ""ENSG00000000005.5""; transcript_id ""EN...",ENST00000373031.4,KNOWN,378
ENSG00000000005.5,chrX,HAVANA,exon,100593624,100594035,.,+,.,"gene_id ""ENSG00000000005.5""; transcript_id ""EN...",ENST00000485971.1,KNOWN,412
ENSG00000000005.5,chrX,HAVANA,exon,100597504,100597531,.,+,.,"gene_id ""ENSG00000000005.5""; transcript_id ""EN...",ENST00000485971.1,KNOWN,28


In [74]:
243 - df_gencode_exons[df_gencode_exons.index=='ENSG00000000005.5']['exon_length']

id
ENSG00000000005.5    141
ENSG00000000005.5    -22
ENSG00000000005.5    111
ENSG00000000005.5    102
ENSG00000000005.5    141
ENSG00000000005.5     89
ENSG00000000005.5     76
ENSG00000000005.5   -135
ENSG00000000005.5   -169
ENSG00000000005.5    215
Name: exon_length, dtype: int64

In [162]:
for ivalue, value in enumerate(df_gencode_exons[df_gencode_exons.index=='ENSG00000000005.5'][8]): # ENSGR0000281849.1
#for value in df_gencode_exons[df_gencode_exons.index=='ENSGR0000281849.1'][8]:
#for value in df_gencode_exons[df_gencode_exons.index=='ENSG00000001036.12'][8]:
    print(ivalue, value.split()[22])

0 transcript_support_level
1 protein_id
2 protein_id
3 protein_id
4 protein_id
5 protein_id
6 protein_id
7 protein_id
8 transcript_support_level
9 transcript_support_level


In [176]:
import numpy as np
set(np.concatenate([np.arange(1,5+1), np.arange(4,20+1)]))

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}

In [10]:
df_gencode_genes

Unnamed: 0_level_0,seqname,1,2,start,end,5,strand,7,8,type,name,aggregate_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSG00000000003.13,chrX,HAVANA,gene,100627109,100639991,.,-,.,"gene_id ""ENSG00000000003.13""; gene_type ""prote...",protein_coding,TSPAN6,4535
ENSG00000000005.5,chrX,HAVANA,gene,100584802,100599885,.,+,.,"gene_id ""ENSG00000000005.5""; gene_type ""protei...",protein_coding,TNMD,1610
ENSG00000000419.11,chr20,HAVANA,gene,50934867,50958555,.,-,.,"gene_id ""ENSG00000000419.11""; gene_type ""prote...",protein_coding,DPM1,1207
ENSG00000000457.12,chr1,HAVANA,gene,169849631,169894267,.,-,.,"gene_id ""ENSG00000000457.12""; gene_type ""prote...",protein_coding,SCYL3,6883
ENSG00000000460.15,chr1,HAVANA,gene,169662007,169854080,.,+,.,"gene_id ""ENSG00000000460.15""; gene_type ""prote...",protein_coding,C1orf112,5967
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSGR0000275287.3,chrY,ENSEMBL,gene,388100,388389,.,-,.,"gene_id ""ENSGR0000275287.3""; gene_type ""misc_R...",misc_RNA,Metazoa_SRP,290
ENSGR0000276543.3,chrY,ENSEMBL,gene,57209151,57209218,.,+,.,"gene_id ""ENSGR0000276543.3""; gene_type ""miRNA""...",miRNA,AJ271736.1,68
ENSGR0000277120.3,chrY,ENSEMBL,gene,2609191,2609254,.,+,.,"gene_id ""ENSGR0000277120.3""; gene_type ""miRNA""...",miRNA,MIR6089,64
ENSGR0000280767.1,chrY,HAVANA,gene,419157,421980,.,+,.,"gene_id ""ENSGR0000280767.1""; gene_type ""lincRN...",lincRNA,RP13-465B17.5,515


In [11]:
df_gdc

Unnamed: 0_level_0,name,seqname,start,end,strand,type,gene_status,havana_gene,full_length,exon_length,exon_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000000003.13,TSPAN6,chrX,100627109,100639991,-,protein_coding,KNOWN,OTTHUMG00000022002.1,12883,4535,34
ENSG00000000005.5,TNMD,chrX,100584802,100599885,+,protein_coding,KNOWN,OTTHUMG00000022001.1,15084,1610,10
ENSG00000000419.11,DPM1,chr20,50934867,50958555,-,protein_coding,KNOWN,OTTHUMG00000032742.2,23689,1207,53
ENSG00000000457.12,SCYL3,chr1,169849631,169894267,-,protein_coding,KNOWN,OTTHUMG00000035941.4,44637,6883,63
ENSG00000000460.15,C1orf112,chr1,169662007,169854080,+,protein_coding,KNOWN,OTTHUMG00000035821.7,192074,5967,151
...,...,...,...,...,...,...,...,...,...,...,...
ENSGR0000275287.3,Metazoa_SRP,chrY,388100,388389,-,misc_RNA,NOVEL,,290,290,1
ENSGR0000276543.3,AJ271736.1,chrY,57209151,57209218,+,miRNA,NOVEL,,68,68,1
ENSGR0000277120.3,MIR6089,chrY,2609191,2609254,+,miRNA,KNOWN,,64,64,1
ENSGR0000280767.1,RP13-465B17.5,chrY,419157,421980,+,lincRNA,NOVEL,OTTHUMG00000189993.1,2824,515,2
