In [1]:
# find HepG2 essential genes
import pandas as pd
from pybedtools import BedTool
dep_score = pd.read_csv('D2_DRIVE_gene_dep_scores.csv')
# first we need to split the names out!
dep_score['Symbol'] = dep_score['Unnamed: 0'].str.split(' ', expand = True)[0]
dep_score.drop('Unnamed: 0', inplace = True, axis = 1)
dep_score.set_index('Symbol', inplace = True)

hepg2 = dep_score[['HEPG2_LIVER']]
# hepg2 targets
targets = hepg2.loc[hepg2['HEPG2_LIVER']<-1].index

# there's that & that compose of two genes
essential_gene = []
for t in targets:
    if '&' in t:
        essential_gene.extend(t.split('&'))
        print(t)
    else:
        essential_gene.append(t)


U2AF1L5&U2AF1
GPR89B&GPR89A


In [2]:
essential_gene # those are symbols
# we don't need to filter for ENCODE because, because these are the transcripts bound! ENCODE entries are RBPs, and we are asking, is a RBP binding the "RNA" of essential genes

['DCAF13P3',
 'SMC4',
 'ATP6V1G2-DDX39B',
 'ALYREF',
 'SF3B4',
 'U2AF1L5',
 'U2AF1',
 'SF3A1',
 'RACK1',
 'PPIE',
 'TACC3',
 'NXF1',
 'CCT7',
 'CCT2',
 'SMC2',
 'TBL3',
 'USP39',
 'RUVBL2',
 'WDR3',
 'AFG3L2',
 'SF3B2',
 'RPL35',
 'U2AF2',
 'AP2M1',
 'CLNS1A',
 'CLTC',
 'NEDD1',
 'COPA',
 'CTNNB1',
 'CYP21A1P',
 'DDB1',
 'DHX8',
 'DHX15',
 'DYNC1I2',
 'DUSP9',
 'EEF2',
 'COPZ1',
 'PUF60',
 'SNRNP200',
 'BOP1',
 'SF3B1',
 'NR5A1',
 'XRCC6',
 'GAK',
 'GEMIN5',
 'PRPF31',
 'HTATSF1',
 'PRPF19',
 'NOP53',
 'HNRNPC',
 'HNRNPU',
 'HSPE1',
 'IK',
 'KIF11',
 'SBK1',
 'MCM6',
 'CRSP8P',
 'NACA',
 'NRAS',
 'ACO2',
 'PABPC3',
 'DDX47',
 'CDC40',
 'DDX41',
 'GPR89B',
 'GPR89A',
 'SF3B6',
 'ATP6V0C',
 'POLA1',
 'POLR2A',
 'POLR2B',
 'POLR2J',
 'INO80',
 'SMU1',
 'PPP6C',
 'RBM22',
 'PSMA3',
 'PSMA4',
 'PSMC1',
 'PSMC2',
 'PSMC3',
 'PSMC5',
 'PSMD2',
 'NAT14',
 'RBM25',
 'RAN',
 'ABCE1',
 'RPA1',
 'RPA2',
 'RPL7',
 'RPS16',
 'RPS18',
 'RPS21',
 'RPS27A',
 'RRM1',
 'S100A1',
 'SNRNP70',
 'SNRPB',
 'S

# Find the coordinate for essential genes

In [3]:
# load transcripts
# I will use my annotations here, the full and unparsed version!
transcript_coords = BedTool('/home/hsher/gencode_coords/gencode.v33.transcript.gff3')

In [4]:
len(transcript_coords)

60134

In [5]:
print(transcript_coords[0]) # where does the "gene symbol appear?"

chr1	HAVANA	transcript	11869	14409	.	+	.	ID=ENST00000456328.2;Parent=ENSG00000223972.5;gene_id=ENSG00000223972.5;transcript_id=ENST00000456328.2;gene_type=transcribed_unprocessed_pseudogene;gene_name=DDX11L1;transcript_type=processed_transcript;transcript_name=DDX11L1-202;level=2;transcript_support_level=1;hgnc_id=HGNC:37102;tag=basic;havana_gene=OTTHUMG00000000961.2;havana_transcript=OTTHUMT00000362751.1



In [6]:
transcript_coords[0].attrs['transcript_name']

'DDX11L1-202'

bedtool has this filter function that is really easy to use
https://daler.github.io/pybedtools/filtering.html

also check out lambda function in python

In [7]:
# so we will be filtering based on 'transcript_name'
essential_gene_coords = transcript_coords.filter(lambda t: t.attrs['transcript_name'] in essential_gene).saveas()

In [8]:
len(essential_gene_coords) # no hit, what's going on?

0

In [9]:
[t.attrs['transcript_name'] for t in transcript_coords if 'ALYREF' in t.attrs['transcript_name'] ]

['ALYREF-202']

In [10]:
[t.attrs['transcript_name'] for t in transcript_coords if 'RPRF4' in t.attrs['transcript_name'] ]

[]

In [11]:
[t.attrs['transcript_name'] for t in transcript_coords if 'SF3A1' in t.attrs['transcript_name'] ]
# seems that they are named differently..... what can we do?

['SF3A1-201']

In [12]:
# convert gene symbol to ensembl transcript IDs.
# look at the gene symbol conversion notebooks
import mygene
mg = mygene.MyGeneInfo()
converted = mg.querymany(essential_gene, scopes='symbol',
    fields='ensembl.gene,entrezgene', as_dataframe=True, species = 9606)

querying 1-141...done.
Finished.
6 input query terms found dup hits:
	[('DCAF13P3', 2), ('ATP6V1G2-DDX39B', 2), ('CYP21A1P', 5), ('BOP1', 2), ('UTP4', 2), ('RUVBL1', 2)]
2 input query terms found no hit:
	['VARS', 'HIST1H2BO']
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


  df = json_normalize(obj)


In [13]:
converted.head() # let's use essential_gene as the "Parent" in attrs

Unnamed: 0_level_0,_id,_score,ensembl.gene,entrezgene,ensembl,notfound
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DCAF13P3,ENSG00000259378,38.68567,ENSG00000259378,,,
DCAF13P3,100132724,38.68567,,100132724.0,,
SMC4,10051,86.75188,ENSG00000113810,10051.0,,
ATP6V1G2-DDX39B,100532737,113.925354,,100532737.0,,
ATP6V1G2-DDX39B,ENSG00000254870,112.542305,ENSG00000254870,,,


In [14]:
essential_gene_ids = converted['ensembl.gene'].dropna().tolist()
essential_gene_coords = transcript_coords.filter(lambda x: x.attrs['Parent'].split('.')[0] in essential_gene_ids).saveas()

In [15]:
len(essential_gene_ids)

137

In [16]:
len(essential_gene_coords) # now we have where the essential gene RNA lies.

128

# Then we can look at some IDR genes of essential RBP
- first see what we have with encode data

In [17]:
# with encode data
encode4 = pd.read_csv('/home/hsher/projects/ClipNet/archishma/ENCODE4_1214.csv')
encode3 = pd.read_csv('/home/hsher/projects/RBP_annot/ENCODE_FINAL_ANNOTATIONS.uidsonly.txt.manifesthg38.txt', sep = '\t', header= 0)

In [18]:
encode4.loc[encode4['RBP'].isin(essential_gene)]

Unnamed: 0.1,Unnamed: 0,uid,Batch,RBP,prefix,bam_0,bam_1,bam_control,plus_0,plus_1,plus_control,minus_0,minus_1,minus_control,bed_0,bed_1,idr,Cell Line
17,17,4037,batch11a,EEF2,encode4_batch11a.4037,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,K562
22,22,4050,batch11b,DDX47,encode4_batch11b.4050,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,K562
29,29,4048,batch12,SF3A1,encode4_batch12.4048,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,K562
43,43,4107,batch17,EEF2,encode4_batch17.4107,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,HepG2
48,48,4117,batch17,SF3A1,encode4_batch17.4117,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,HepG2


In [19]:
encode3.loc[encode3['RBP'].isin(essential_gene)]

Unnamed: 0,uID,RBP,Cell line,CLIP_rep1,CLIP_rep2,INPUT
0,203,HNRNPC,HepG2,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...
15,228,SF3B4,HepG2,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...
21,242,U2AF2,K562,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...
23,244,U2AF1,K562,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...
26,249,HNRNPU,K562,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...
32,272,U2AF2,HepG2,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...
36,281,HNRNPU,HepG2,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...
37,282,U2AF1,HepG2,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...
47,311,XRCC6,HepG2,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...
54,331,RBM22,K562,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...,/projects/ps-yeolab3/encode/analysis/encode_GR...


In [20]:
# let's demo using RBM22
uid = encode3.loc[(encode3['RBP']=='RBM22')&(encode3['Cell line']=='HepG2'), 'uID'].iloc[0]

In [21]:
# encode 3 idr peaks are here
idr_path = '/home/hsher/seqdata/eclip_bed/sorted/{}.01v02.IDR.out.0102merged.bed.blacklist_removed.bed.narrowPeak.bed'.format(uid)

In [22]:
idr = BedTool(idr_path) # ok now we have the binding regions ready

# Our question:
does RBP (ex: RBM22) regulates tumor-essential genes more likely than other genes

we need find these 4 groups:
1. tumor essential AND bound by RBM22
2. tumor essential AND NOT bound by RBM22
3. NOT tumor essential AND bound by RBM22
4. NOT tumor essential AND NOT bound by RBM22
Then run fisher exact test.

we will use bedtool intersect to figure out these 4 groups. 
## Building table

In [23]:
group_1 = essential_gene_coords.intersect(idr, s = True, u = True).saveas() #tumor essential AND bound by SF3A1
group_2 = essential_gene_coords.intersect(idr, s = True, v = True).saveas() #tumor essential AND NOT bound by SF3A1

In [24]:
# now we need to find transcripts that is not tumor essential genes, to prepare for group 3 and 4
nonessential_gene_coords = transcript_coords.intersect(essential_gene_coords, s = True, v = True).saveas()

In [25]:
# then
group_3 = nonessential_gene_coords.intersect(idr, s = True, u = True).saveas() #NOT tumor essential AND bound by SF3A1
group_4 = nonessential_gene_coords.intersect(idr, s = True, v = True).saveas() #NOT tumor essential AND NOT bound by SF3A1 Then run fisher exact test.

In [29]:
# okay let's count how many genes there are for our table
table_for_stat = pd.DataFrame([
                                [len(group_1), len(group_2)],
                                [len(group_3), len(group_4)]
                                ], 
                              columns = ['bound by RBM22', 'not bound'],
                              index = ['is essential', 'not essential']
                             )

In [30]:
table_for_stat

Unnamed: 0,bound by RBM22,not bound
is essential,26,102
not essential,1244,58719


# With the table, we are ready for stats
- check out this documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html
- and odds ratio: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2938757/


In [31]:
from scipy.stats import fisher_exact

odds, p = fisher_exact(table_for_stat, alternative = 'two-sided') 

In [32]:
odds

12.031823340268584

In [34]:
(26/1244)/(102/58719) # this is how odds ratio is calculated

12.031823340268582

In [33]:
p # pretty significant

2.793388556985412e-18

In [35]:
# so here we can conclude that tumor essential genes are regulated by RBM22 (have IDR peaks) more often than expected