In [1]:
import pandas as pd
from pybedtools import BedTool

### Information about BED Files (genome.ucsc.edu)

The BED (Browser Extensible Data) format is a text file format used to store genomic regions as coordinates and associated annotations. The data are presented in the form of columns separated by spaces or tabs. (Wikipedia)

The first three required BED fields are:

- **chrom** - The name of the chromosome (e.g. chr3, chrY, chr2_random) or scaffold (e.g. scaffold10671).
- **chromStart** - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
- **chromEnd** - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature, however, the number in position format will be represented. For example, the first 100 bases of chromosome 1 are defined as chrom=1, chromStart=0, chromEnd=100, and span the bases numbered 0-99 in our software (not 0-100), but will represent the position notation chr1:1-100.


In [3]:
rbp = pd.read_excel('~/projects/ClipNet/archishma/mmc2.xlsx', sheet_name = 'CandidateRBDpep')
rbp.head()

Unnamed: 0,ENSG,ProtID,Symbol,MS-identified peptide,Start,Stop,category,Uniqueness,domain,enzyme,LysC/ArgC proteolytic fragment,fragmentStart,fragmentStop
0,ENSG00000132541,P52758,HRSP12,AAGCDFTNVVK,68.0,78.0,CandidateRBDpep,UniqueGene,other,LysC,AAGCDFTNVVK,68.0,78.0
1,ENSG00000112651,Q5T653,MRPL2,AAGTCGVLLR,209.0,218.0,CandidateRBDpep,UniqueGene,other,LysC,RWIIATENMQAGDTILNSNHIGRMAVAAREGDAHPLGALPVGTLIN...,149.0,219.0
2,ENSG00000114867,Q04637,EIF4G1,AALSEEELEKK,1235.0,1245.0,CandidateRBDpep,UniqueGene,other,LysC,AALSEEELEKKSKAIIEEYLHLNDMK,1235.0,1260.0
3,ENSG00000167112,O95900,TRUB2,AATPQVAAELEK,285.0,296.0,CandidateRBDpep,UniqueGene,other,LysC,TTAVCTQVRRTRDGFFTLDSALLRTQWDLTNIQDAIRAATPQVAAELEK,248.0,296.0
4,ENSG00000040341,Q9NUL3,STAU2,AATTVLQELK,264.0,273.0,CandidateRBDpep,UniqueGene,classical,LysC,KLSKKRAATTVLQELK,258.0,273.0


- genes of interest: EIF3I, SF3A1
- transcript_coords has information about transcripts
- region_anno has information about regions, localization, etc 

In [4]:
transcript_coords = BedTool('/home/hsher/gencode_coords/gencode.v33.transcript.gff3')
region_anno = BedTool('/home/hsher/gencode_coords/gencode.v33.combine.sorted.gff3')

In [5]:
from collections import Counter

In [6]:
encode3 = pd.read_csv('/home/hsher/projects/RBP_annot/ENCODE_FINAL_ANNOTATIONS.uidsonly.txt.manifesthg38.txt', sep = '\t', header= 0)
encode4 = pd.read_csv('/home/hsher/projects/ClipNet/archishma/ENCODE4_1214.csv')

In [7]:
encode3.loc[encode3['RBP'].str.contains('SF3A1')]

Unnamed: 0,uID,RBP,Cell line,CLIP_rep1,CLIP_rep2,INPUT


In [8]:
encode4.loc[encode4['RBP'].str.contains('SF3A1')]

Unnamed: 0.1,Unnamed: 0,uid,Batch,RBP,prefix,bam_0,bam_1,bam_control,plus_0,plus_1,plus_control,minus_0,minus_1,minus_control,bed_0,bed_1,idr,Cell Line
29,29,4048,batch12,SF3A1,encode4_batch12.4048,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,K562
48,48,4117,batch17,SF3A1,encode4_batch17.4117,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,/projects/ps-yeolab5/encore/processing/encore_...,HepG2


In [9]:
# idr has the path to the files
# look at the files - which region, type of transcript 
bed_SF3A1_4048 = encode4.loc[29,'idr']
bed_SF3A1_4117 = encode4.loc[48,'idr']

### Information about SF3A1 (UID 4048):
- Batch 12
- Cell Line K562

In [10]:
SF3A1_idr_4048 = BedTool(bed_SF3A1_4048)

In [28]:
SF3A1_idr_4048.head()

chrX	54561037	54561072	14.91276321091	5.47432261299678	+
 chrX	54561012	54561037	4.25258368604036	4.8528943510487	+
 chr8	38347553	38347555	11.9290630586538	5.74959512094666	-
 chr8	38347555	38347654	15.9100115685067	5.58678383172411	-
 chr8	36764485	36764496	14.9860720851438	5.26756004952068	+
 chr11	61799919	61799960	3.28840161983509	3.26393033899278	-
 chr1	10459933	10459965	8.73529170854046	4.97852559094349	-
 chr1	10459965	10460008	7.02086426445287	4.73611298950637	-
 chr1	10459818	10459890	4.4048029271012	4.55347330618502	-
 chr1	10459891	10459933	3.47982195640752	4.33738090323839	-
 

In [11]:
# counter for how many chromosomes SF3A1_4048 appears on
Counter([t.chrom for t in SF3A1_idr_4048])

Counter({'chrX': 16,
         'chr8': 34,
         'chr11': 28,
         'chr1': 85,
         'chr3': 49,
         'chr18': 10,
         'chr10': 23,
         'chr9': 20,
         'chr7': 30,
         'chr17': 42,
         'chr19': 47,
         'chr4': 17,
         'chr16': 17,
         'chr5': 27,
         'chr14': 25,
         'chr20': 14,
         'chr6': 55,
         'chr2': 21,
         'chr12': 36,
         'chr21': 4,
         'chr13': 4,
         'chr22': 7,
         'chr15': 9,
         'chrM': 1})

In [33]:
# once again, using transcript_coords, can intersect to find hits 
transcriptsbound_SF3A1_4048 = transcript_coords.intersect(SF3A1_idr_4048, s= True, u = True).saveas()

In [34]:
len(transcriptsbound_SF3A1_4048)

303

In [35]:
print(transcriptsbound_SF3A1_4048[0]) # so each of these is a transcript, has attributes (ie protein coding, etc)

chr1	ENSEMBL	transcript	1785285	1891117	.	-	.	ID=ENST00000610897.4;Parent=ENSG00000078369.18;gene_id=ENSG00000078369.18;transcript_id=ENST00000610897.4;gene_type=protein_coding;gene_name=GNB1;transcript_type=protein_coding;transcript_name=GNB1-209;level=3;protein_id=ENSP00000481878.1;transcript_support_level=5;hgnc_id=HGNC:4396;tag=basic,appris_principal_1,CCDS;ccdsid=CCDS34.1;havana_gene=OTTHUMG00000000940.9



In [36]:
#since counter is already imported-- use it to see how many there are of each type of transcript. 
Counter([a.attrs['transcript_type'] for a in transcriptsbound_SF3A1_4048])

Counter({'protein_coding': 248,
         'lncRNA': 14,
         'snRNA': 1,
         'rRNA_pseudogene': 22,
         'nonsense_mediated_decay': 8,
         'snoRNA': 5,
         'rRNA': 1,
         'processed_transcript': 1,
         'scaRNA': 1,
         'processed_pseudogene': 1,
         'Mt_rRNA': 1})

In [37]:
# which regions is it bound?
regionsbound_SF3A1_4048 = region_anno.intersect(SF3A1_idr_4048, s = True, u = True).saveas()

In [49]:
# just an example with the first one-- the "index 2" shows the region it's bound in
print(regionsbound_SF3A1_4048[0])

chr1	ENSEMBL	exon	1825397	1825499	.	-	.	ID=exon:ENST00000610897.4:2;Parent=ENST00000610897.4;gene_id=ENSG00000078369.18;transcript_id=ENST00000610897.4;gene_type=protein_coding;gene_name=GNB1;transcript_type=protein_coding;transcript_name=GNB1-209;exon_number=2;exon_id=ENSE00003268669.1;level=3;protein_id=ENSP00000481878.1;transcript_support_level=5;hgnc_id=HGNC:4396;tag=basic,appris_principal_1,CCDS;ccdsid=CCDS34.1;havana_gene=OTTHUMG00000000940.9



In [50]:
regionsbound_SF3A1_4048[0][2]

'exon'

In [51]:
# count all the regions it's bound in
Counter([r[2] for r in regionsbound_SF3A1_4048]) 

Counter({'exon': 302,
         'five_prime_UTR': 23,
         'transcript': 95,
         'CDS': 199,
         'three_prime_UTR': 54})

### Information about SF3A1 (UID 4117):
- Batch 17
- Cell Line: HepG2

In [52]:
SF3A1_idr_4117 = BedTool(bed_SF3A1_4117)

In [53]:
SF3A1_idr_4117.head()

chr17	81125900	81125909	54.3957434825502	7.67249447792878	-
 chr17	81125885	81125892	54.3957434825502	7.67249447792878	-
 chr17	64500851	64500891	9.85732120492153	3.57043658762762	-
 chr17	64500826	64500851	13.1033966304457	3.51305172634691	-
 chr17	64500807	64500826	10.4347899293002	3.27655941500224	-
 chr10	114174150	114174173	3.50025629786793	4.05989075945703	-
 chr21	44524048	44524240	19.5968297008866	5.52065735146499	+
 chr17	82239311	82239411	10.0105412829474	3.66502069254965	+
 chr19	16575920	16575949	4.44667366365153	3.87682681005808	-
 chr17	82671271	82671341	7.47674531845525	4.312602627325	-
 

In [47]:
print(SF3A1_idr_4117[0])

chr17	81125900	81125909	54.3957434825502	7.67249447792878	-



In [55]:
# attributes of the very first item in the "array"
print(SF3A1_idr_4117[0].chrom)
print(SF3A1_idr_4117[0].start)
print(SF3A1_idr_4117[0].end)

chr17
81125900
81125909


In [56]:
# counting all the chromosomes it's on
Counter([t.chrom for t in SF3A1_idr_4048])

Counter({'chrX': 16,
         'chr8': 34,
         'chr11': 28,
         'chr1': 85,
         'chr3': 49,
         'chr18': 10,
         'chr10': 23,
         'chr9': 20,
         'chr7': 30,
         'chr17': 42,
         'chr19': 47,
         'chr4': 17,
         'chr16': 17,
         'chr5': 27,
         'chr14': 25,
         'chr20': 14,
         'chr6': 55,
         'chr2': 21,
         'chr12': 36,
         'chr21': 4,
         'chr13': 4,
         'chr22': 7,
         'chr15': 9,
         'chrM': 1})

In [57]:
# using transcript_coords, can intersect to find hits 
transcriptsbound_SF3A1_4117 = transcript_coords.intersect(SF3A1_idr_4117, s= True, u = True).saveas()

In [60]:
len(transcriptsbound_SF3A1_4117)

1103

In [58]:
print(transcriptsbound_SF3A1_4117[0]) # each of these is a transcript with features, etc

chr1	HAVANA	transcript	594308	827796	.	-	.	ID=ENST00000634337.2;Parent=ENSG00000230021.10;gene_id=ENSG00000230021.10;transcript_id=ENST00000634337.2;gene_type=transcribed_processed_pseudogene;gene_name=AL669831.3;transcript_type=processed_transcript;transcript_name=AL669831.3-208;level=2;transcript_support_level=5;tag=RNA_Seq_supported_only,basic;havana_gene=OTTHUMG00000191652.4;havana_transcript=OTTHUMT00000488647.3



In [59]:
# how many there are of each type of transcript
Counter([a.attrs['transcript_type'] for a in transcriptsbound_SF3A1_4117])

Counter({'processed_transcript': 4,
         'miRNA': 224,
         'protein_coding': 775,
         'lncRNA': 72,
         'snRNA': 3,
         'retained_intron': 3,
         'nonsense_mediated_decay': 13,
         'unprocessed_pseudogene': 1,
         'rRNA_pseudogene': 3,
         'snoRNA': 3,
         'misc_RNA': 1,
         'Mt_tRNA': 1})

In [61]:
# using region_anno to see where the regions are
regionsbound_SF3A1_4117 = region_anno.intersect(SF3A1_idr_4117, s = True, u = True).saveas()

In [64]:
print(regionsbound_SF3A1_4117[0]) # each of these has region information

chr1	HAVANA	transcript	819838	826205	.	-	.	ID=ENST00000634337.2;Parent=ENSG00000230021.10;gene_id=ENSG00000230021.10;transcript_id=ENST00000634337.2;gene_type=transcribed_processed_pseudogene;gene_name=AL669831.3;transcript_type=processed_transcript;transcript_name=AL669831.3-208;level=2;transcript_support_level=5;tag=RNA_Seq_supported_only,basic;havana_gene=OTTHUMG00000191652.4;havana_transcript=OTTHUMT00000488647.3



In [65]:
Counter([r[2] for r in regionsbound_SF3A1_4117]) 

Counter({'transcript': 382,
         'exon': 816,
         'CDS': 267,
         'five_prime_UTR': 56,
         'three_prime_UTR': 260})