# This notebook shows how to prepare custom genome coordinates for genome versions/species that are not currently supported, or to integrate information from outside databases

In [1]:
from pybedtools import BedTool
import pandas as pd
import os

In [2]:
# Wrap the Ensembl GTF annotation in a BedTool so it can be filtered and iterated below.
# NOTE: this is a hard-coded absolute path; point it at your own GTF.
gencode = BedTool('/projects/ps-yeolab3/bay001/annotations/ChlSab2/Chlorocebus_sabaeus.ChlSab1.1.102.gtf')

In [3]:
# these functions may need to be modified depending on the file format you are inputting
  
import pickle

def _split_terminal_features(features, name, strand):
    ''' Split the terminal intervals of features[name] into first_<name> / last_<name>.

    features: dict mapping feature name -> set of (start, end, ...) tuples; modified in place.
    name: key of the feature set to split, e.g. 'exon' or 'CDS'.
    strand: '+' means the leftmost interval is "first"; any other value is
            treated as the minus strand, where the rightmost interval is "first".
    '''
    first_key = 'first_' + name
    last_key = 'last_' + name

    # Re-absorb terminals split off by an earlier call so that running this
    # twice on the same dict (e.g. re-running a notebook cell) gives the same answer
    # instead of picking new "terminals" out of the leftover middle intervals.
    pool = set(features[name]) | set(features.get(first_key, ())) | set(features.get(last_key, ()))
    if not pool:
        # nothing to split; min()/max() on an empty collection would raise ValueError
        return

    min_start = min(e[0] for e in pool)
    max_end = max(e[1] for e in pool)
    # several intervals may tie (e.g. alternative transcripts sharing a boundary); keep them all
    leftmost = {e for e in pool if e[0] == min_start}
    rightmost = {e for e in pool if e[1] == max_end}

    if strand == '+':
        features[first_key] = leftmost
        features[last_key] = rightmost
    else:
        features[last_key] = leftmost
        features[first_key] = rightmost

    # what remains under the plain name are the internal intervals only
    features[name] = pool - features[first_key] - features[last_key]


def first_last_exon_cds(all_dict):
    ''' Find the first/last exon and first/last CDS of every gene, in place.

    all_dict: dict of gene id -> record; each record has a 'strand' entry and a
              'features' dict mapping feature name -> set of (start, end) tuples.

    For every record that has an 'exon' (resp. 'CDS') feature set, the intervals
    with the smallest start and the largest end are moved into
    'first_exon'/'last_exon' (resp. 'first_CDS'/'last_CDS'), oriented by strand,
    and removed from the original set. A single-interval feature therefore ends up
    in both first_* and last_* with an empty set left under the plain name.
    Records without the feature are left untouched. Returns None.
    '''
    for record in all_dict.values():
        features = record['features']
        for name in ('exon', 'CDS'):
            if name in features:
                _split_terminal_features(features, name, record['strand'])
        
def five_three_utr(all_dict):
    ''' Replace a generic 'UTR' feature set with 'five_prime_UTR' / 'three_prime_UTR', in place.

    all_dict: dict of gene id -> record; each record has a 'strand' entry and a
              'features' dict mapping feature name -> set of (start, end) tuples.

    Only records whose features contain a 'UTR' key are touched. The UTR
    interval(s) with the smallest start and those with the largest end are kept:
    on the '+' strand the former become five_prime_UTR and the latter
    three_prime_UTR; for any other strand value the roles are swapped.
    The 'UTR' key is then deleted, so UTR intervals that are neither leftmost
    nor rightmost are dropped, and any existing five_prime_UTR/three_prime_UTR
    entries are overwritten. Returns None.
    '''
    for record in all_dict.values():
        features = record['features']
        if 'UTR' not in features:
            continue

        utr_intervals = list(features['UTR'])
        lowest_start = min(interval[0] for interval in utr_intervals)
        highest_end = max(interval[1] for interval in utr_intervals)

        leftmost = {interval for interval in utr_intervals if interval[0] == lowest_start}
        rightmost = {interval for interval in utr_intervals if interval[1] == highest_end}

        # transcription runs left-to-right only on the plus strand
        on_plus = record['strand'] == '+'
        features['five_prime_UTR'] = leftmost if on_plus else rightmost
        features['three_prime_UTR'] = rightmost if on_plus else leftmost

        del features['UTR']


        
def build_transcript_dict(gencode = gencode, outdir = '/home/hsher/Metadensity/metadensity/data/ChlSab'):
    ''' Extract per-gene feature coordinates from a GTF annotation and pickle them.

    gencode: BedTool over the GTF annotation. Every 'gene' line must carry
             'gene_id' and 'gene_biotype' attributes, and must appear before the
             other features of that gene (otherwise the lookup raises KeyError).
    outdir:  directory that receives the pickle; created (with parents) if missing.

    Returns the dict that was written: gene_id -> record with keys
    'chrom' ('chr' + GTF chrom), 'start', 'end', 'strand', 'id', 'type', 'name'
    ('' when the gene has no gene_name attribute) and 'features', a dict of
    feature name -> set of (start, end) tuples. Features are pooled per gene,
    not per transcript.

    Side effect: writes the dict with pickle to <outdir>/gencode.
    '''

    # make a directory for every coordinate
    annotation_path = outdir
    # makedirs also creates missing parent directories and tolerates an existing one
    os.makedirs(annotation_path, exist_ok=True)

    all_dict = {}

    # introns = transcript span minus exons on the same strand (s = True)
    transcript = gencode.filter(lambda x: x[2] == 'transcript')
    exons = gencode.filter(lambda x: x[2] == 'exon')
    intron = transcript.subtract(exons, s = True).saveas()

    ############## YOU NEED TO MAKE SURE THE FEATURE NAMES ARE IN METAGENE ###########
    utr_renames = {'five_prime_utr': 'five_prime_UTR',
                   'three_prime_utr': 'three_prime_UTR'}

    print('extracting from gencode annotations')
    for g in gencode:
        feature_type = g[2]
        gene_id = g.attrs['gene_id']

        if feature_type == 'gene':
            # only a missing attribute should fall back to an empty name
            try:
                gene_name = g.attrs['gene_name']
            except KeyError:
                gene_name = ''

            # start a new record for this gene
            all_dict[gene_id] = {
                # 'chr' prefix: chromosome names NEED TO BE CONSISTENT WITH CLIP PIPELINE
                'chrom': 'chr' + g.chrom,
                'start': g.start,
                'end': g.end,
                'strand': g.strand,
                'id': gene_id,
                'type': g.attrs['gene_biotype'],
                'name': gene_name,
                'features': {},  # filled in by the non-gene lines below
            }
        else:
            feature_type = utr_renames.get(feature_type, feature_type)
            gene_features = all_dict[gene_id]['features']
            gene_features.setdefault(feature_type, set()).add((g.start, g.end))

    print('building intron')
    for i in intron:
        gene_features = all_dict[i.attrs['gene_id']]['features']
        gene_features.setdefault('intron', set()).add((i.start, i.end))

    print('building first last exon/cds')
    first_last_exon_cds(all_dict)
    print('building five/three prime UTR')
    five_three_utr(all_dict)
    print('writing to directory')
    with open(os.path.join(annotation_path, 'gencode'), 'wb') as f:
        pickle.dump(all_dict, f)
    return all_dict
            
            
            
        

In [4]:
# Build the annotation dictionary with the default GTF and output directory
# (this also writes the pickle to disk).
d = build_transcript_dict()

extracting from gencode annotations
building intron
building first last exon/cds
building five/three prime UTR
writing to directory


In [5]:
# Spot-check the record built for a single gene.
d['ENSCSAG00000017073']

{'chrom': 'chr8',
 'start': 183179,
 'end': 246703,
 'strand': '+',
 'id': 'ENSCSAG00000017073',
 'type': 'protein_coding',
 'name': 'FBXO25',
 'features': {'transcript': {(183179, 246703)},
  'exon': {(184546, 184671),
   (189275, 189416),
   (209060, 209164),
   (210598, 210648),
   (213557, 213650),
   (228044, 228138),
   (229299, 229484),
   (236307, 236490),
   (240928, 241072),
   (245629, 245656)},
  'CDS': {(209060, 209164),
   (210598, 210648),
   (213557, 213650),
   (228044, 228138),
   (229299, 229484),
   (236307, 236490),
   (240928, 241072),
   (245629, 245656)},
  'start_codon': {(189282, 189285)},
  'five_prime_UTR': {(183179, 183285), (184546, 184671), (189275, 189282)},
  'intron': {(183285, 184546),
   (184671, 189275),
   (189416, 209060),
   (209164, 210598),
   (210648, 213557),
   (213650, 228044),
   (228138, 229299),
   (229484, 236307),
   (236490, 240928),
   (241072, 245629),
   (245656, 246616)},
  'first_exon': {(183179, 183285)},
  'last_exon': {(246616

# You also need to edit the `metadensity.config` file

Add this:

```python
'ChlSab': {
    'gencode': 'Chlor.gtf',
    'transcript': 'Chlor.genes.gtf',
    'gencode_feature': 'Chlor.combined.sorted.gtf',
},
```

to `self.file_dict`.


These files are produced by `/home/hsher/project/covid-clip/notebook/parse_coord.sh`,
which just separates the gene records from the other features and adds `chr` to the front of every line.

code:
```
awk '{ if ($3 == "gene"){ print } }' /projects/ps-yeolab3/bay001/annotations/ChlSab2/Chlorocebus_sabaeus_Sars_cov_2.gtf > ~/gencode_coords/Chlor.genes.gtf

sed -i -e 's/^/chr/' ~/gencode_coords/Chlor.genes.gtf

awk '{ if ($3 != "gene" && $3 != "transcript"){ print } }' /projects/ps-yeolab3/bay001/annotations/ChlSab2/Chlorocebus_sabaeus_Sars_cov_2.gtf > ~/gencode_coords/Chlor.combined.sorted.gtf

sed -i -e 's/^/chr/' ~/gencode_coords/Chlor.combined.sorted.gtf
```

Then reinstall!