## Finding index and length of frameshift

08/04/2016: this code goes over the exon sequences I received from Shilpa, which correspond to the HMMER results.
In some cases, Shilpa noted a frameshift, meaning that one or two bases were missing at a particular location, and those need to be
added in order to get the correct translation to amino acids.
The code here finds all those cases and retrieves the filename along with the frameshift index and the number (1 or 2) of bases
that need to be added for the correct translation.

The code then saves the results as a pickled dictionary called "exons_index_length.pik",
and also as a readable table called "exons_index_length_table.csv".

In [1]:
import fileinput
import sys
import pickle
import pandas as pd
from collections import defaultdict

from IPython.core.display import HTML
# Build a CSS snippet that sets the notebook's ".container" element to full width.
# NOTE(review): the trailing ";" presumably suppresses IPython's display of this
# final expression, in which case the style is never rendered - confirm; an
# explicit display(HTML(...)) would apply it regardless.
HTML("<style>.container { width:100% !important; }</style>");

In [2]:
# IPython shell capture of `pwd`; element 0 of the captured output is used as the working directory
curr_dir = !pwd
# Root folder of the exon sequence files, built relative to the working directory
my_path = curr_dir[0]+"/from_shilpa/exons_seqs/"

# Getting all the chromosome names, including the ones with a patch name
# (captured from `ls` on the root folder, with $my_path expanded by IPython)
chromosome_names = !ls $my_path

In [3]:
%%time

frameshifts_dict = defaultdict(list)
for chrom in chromosome_names:
    chrom_dir = my_path+chrom+"/"
    chrom_files = !ls $chrom_dir
    for gene_dir in chrom_files:
        exons_files = !ls $chrom_dir$gene_dir
        for f in exons_files:
            curr_index_list = []
            curr_length_dict = {}
            curr_bps_dict = {}
            for line in fileinput.input(chrom_dir+"/"+gene_dir+"/"+f):
                
                #Getting exons data from the first line
                if (line.find("chromosome") >= 0):
                    chrom_raw_data = line[line.find("GRCh37"):line.find("length")-1]
                    #Removing the complement bracates if exist
                    if (chrom_raw_data.find("complement(") >= 0):
                        chrom_raw_data = chrom_raw_data[chrom_raw_data.find("complement(")+11:-1]
                        #Removing the join bracates if exist
                    if (chrom_raw_data.find("join(") >= 0):
                        chrom_raw_data = chrom_raw_data[chrom_raw_data.find("join(")+5:-1]
                    
                    #Saving indices of frameshifts, if exist
                    exons_list = chrom_raw_data.split(",")
                    for exon in exons_list:
                        if (exon[0] == "-"):
                            curr_index_list.append(int(exon[1:exon.find("..")]))
                
                #Getting the frameshift length from another line
               
                for idx in curr_index_list:
                    if (line.find("-"+str(idx)+":-"+str(idx)) == 0):
                        curr_length_dict[idx] = len(line.split("\t")[1][:-1])
                        curr_bps_dict[idx] = line.split("\t")[1][:-1]
        
            #After interating all the lines: saving index and length information
            for idx in curr_index_list:
                frameshifts_dict[f].append((idx, curr_length_dict[idx], curr_bps_dict[idx]))
            
            fileinput.close()
        
    print "Finished Chromosome "+chrom
    
with open(curr_dir[0]+"/domains_frameshifts/exons_index_length.pik", 'wb') as handle:
    pickle.dump(frameshifts_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

Finished Chromosome 1
Finished Chromosome 10
Finished Chromosome 11
Finished Chromosome 12
Finished Chromosome 13
Finished Chromosome 14
Finished Chromosome 15
Finished Chromosome 16
Finished Chromosome 17
Finished Chromosome 18
Finished Chromosome 19
Finished Chromosome 2
Finished Chromosome 20
Finished Chromosome 21
Finished Chromosome 22
Finished Chromosome 3
Finished Chromosome 4
Finished Chromosome 5
Finished Chromosome 6
Finished Chromosome 7
Finished Chromosome 8
Finished Chromosome 9
Finished Chromosome GL000191.1
Finished Chromosome GL000192.1
Finished Chromosome GL000193.1
Finished Chromosome GL000194.1
Finished Chromosome GL000195.1
Finished Chromosome GL000201.1
Finished Chromosome GL000204.1
Finished Chromosome GL000205.1
Finished Chromosome GL000209.1
Finished Chromosome GL000212.1
Finished Chromosome GL000213.1
Finished Chromosome GL000215.1
Finished Chromosome GL000218.1
Finished Chromosome GL000219.1
Finished Chromosome GL000221.1
Finished Chromosome GL000222.1
Finishe

In [4]:
# Display the collected mapping: filename -> list of (index, length, bases) tuples
frameshifts_dict

defaultdict(list,
            {'ENSP00000462331.1.exons.txt': [(431, 2, 'TC')],
             'ENSP00000469811.1.exons.txt': [(1, 2, 'NN')],
             'ENSP00000447167.1.exons.txt': [(1, 1, 'N')],
             'ENSP00000473364.1.exons.txt': [(1, 1, 'N')],
             'ENSP00000408902.1.exons.txt': [(1, 1, 'N')],
             'ENSP00000457018.1.exons.txt': [(78, 1, 'C')],
             'ENSP00000452125.1.exons.txt': [(1, 1, 'N')],
             'ENSP00000409925.1.exons.txt': [(1, 2, 'NN')],
             'ENSP00000453612.1.exons.txt': [(497, 2, 'CA')],
             'ENSP00000411574.1.exons.txt': [(1, 1, 'N')],
             'ENSP00000421447.1.exons.txt': [(587, 2, 'GT')],
             'ENSP00000472208.1.exons.txt': [(436, 3, 'CAG')],
             'ENSP00000433659.1.exons.txt': [(198, 1, 'G')],
             'ENSP00000476438.1.exons.txt': [(1, 2, 'NN')],
             'ENSP00000465855.1.exons.txt': [(1, 2, 'NN')],
             'ENSP00000388301.1.exons.txt': [(459, 1, 'G')],
             'EN