In [1]:
# 2018-07-16
# A. Pendleton
# Generating the per tissue bar chart tracks and
#    gene model tracks for the final gene models

In [2]:
#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import subprocess
import sys
import numpy as np
import matplotlib.patches as patches
import gzip
import fileinput
import glob
from scipy import stats
import re
from matplotlib_venn import venn3, venn3_circles
from collections import OrderedDict


def count_lines(f):
    """Return the number of lines in the text file located at path ``f``."""
    with open(f, 'r') as handle:
        # One tick per line yielded by the file iterator; an empty file gives 0.
        return sum(1 for _ in handle)
def runCMD(cmd):
    """Run ``cmd`` through the shell and wait for it to finish.

    On a non-zero exit status the command is reported on stdout and the
    interpreter is terminated via ``sys.exit(1)``; on success nothing is
    returned.
    """
    exit_status = subprocess.Popen(cmd, shell=True).wait()
    if exit_status != 0:
        print ('command failed')
        print (cmd)
        sys.exit(1)
# TO REMOVE TOP AND RIGHT AXIS OF PLOTS
def simpleaxis(ax):
    """Hide the top and right spines of ``ax`` and keep ticks bottom/left."""
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()

In [3]:
def get_sample_info(sampleInfo='/home/ampend/links/kidd-lab/ampend-projects/Novel_Sequence_Analysis/rna-seq/SampleInfoTable.txt'):
    """Read the RNA-Seq library table and group the library IDs by tissue.

    Parameters
    ----------
    sampleInfo : str, optional
        Path to a whitespace-delimited table. Field 5 (index 4) is taken as
        the library ID and field 6 (index 5) as the tissue name. Any line
        containing the text 'Bio' is treated as a header and skipped; blank
        lines are skipped as well.

    Returns
    -------
    sampleDict : dict
        ``{library ID: [tissue, short tissue code]}``.
    tissueDict : dict
        ``{tissue: [library IDs ...]}``.
    genomes : list
        The fixed list ``['zoey-2.3', 'canFam3.1-noY']``.

    Raises
    ------
    IndexError
        If a non-blank, non-header line has fewer than six fields.
    """
    # Ordered (substring, short code) pairs. Every pair is tested, so when
    # more than one substring occurs in a tissue name the LAST match wins.
    shortNames = [
        ('SmoothMuscle', 'SM'),
        ('Unidentified', 'NA'),
        ('Blood', 'BL'),
        ('Brain', 'BR'),
        ('Heart', 'HT'),
        ('Ovary', 'OV'),
        ('Skin', 'SK'),
        ('Kidney', 'KD'),
        ('Testis', 'TS'),
        ('Liver', 'LV'),
        ('Lung', 'LG'),
    ]
    sampleDict, tissueDict, samples = {}, {}, []

    # 'with' guarantees the table is closed even if a malformed row raises.
    with open(sampleInfo, 'r') as handle:
        for line in handle:
            if 'Bio' in line: #skips header
                continue
            fields = line.rstrip().split()
            if not fields: #skips blank lines (e.g. a trailing newline)
                continue
            ID = fields[4]
            tissue = fields[5]

            # Reset per row: a tissue with no known abbreviation keeps its
            # full name instead of inheriting the previous row's code (or
            # raising NameError when it is the first row).
            shortTissue = tissue
            for pattern, code in shortNames:
                if pattern in tissue:
                    shortTissue = code

            sampleDict[ID] = [tissue, shortTissue]
            samples.append(ID)
            tissueDict.setdefault(tissue, []).append(ID)

    print('%i samples added to a sample array from the following tissues:\n' % len(samples))

    genomes = ['zoey-2.3','canFam3.1-noY']

    for tissue in tissueDict:
        print(tissue)

    return sampleDict, tissueDict, genomes



## Reading in Library/Sample Information

In [4]:
# Load the library table, then flatten the dictionary keys into plain lists
sampleDict, tissueDict, genomes = get_sample_info()

samples = list(sampleDict.keys())
tissues = list(tissueDict.keys())

42 samples added to a sample array from the following tissues:

Unidentified
Skin
Kidney
Brain
Heart
Testis
Liver
Blood
SmoothMuscle
Ovary
Lung


## Reading in Kallisto Results

In [9]:
# Collect the TPM of every transcript in every tissue from the per-tissue
# abundance.tsv files, stored as tpmDict[shortID][tissue] = tpm.
# NOTE(review): a later cell re-creates tpmDict with a list-based layout,
# so this nested-dict version appears to be superseded -- confirm it is
# still needed.
tissueCount = 0

tpmDict = {}

kalDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/Kalisto/'
resultsDir = kalDir + 'results/'


for tissue in tissueDict.keys():
    print(tissue)

    inFile = resultsDir + tissue + '/' + 'abundance.tsv'
    # 'with' closes each abundance file as soon as it has been read instead
    # of leaving one open handle per tissue for the garbage collector.
    with open(inFile,'r') as abundance:
        for line in abundance:
            if 'target_id' in line: #skip header
                continue
            line=line.rstrip().split('\t')
            transcript = line[0]
            # IDs look like '<prefix>::<shortID>...'; keep the second field
            shortID = transcript.split('::')[1]
            tpm = float(line[4])

            #create the per-gene dictionary on first sight, then record the
            #TPM for this tissue (a repeated ID overwrites the earlier value)
            tpmDict.setdefault(shortID, {})[tissue] = tpm
    
   

Unidentified
Skin
Kidney
Brain
Heart
Testis
Liver
Blood
SmoothMuscle
Ovary
Lung


# MAKING UCSC TRACKS

## Set colors for the tracks per tissue type

In [11]:
# One rgb() colour string per tissue name, listed alphabetically
Blood = 'rgb(204, 51, 0)'
Brain = 'rgb(255, 255, 0)'
Heart = 'rgb(255, 0, 191)'
Kidney = 'rgb(191, 128, 64)'
Liver = 'rgb(255, 166, 77)'
Lung = 'rgb(0, 153, 51)'
Ovary = 'rgb(255, 179, 255)'
Skin = 'rgb(0, 102, 255)'
SmoothMuscle = 'rgb(153, 51, 255)'
Testis = 'rgb(179, 179, 204)'
Unidentified = 'rgb(0, 0, 0)'


# GET ZOEY2.3 COORDINATES OF EACH GENE FROM BED FILE

In [14]:
bedFile = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/NonRedundant_NoRMIntersect_FilteredGeneSet/TotalSet_NoRMSingleExons_AllMultiExons.bed'

# posDict[shortID] = [chrom, start, end, full gene name, orientation]
# Coordinates are kept as the strings read from the BED file.
posDict = {}

# 'with' makes sure the BED file is closed, including when the loop stops
# early on a duplicated gene model.
with open(bedFile,'r') as bed:
    for line in bed:
        line=line.rstrip().split('\t')
        gene = line[3]
        # names look like '<prefix>::<shortID>...'; keep the second field
        shortID = gene.split('::')[1]
        chrom,start,end = line[0],line[1],line[2]
        orient = str(line[5])

        if shortID in posDict:
            # A short ID must map to exactly one gene model; stop reading so
            # the duplicate can be inspected.
            print('Error -- redundant gene model -- Check')
            print(shortID)
            break
        posDict[shortID] = [chrom,start,end,gene,orient]
print('added coordinates in zoey2.3 to %i genes ' % len(posDict.keys()))


added coordinates in zoey2.3 to 24891 genes 


# GET TPMS PER GENE FROM KALLISTO ABUNDANCE TSV FILES

In [17]:
# Gather, for every gene, the list of TPMs across tissues (in the iteration
# order of tissueDict) plus the subset of those TPMs that exceed 1.0:
#   tpmDict[shortID][0] = TPMs from ALL tissues
#   tpmDict[shortID][1] = only the TPMs > 1.0
tissueCount = 0

tpmDict = {}

kalDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/Kalisto/'
resultsDir = kalDir + 'results/'

for tissue in tissueDict.keys():
    tissueCount += 1

    inFile = resultsDir + tissue + '/' + 'abundance.tsv'
    # 'with' closes each abundance file once it has been consumed rather
    # than leaving one open handle per tissue.
    with open(inFile,'r') as abundance:
        for line in abundance:
            if 'target_id' in line: #skip header
                continue
            line=line.rstrip().split('\t')
            shortID = line[0].split('::')[1]
            tpm = float(line[4])
            #Keys are created only while reading the first tissue, so an ID
            #that is absent from the first file raises KeyError below rather
            #than silently producing a shorter list.
            #NOTE(review): presumably every abundance.tsv lists the same
            #transcripts -- confirm.
            if tissueCount == 1:
                tpmDict[shortID] = [[],[]]
            tpmDict[shortID][0].append(tpm) #add TPM to the all-tissue list

            #If expressed at tpm > 1, also add to the expressed-only list
            if tpm > 1.0:
                tpmDict[shortID][1].append(tpm)
print('Added expression data from %i gene IDs' % len(tpmDict.keys()))
   

Added expression data from 24891 gene IDs


# MAKING UCSC TRACK

In [18]:
kalDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/Kalisto/'
trackDir = kalDir + 'UCSC_Tracks/'
resultsDir = kalDir + 'results/'

# Write one tab-separated row per gene:
#   chrom, start, end, gene, score, orientation, gene, number of TPM values,
#   comma-separated TPM values
# 'with' flushes and closes the track file even if a gene is missing from
# posDict (KeyError) part-way through, so no handle is left open.
with open(trackDir + 'ForUCSC_PerTissueExpression.barChart','w') as trackFile:
    for gene in tpmDict.keys():
        chrom,start,end,orient = posDict[gene][0],posDict[gene][1],posDict[gene][2],posDict[gene][4]
        tpms = ','.join(map(str, tpmDict[gene][0]))
        score = '999' #default
        trackFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%i\t%s\n' % (chrom,start,end,gene,score,orient,gene,len(tpmDict[gene][0]),tpms))
        #Example row from a barChart file, for comparison:
        #chr14 95086227 95158010 ENSG00000100697.10 999 - DICER1 5 10.94,11.60,8.00,6.69,4.89 93153 26789

# SORTING AND CONVERTING TO BB

In [23]:
trackfile = trackDir + 'ForUCSC_PerTissueExpression.barChart'
chromFile = trackDir + 'zoey.2.3.chrom.sizes'

# Shell steps, in execution order:
#   1. sort the track file into <trackfile>.sorted
#   2. convert the sorted file into <trackfile>.sorted.bb
pipeline = [
    'bedSort %s %s.sorted' % (trackfile,trackfile),
    'bedToBigBed -bedFields=4 %s.sorted %s %s.sorted.bb' % (trackfile, chromFile, trackfile),
]

# Echo each command before running it; runCMD aborts on a non-zero exit
for cmd in pipeline:
    print(cmd)
    runCMD(cmd)

bedSort /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/Kalisto/UCSC_Tracks/ForUCSC_PerTissueExpression.barChart /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/Kalisto/UCSC_Tracks/ForUCSC_PerTissueExpression.barChart.sorted
bedToBigBed -bedFields=4 /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/Kalisto/UCSC_Tracks/ForUCSC_PerTissueExpression.barChart.sorted /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/Kalisto/UCSC_Tracks/zoey.2.3.chrom.sizes /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/Kalisto/UCSC_Tracks/ForUCSC_PerTissueExpression.barChart.sorted.bb


In [24]:
#DONE