In [74]:
# 2018-02-08
# A. Pendleton
# Calculating RNA-Seq stats

In [75]:
#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import subprocess
import sys
import numpy as np
import matplotlib.patches as patches
import gzip
import fileinput
import glob
from scipy import stats
import re
from matplotlib_venn import venn3, venn3_circles

def count_lines(f):
    lineCount = 0
    with open(f, 'r') as f:
        for line in f:
            lineCount += 1
        return lineCount
def runCMD(cmd):
    val = subprocess.Popen(cmd, shell=True).wait()
    if val == 0:
        pass
    else:
        print ('command failed')
        print (cmd)
        sys.exit(1)
def write_pbs_lines(scriptsDir,sample,cmdFile,wkDir,name,mem):
    #WRITE PBS SCRIPT
    cmdFile.write('#!/bin/bash\n')
    cmdFile.write('#PBS -S /bin/bash\n')
    cmdFile.write('#PBS -V\n')
    cmdFile.write('#PBS -M ampend@med.umich.edu\n')
    cmdFile.write('#PBS -j oe\n')
    cmdFile.write('#PBS -N %s_%s\n' % (sample,name))
    cmdFile.write('#PBS -o /home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/logs/\n')
    cmdFile.write('#PBS -l pmem=%sG\n' % mem)
    cmdFile.write('#PBS -l nodes=1:ppn=1,qos=flux,walltime=100:00:00\n')
    cmdFile.write('##PBS -A medbsm_flux\n')
    cmdFile.write('##PBS -q flux\n')	
    cmdFile.write('#PBS -A jmkidd_fluxod\n')
    cmdFile.write('#PBS -q fluxod\n')
    cmdFile.write('cd %s\n' % wkDir)
    cmdFile.write('\n\n')
def write_pbs_task_lines(scriptsDir,sample,pbsFile,wkDir,name,mem,taskCount,cmdfile):
    #WRITE PBS SCRIPT
    pbsFile.write('#!/bin/bash\n')
    pbsFile.write('#PBS -S /bin/bash\n')
    pbsFile.write('#PBS -V\n')
    pbsFile.write('#PBS -M ampend@med.umich.edu\n')
    pbsFile.write('#PBS -j oe\n')
    pbsFile.write('#PBS -N %s_%s\n' % (sample,name))
    pbsFile.write('#PBS -o /home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/logs/\n')
    pbsFile.write('#PBS -l pmem=%sG\n' % mem)
    pbsFile.write('#PBS -l nodes=1:ppn=1,qos=flux,walltime=100:00:00\n')
    pbsFile.write('##PBS -A medbsm_flux\n')
    pbsFile.write('##PBS -q flux\n')
    pbsFile.write('#PBS -A jmkidd_fluxod\n')
    pbsFile.write('#PBS -q fluxod\n')
    pbsFile.write('#PBS -t 1-%i\n' % (taskCount))
    
    pbsFile.write('cd %s\n' % wkDir)
    pbsFile.write('/home/ampend/links/kidd-lab/jmkidd-projects/scripts/perlUtils/run-by-id-log.pl %s %s $PBS_ARRAYID' % (cmdfile,cmdfile.replace('/scripts/','/logs/')))
    pbsFile.write('\n\n')

### What samples and genomes are you processing?

In [76]:
#Reading in the RNA-Seq library IDs
sampleInfo = '/home/ampend/links/kidd-lab/ampend-projects/Novel_Sequence_Analysis/rna-seq/SampleInfoTable.txt'
samples = []
sampleDict = {}

for line in open(sampleInfo, 'r'):
    if 'Bio' in line: #skips header
        continue
    line=line.rstrip().split()
    ID = line[4]
    tissue = line[5]
    samples.append(ID)
    if tissue not in sampleDict.keys():
        sampleDict[tissue] = []
    sampleDict[tissue].append(ID)
    
print('%i samples added to a sample array from the following tissues:\n' % len(samples))

genomes = ['zoey-2.3','canFam3.1-noY']

for tissue in sampleDict.keys():
    print(tissue)


42 samples added to a sample array from the following tissues:

Blood
Skin
Ovary
Kidney
Liver
SmoothMuscle
Testis
Lung
Unidentified
Brain
Heart


In [4]:
scriptsDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/scripts/'

### Run fastqc on the raw FASTQ files

In [59]:
fqDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/RawFASTQ/'
fqcDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/fastqc/fastqc_initial/'

for sample in samples:
    cmdFile = open(scriptsDir + 'fastqc/initial/' + '%s_initial_fastqc_commands.cmds' % (sample),'w')
    mem='6'
    write_pbs_lines(scriptsDir,sample,cmdFile,fqDir,'fastqc_initial',mem)

    #READ ONE
    cmd = 'gunzip %s%s_1.fastq.gz' % (fqDir,sample)
    cmdFile.write(cmd + '\n')    
    cmd = 'fastqc %s%s_1.fastq -o %s' % (fqDir,sample,fqcDir)
    cmdFile.write(cmd + '\n')
    cmd = 'gzip %s%s_1.fastq' % (fqDir,sample)
    cmdFile.write(cmd + '\n')
    #READ TWO
    cmd = 'gunzip %s%s_2.fastq.gz' % (fqDir,sample)
    cmdFile.write(cmd + '\n')
    cmd = 'fastqc %s%s_2.fastq -o %s' % (fqDir,sample,fqcDir)
    cmdFile.write(cmd + '\n')
    cmd = 'gzip %s%s_2.fastq' % (fqDir,sample)
    cmdFile.write(cmd + '\n')
    
    cmdFile.close()    
    
    

### Run Rcorrector on the raw FASTQ files

In [70]:
fqDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/RawFASTQ/'

cmdFile = open(scriptsDir + 'Rcorrector_commands.cmds','w')
mem='10'
write_pbs_lines(scriptsDir,'RCorrector',cmdFile,fqDir,'total',mem)

for sample in samples:
    cmd = 'perl /home/ampend/links/kidd-lab-scratch/ampend-projects/rCorrector/Rcorrector/run_rcorrector.pl -1 %s%s_1.fastq.gz -2 %s%s_2.fastq.gz' % (fqDir,sample,fqDir,sample)
    cmdFile.write(cmd + '\n')    
    
cmdFile.close()    
    

##### Check that they all finished

In [30]:
Dir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/Rcorrected_FASTQ/'
done = []

for file in glob.glob(Dir + '*cor.fq.gz'):
    sample = file.replace(Dir,'').split('.')[0]
    done.append(sample)

pairs = [1,2]

for pair in pairs:
    for sample in samples:
        f = sample + '_' + str(pair)
        if f in done:
            continue
        else:
            print(f)

SRR536884_2


### Fix Rcorrector FASTQ outputs with custom script

In [77]:
fqDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/Rcorrected_FASTQ/'

#write PBS
cmdfile = scriptsDir + 'FixRcorrFastq_commands.pbs'
cmdFile = open(scriptsDir + 'FixRcorrFastq_commands.pbs','w')
mem='10'
write_pbs_lines(scriptsDir,'FixRCorrector',cmdFile,fqDir,'total',mem)
cmdFile.write('/home/jmkidd/links/kidd-lab/jmkidd-projects/scripts/perlUtils/run-by-id-log.pl %s %s $PBS_ARRAYID' % (cmdfile.replace('pbs','cmds'),cmdfile.replace('.pbs','.logs').replace('/scripts/','/logs/')))
cmdFile.close()

#write commands file
cmdFile = open(scriptsDir + 'FixRcorrFastq_commands.cmds','w')
for sample in samples:
    cmd = 'python /home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/scripts/filterUncorrectabledPEfastq.py -1 %s%s_1.cor.fq.gz -2 %s%s_2.cor.fq.gz -o fixed 2>&1 -s %s' % (fqDir,sample,fqDir,sample,sample)
    cmdFile.write(cmd + '\n')     
cmdFile.close()    
    

### Run fastqc on Fixed Rcorrector FASTQ outputs

In [7]:
fqDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/Fixed_Rcorrected_FASTQ/'
fqcDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/fastqc/Fixed_RcorrectedFASTQ_fastqc/'

for sample in samples:
    cmdFile = open(scriptsDir + 'fastqc/Fixed_Rcorrected/' + '%s_Rcorrected_fastqc_commands.cmds' % (sample),'w')
    mem='6'
    write_pbs_lines(scriptsDir,sample,cmdFile,fqDir,'fastqc_FixRcorr',mem)

    #READ ONE
    cmd = 'gunzip %sfixed_%s_1.cor.fq.gz' % (fqDir,sample)
    cmdFile.write(cmd + '\n')    
    cmd = 'fastqc %sfixed_%s_1.cor.fq -o %s' % (fqDir,sample,fqcDir)
    cmdFile.write(cmd + '\n')
    cmd = 'gzip %sfixed_%s_1.cor.fq' % (fqDir,sample)
    cmdFile.write(cmd + '\n')
    #READ TWO
    cmd = 'gunzip %sfixed_%s_2.cor.fq.gz' % (fqDir,sample)
    cmdFile.write(cmd + '\n')    
    cmd = 'fastqc %sfixed_%s_2.cor.fq -o %s' % (fqDir,sample,fqcDir)
    cmdFile.write(cmd + '\n')
    cmd = 'gzip %sfixed_%s_2.cor.fq' % (fqDir,sample)
    cmdFile.write(cmd + '\n')
    
    cmdFile.close()    
    
    

### Trimmomatic clean ups based on the fastqc run above

Example commands:
module load trimmomatic

java -jar ${TRIMM_JAR}/trimmomatic-0.36.jar \
PE SRR388741_1.fastq.gz SRR388741_2.fastq.gz -baseout SRR388741_sorted \
LEADING:10 SLIDINGWINDOW:4:20 MINLEN:50


In [23]:
#Write PBS file
cmdfile = scriptsDir + 'Trimmomatic_commands.pbs'
cmdFile = open(cmdfile,'w')
mem='8'
taskCount = 42
write_pbs_task_lines(scriptsDir,'total',cmdFile,fqDir,'trimmomatic',mem,taskCount)
cmdFile.write('module load trimmomatic\n')
cmdFile.write('/home/jmkidd/links/kidd-lab/jmkidd-projects/scripts/perlUtils/run-by-id-log.pl %s %s $PBS_ARRAYID\n' % (cmdfile.replace('pbs','cmds'),cmdfile.replace('.pbs','.logs').replace('/scripts/','/logs/')))
cmdFile.close()


##########################################################################
#Write commands file
fqDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/Fixed_Rcorrected_FASTQ/'
fqOutDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/Trimmomatic_Fixed_Rcorrected_FASTQ/'

cmdFile = open(scriptsDir + 'Trimmomatic_commands.cmds','w')
for sample in samples:
    cmd = 'java -jar ${TRIMM_JAR}/trimmomatic-0.36.jar PE %sfixed_%s_1.cor.fq.gz %sfixed_%s_2.cor.fq.gz -baseout %sfixed_%s_cor_trim LEADING:10 SLIDINGWINDOW:4:20 MINLEN:50' % (fqDir,sample,fqDir,sample,fqOutDir,sample)
    cmdFile.write('%s\n' % cmd)
cmdFile.close()



### Append /1 and /2 to the fixed Rcorrected reads

In [28]:
fastqInDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/Trimmomatic_Fixed_Rcorrected_FASTQ/'
fastqOutDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/Fastool_Trimmomatic_Fixed_Rcorrected_FASTQ/'

for sample in samples:
    cmdFile = open(scriptsDir + 'fastool/' + '%s_fastool_commands.cmds' % sample, 'w')
    mem = '10'
    write_pbs_lines(scriptsDir,sample,cmdFile,fqDir,'fastool_TrimFixRcorr',mem)

    #load module
    cmd = 'module load fastool'
    cmdFile.write('%s\n' % cmd)

    ###FASTOOL 
    ###Writing commands to unzip the starting FASTQ file
    cmd = 'gunzip %sfixed_%s_cor_trim_1P.gz' % (fastqInDir,sample)
    cmdFile.write('%s\n' % cmd)
    cmd = 'gunzip %sfixed_%s_cor_trim_2P.gz' % (fastqInDir,sample)
    cmdFile.write('%s\n' % cmd)

    ###Writing commands to convert the FASTQ files into proper format
    cmd = 'fastool --append /1 %sfixed_%s_cor_trim_1P > %s%s_Rcorrected_filtered_formatted_1P.fq' % (fastqInDir,sample,fastqOutDir,sample)
    cmdFile.write('%s\n' % cmd)
    cmd = 'fastool --append /2 %sfixed_%s_cor_trim_2P > %s%s_Rcorrected_filtered_formatted_2P.fq' % (fastqInDir,sample,fastqOutDir,sample)
    cmdFile.write('%s\n' % cmd)

    #Commands to change the '>' symbol that fastool adds into a '@' symbol for the first line
    cmd = 'sed -i \'s/>%s/@%s/g\' %s%s_Rcorrected_filtered_formatted_1P.fq' % (sample,sample,fastqOutDir,sample)
    cmdFile.write('%s\n' % cmd)
    cmd = 'sed -i \'s/>%s/@%s/g\' %s%s_Rcorrected_filtered_formatted_2P.fq' % (sample,sample,fastqOutDir,sample)
    cmdFile.write('%s\n' % cmd)

    #Writing commands to re-zip the starting FASTQ file
    cmd = 'gzip %s%s_Rcorrected_filtered_1P*' % (fastqInDir,sample)
    cmdFile.write('%s\n' % cmd)
    cmd = 'gzip %s%s_Rcorrected_filtered_2P*' % (fastqInDir,sample)
    cmdFile.write('%s\n' % cmd)
    
    cmdFile.close()


In [None]:
### Run FASTQC on the trimmomatic processed fastq files

In [49]:
fqDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/Trimmomatic_Fixed_Rcorrected_FASTQ/'
fqcDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/fastqc/trimmomatic-processed/'

for sample in samples:
    cmdFile = open(scriptsDir + 'fastqc/trimmomatic-processed/' + '%s_trimmomatic-processed_fastqc_commands.cmds' % (sample),'w')
    mem='6'
    write_pbs_lines(scriptsDir,sample,cmdFile,fqDir,'fastqc_trimmo',mem)

    #READ ONE
    cmd = 'gunzip %sfixed_%s_cor_trim_1P.gz' % (fqDir,sample)
    cmdFile.write(cmd + '\n')    
    cmd = 'fastqc %sfixed_%s_cor_trim_1P -o %s' % (fqDir,sample,fqcDir)
    cmdFile.write(cmd + '\n')
    cmd = 'gzip %sfixed_%s_cor_trim_1P' % (fqDir,sample)
    cmdFile.write(cmd + '\n')
    #READ TWO
    cmd = 'gunzip %sfixed_%s_cor_trim_2P.gz' % (fqDir,sample)
    cmdFile.write(cmd + '\n')
    cmd = 'fastqc %sfixed_%s_cor_trim_2P -o %s' % (fqDir,sample,fqcDir)
    cmdFile.write(cmd + '\n')
    cmd = 'gzip %sfixed_%s_cor_trim_2P' % (fqDir,sample)
    cmdFile.write(cmd + '\n')
    
    cmdFile.close()    
    
    

In [None]:
### Run FASTQC on the formatted-trimmomatic-Rcorrected fastq

In [101]:
fqDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/Fastool_Trimmomatic_Fixed_Rcorrected_FASTQ/'
fqcDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/fastqc/formatted/'

for sample in samples:
    cmdFile = open(scriptsDir + 'fastqc/formatted/' + '%s_formatted_fastqc_commands.cmds' % (sample),'w')
    mem='6'
    write_pbs_lines(scriptsDir,sample,cmdFile,fqDir,'fastqc_formatted',mem)

    #READ ONE
    cmd = 'gunzip %s%s_Rcorrected_filtered_formatted_1P.fq.gz' % (fqDir,sample)
    cmdFile.write(cmd + '\n')    
    cmd = 'fastqc %s%s_Rcorrected_filtered_formatted_1P.fq -o %s' % (fqDir,sample,fqDir)
    cmdFile.write(cmd + '\n')
    cmd = 'gzip %s%s_Rcorrected_filtered_formatted_1P.fq' % (fqDir,sample)
    cmdFile.write(cmd + '\n')
    #READ TWO
    cmd = 'gunzip %s%s_Rcorrected_filtered_formatted_2P.fq.gz' % (fqDir,sample)
    cmdFile.write(cmd + '\n')    
    cmd = 'fastqc %s%s_Rcorrected_filtered_formatted_2P.fq -o %s' % (fqDir,sample,fqDir)
    cmdFile.write(cmd + '\n')
    cmd = 'gzip %s%s_Rcorrected_filtered_formatted_2P.fq' % (fqDir,sample)
    cmdFile.write(cmd + '\n')

    cmdFile.close()    
    
    

In [93]:
### Freedman script to remove fastqc high abundance sequences

In [103]:
fqDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/Fastool_Trimmomatic_Fixed_Rcorrected_FASTQ/'

#write PBS
cmdfile = scriptsDir + 'removeFastqcSeq_commands.pbs'
cmdFile = open(cmdfile,'w')
mem='2'
write_pbs_lines(scriptsDir,'RemoveSeq',cmdFile,fqDir,'total',mem)
cmdFile.write('/home/jmkidd/links/kidd-lab/jmkidd-projects/scripts/perlUtils/run-by-id-log.pl %s %s $PBS_ARRAYID' % (cmdfile.replace('pbs','cmds'),cmdfile.replace('.pbs','.logs').replace('/scripts/','/logs/')))
cmdFile.close()

#write commands file
cmdFile = open(cmdfile.replace('.pbs','.cmds'),'w')
outDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/OverRepRemoved_Fastool_Trimmomatic_Fixed_Rcorrected_FASTQ/'
for sample in samples:
    cmd = 'python /home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/scripts/RemoveFastqcOverrepSequenceReads.py '
    cmd += '-1 %s%s_Rcorrected_filtered_formatted_1P.fq.gz ' % (fqDir,sample)
    cmd += '-2 %s%s_Rcorrected_filtered_formatted_2P.fq.gz ' % (fqDir,sample)
    cmd += '-outDir %s' % (outDir)
    cmdFile.write(cmd + '\n')     
cmdFile.close()    
    

### Write GSNAP commands

In [43]:
fastqOutDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/Fastool_Trimmomatic_Fixed_Rcorrected_FASTQ/'

for sample in samples:	
    for genome in genomes:
        #Create command file for just one sample at a time
        if 'zoey' in genome:
            cmdFile = open(scriptsDir + 'Zoey-GSNAP-Rcorrected/' + '%s_%s_GSNAP-processing.cmds' % (genome,sample),'w')
        else:
            cmdFile = open(scriptsDir + 'CanFam-GSNAP-Rcorrected/' + '%s_%s_GSNAP-processing.cmds' % (genome,sample),'w')
        mem = '15'
        write_pbs_lines(scriptsDir,genome + '_' + sample,cmdFile,fastqOutDir,'gsnap',mem)

        #Writing commands to align using GSNAP-GMAP
        gsnapDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/GMAP-GSNAP/'
        gsnapOutDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/'
        ####RUN GSNAP/GMAP
        cmd = '%sbin/gsnap -d %s -D %sgenomes/%s -n 10 --quiet-if-excessive -m 5 -A sam -N 1 -o %s%s_Rcorrected/%s_m5_gsnap.sam %s%s_Rcorrected_filtered_formatted_1P.fq %s%s_Rcorrected_filtered_formatted_2P.fq' % (gsnapDir,genome,gsnapDir,genome,gsnapOutDir,genome,sample,fastqOutDir,sample,fastqOutDir,sample)
        cmdFile.write('%s\n' % cmd)
        cmdFile.close

        ####MAKE BAM FILE FROM GSNAP OUTFILE
        cmd = 'samtools view -bS %s%s_Rcorrected/%s_m5_gsnap.sam | samtools sort - %s%s_Rcorrected/%s_m5_gsnap_sorted' % (gsnapOutDir,genome,sample,gsnapOutDir,genome,sample)
        cmdFile.write('%s\n' % cmd)

        
        ##########################################################
        ##########################################################
        #WRITE TRINITY COMMANDS
        if 'zoey' in genome:
            cmdFile = open(scriptsDir + 'Zoey-Trinity-Rcorrected/' + '%s_%s_Trinity-processing.cmds' % (genome,sample),'w')
        else:
            cmdFile = open(scriptsDir + 'CanFam-Trinity-Rcorrected/' + '%s_%s_Trinity-processing.cmds' % (genome,sample),'w')

        write_pbs_lines(scriptsDir,sample,cmdFile,fastqOutDir,'%s_trin_pipe' % genome,'20')
        ###CREATE TRINITY OUTPUT DIRECTOR
        trinityOutDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Trinity_RNA-Seq_Analysis/trinity_alignments/' + '%s/%s_trinity/' % (genome,sample)		
        cmd = 'mkdir -p %s' % (trinityOutDir)
        cmdFile.write('%s\n' % cmd)

        ###GUNZIP INPUT BAM FILE
        bamfile = '%s%s/%s_m5_gsnap_sorted' % (gsnapOutDir,genome,sample)
        cmd = 'gunzip %s.gz' % bamfile
        cmdFile.write('#%s\n' % cmd)	

        ####RUN TRINITY
        cmd = 'Trinity --seqType fq --max_memory 10G --left %s%s_filtered_formatted_1P.fq --right %s%s_filtered_formatted_2P.fq --min_contig_length 200 --output %s --genome_guided_bam %s%s_reformatted/%s_m5_gsnap_sorted.bam --genome_guided_max_intron 200000' % (fastqOutDir,sample,fastqOutDir,sample,trinityOutDir,gsnapOutDir,genome,sample)
        cmdFile.write('#%s\n' % cmd)	

        ###GZIP INPUT BAM FILE
        cmd = 'gzip %s' % bamfile
        cmdFile.write('#%s\n' % cmd)	

        #break ## just do zoey for now and see if it works
        #Gzip the re-formatted FASTQ output files once GSNAP is done
        cmd = 'gzip %s%s_filtered_formatted_1P.fq' % (fastqOutDir,sample)
        cmdFile.write('#%s\n' % cmd)
        cmd = 'gzip %s%s_filtered_formatted_2P.fq' % (fastqOutDir,sample)
        cmdFile.write('#%s\n' % cmd)	

        cmdFile.close


#### N=1 so that there is only one hit per read

In [45]:
fastqOutDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/Fastool_Trimmomatic_Fixed_Rcorrected_FASTQ/'

for sample in samples:	
    for genome in genomes:
        #Create command file for just one sample at a time
        if 'zoey' in genome:
            cmdFile = open(scriptsDir + 'Zoey-GSNAP-Rcorrected_n1/' + '%s_%s_GSNAP-processing.cmds' % (genome,sample),'w')
        else:
            cmdFile = open(scriptsDir + 'CanFam-GSNAP-Rcorrected_n1/' + '%s_%s_GSNAP-processing.cmds' % (genome,sample),'w')
        mem = '15'
        write_pbs_lines(scriptsDir,genome + '_' + sample,cmdFile,fastqOutDir,'gsnap_n1',mem)

        #Writing commands to align using GSNAP-GMAP
        gsnapDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/GMAP-GSNAP/'
        gsnapOutDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/%s_Rcorrected_n1/' % genome
        ####RUN GSNAP/GMAP
        cmd = '%sbin/gsnap -d %s -D %sgenomes/%s -n 1 --quiet-if-excessive -m 5 -A sam -N 1 -o %s%s_m5_gsnap.sam %s%s_Rcorrected_filtered_formatted_1P.fq %s%s_Rcorrected_filtered_formatted_2P.fq' % (gsnapDir,genome,gsnapDir,genome,gsnapOutDir,sample,fastqOutDir,sample,fastqOutDir,sample)
        cmdFile.write('%s\n' % cmd)
        cmdFile.close

        ####MAKE BAM FILE FROM GSNAP OUTFILE
        cmd = 'samtools view -bS %s%s_m5_gsnap.sam | samtools sort - %s%s_m5_gsnap_sorted' % (gsnapOutDir,sample,gsnapOutDir,sample)
        cmdFile.write('%s\n' % cmd)

        
        ##########################################################
        ##########################################################
        #WRITE TRINITY COMMANDS
        if 'zoey' in genome:
            cmdFile = open(scriptsDir + 'Zoey-Trinity-Rcorrected/' + '%s_%s_n1_Trinity-processing.cmds' % (genome,sample),'w')
        else:
            cmdFile = open(scriptsDir + 'CanFam-Trinity-Rcorrected/' + '%s_%s_n1_Trinity-processing.cmds' % (genome,sample),'w')

        write_pbs_lines(scriptsDir,sample,cmdFile,fastqOutDir,'%s_trin_pipe' % genome,'20')
        ###CREATE TRINITY OUTPUT DIRECTOR
        trinityOutDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Trinity_RNA-Seq_Analysis/trinity_alignments/' + '%s/%s_trinity/' % (genome,sample)		
        cmd = 'mkdir -p %s' % (trinityOutDir)
        cmdFile.write('%s\n' % cmd)

        ###GUNZIP INPUT BAM FILE
        bamfile = '%s%s_m5_gsnap_sorted' % (gsnapOutDir,sample)
        cmd = 'gunzip %s.gz' % bamfile
        cmdFile.write('#%s\n' % cmd)	

        ####RUN TRINITY
        cmd = 'Trinity --seqType fq --max_memory 10G --left %s%s_filtered_formatted_1P.fq --right %s%s_filtered_formatted_2P.fq --min_contig_length 200 --output %s --genome_guided_bam %s%s_m5_gsnap_sorted.bam --genome_guided_max_intron 200000' % (fastqOutDir,sample,fastqOutDir,sample,trinityOutDir,gsnapOutDir,sample)
        cmdFile.write('#%s\n' % cmd)	

        ###GZIP INPUT BAM FILE
        cmd = 'gzip %s' % bamfile
        cmdFile.write('#%s\n' % cmd)	

        #break ## just do zoey for now and see if it works
        #Gzip the re-formatted FASTQ output files once GSNAP is done
        cmd = 'gzip %s%s_filtered_formatted_1P.fq' % (fastqOutDir,sample)
        cmdFile.write('#%s\n' % cmd)
        cmd = 'gzip %s%s_filtered_formatted_2P.fq' % (fastqOutDir,sample)
        cmdFile.write('#%s\n' % cmd)	

        cmdFile.close


In [108]:
#### Gsnap for the overrep seqs removed
fastqOutDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/OverRepRemoved_Fastool_Trimmomatic_Fixed_Rcorrected_FASTQ/'

for sample in samples:	
    for genome in genomes:
        #Create command file for just one sample at a time
        if 'zoey' in genome:
            cmdFile = open(scriptsDir + 'Zoey-GSNAP-Rcorrected-Overrep_n1/' + '%s_%s_GSNAP-processing.cmds' % (genome,sample),'w')
        else:
            cmdFile = open(scriptsDir + 'CanFam-GSNAP-Rcorrected-Overrep_n1/' + '%s_%s_GSNAP-processing.cmds' % (genome,sample),'w')
        mem = '15'
        write_pbs_lines(scriptsDir,genome + '_' + sample,cmdFile,fastqOutDir,'gsnap_n1',mem)

        #Writing commands to align using GSNAP-GMAP
        gsnapDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/GMAP-GSNAP/'
        gsnapOutDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/%s_Rcorrected_OverRep_n1/' % genome
        
        #GUNZIP THE FQ FILES
        cmd = 'gunzip %s%s_Rcorrected_filtered_formatted_1P_rmoverrep.fq.gz' % (fastqOutDir,sample)
        cmdFile.write('%s\n' % cmd)
        cmd = 'gunzip %s%s_Rcorrected_filtered_formatted_2P_rmoverrep.fq.gz' % (fastqOutDir,sample)
        cmdFile.write('%s\n' % cmd)        
        
        ####RUN GSNAP/GMAP
        cmd = '%sbin/gsnap -d %s -D %sgenomes/%s -n 1 --quiet-if-excessive -m 5 -A sam -N 1 -o %s%s_m5_gsnap.sam %s%s_Rcorrected_filtered_formatted_1P_rmoverrep.fq %s%s_Rcorrected_filtered_formatted_2P_rmoverrep.fq' % (gsnapDir,genome,gsnapDir,genome,gsnapOutDir,sample,fastqOutDir,sample,fastqOutDir,sample)
        cmdFile.write('%s\n' % cmd)

        ####MAKE BAM FILE FROM GSNAP OUTFILE
        cmd = 'samtools view -bS %s%s_m5_gsnap.sam | samtools sort - %s%s_m5_gsnap_sorted' % (gsnapOutDir,sample,gsnapOutDir,sample)
        cmdFile.write('%s\n' % cmd)

        #ZIP THE FQ FILES
        cmd = 'gzip %s%s_Rcorrected_filtered_formatted_1P_rmoverrep.fq' % (fastqOutDir,sample)
        cmdFile.write('%s\n' % cmd)
        cmd = 'gzip %s%s_Rcorrected_filtered_formatted_2P_rmoverrep.fq' % (fastqOutDir,sample)
        cmdFile.write('%s\n' % cmd)       
        
        cmdFile.close


In [50]:
## Calculate flagstats for bam files using each GSNAP threshold

In [61]:
inDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/'
scriptDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/scripts/'
taskCount = 0 
cmdfile = scriptDir + 'flagstat_commands.cmds'
cmdFile = open(cmdfile, 'w')

#Write commands file
for genome in genomes:
    for sample in samples:
        bamDir = inDir + '%s_Rcorrected/' % genome
        taskCount += 1
        cmd = 'samtools flagstat %s%s_m5_gsnap_sorted.bam > %s%s_m5_gsnap_sorted.flagstat' % (bamDir,sample,bamDir,sample)
        cmdFile.write('%s\n' % cmd)
cmdFile.close()
    
#Write PBS file
pbsfile = cmdfile.replace('.cmds','.pbs')
pbsFile = open(pbsfile,'w')
mem='2'
write_pbs_task_lines(scriptsDir,'total',pbsFile,inDir,'flagstat',mem,taskCount,cmdfile)
pbsFile.write('/home/ampend/links/kidd-lab/jmkidd-projects/scripts/perlUtils/run-by-id-log.pl %s %s $PBS_ARRAYID\n' % (cmdfile.replace('pbs','cmds'),cmdfile.replace('.pbs','.logs').replace('/scripts/','/logs/')))
pbsFile.close()



## Cufflinks  by Tissue

In [73]:
inDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/'
scriptDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/scripts/'
taskCount = 0 
cmdfile = scriptDir + 'cufflinks_commands_bytissue.cmds'
cmdFile = open(cmdfile, 'w')

#Write commands file
for genome in genomes:
    cufflinksDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Cufflinks_GSNAPAlignments/'
    for sample in samples:
        bamDir = inDir + '%s_Rcorrected/' % genome
        taskCount += 1
        cmd = 'mkdir -p %s%s && cufflinks -o %s%s_Rcorrected/%s/ %s%s_m5_gsnap_sorted.bam' % (cufflinksDir,sample,cufflinksDir,genome,sample,bamDir,sample)
        cmdFile.write('%s\n' % cmd)
cmdFile.close()
    
#Write PBS file
pbsfile = cmdfile.replace('.cmds','.pbs')
pbsFile = open(pbsfile,'w')
mem='6'
write_pbs_task_lines(scriptsDir,'total',pbsFile,inDir,'cuff_n10',mem,taskCount,cmdfile)
pbsFile.close()

In [78]:
inDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/'
scriptDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/scripts/'
taskCount = 0 


#Write commands file
for genome in genomes:
    cmdfile = scriptDir + genome + '_Cufflinks_n1_OverRepRem.cmds'
    cmdFile = open(cmdfile, 'w')
    cufflinksDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Cufflinks_GSNAPAlignments/%s_Rcorrected_n1/' % genome
    for sample in samples:
        bamDir = inDir + '%s_Rcorrected_OverRep_n1/' % genome
        taskCount += 1
        cmd = 'mkdir -p %s%s && cufflinks -o %s%s/ %s%s_m5_gsnap_sorted.bam' % (cufflinksDir,sample,cufflinksDir,sample,bamDir,sample)
        cmdFile.write('%s\n' % cmd)
    cmdFile.close()

    #Write PBS file
    pbsfile = cmdfile.replace('.cmds','.pbs')
    pbsFile = open(pbsfile,'w')
    mem='6'
    write_pbs_task_lines(scriptsDir,'%s' % genome,pbsFile,inDir,'cuff_n1',mem,taskCount,cmdfile)
    
    pbsFile.close()
    


In [6]:
##########################################################################################
##########################################################################################
##########################################################################################

# Merge GSNAP BAMs by tissue and all together

In [37]:
#We want to merge the BAMs by tissue first, and then all together 
scriptsDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/scripts/'
cmdfile = scriptsDir + 'MergeBAMs.cmds'
cmdFile = open(cmdfile,'w')
cmdFile.write('module load bamtools\n')

taskCount = 0
for genome in genomes:
    print('\nGenome: %s' % genome)
    scriptDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/scripts/Cufflinks/%s/' % genome
    for tissue in sampleDict.keys():
        taskCount += 1 
        gsnapDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/%s_Rcorrected/' % genome
        cmd = 'bamtools merge -out %sMerged_%s_%s_m5_gsnap_sorted.bam ' % (gsnapDir,genome,tissue)
        for sample in sampleDict[tissue]:
            bamFile = gsnapDir + sample + '_m5_gsnap_sorted.bam'
            cmd += '-in %s ' % bamFile
        cmdFile.write('%s\n' % cmd)

cmdFile.close()

pbsFile = open(scriptsDir + 'MergeBAMs.pbs','w')
mem=10
write_pbs_task_lines(scriptsDir,'total',pbsFile,scriptsDir,'merge_bams',mem,taskCount,cmdfile)
pbsFile.close()


Genome: zoey-2.3

Genome: canFam3.1-noY


In [89]:
for genome in genomes:
    bamDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/%s_Rcorrected_n1/' % genome

    #MERGE
    cmd = 'bamtools merge '
    for sample in samples:
        cmd += '-in %s%s_m5_gsnap_sorted.bam ' % (bamDir,sample)
    cmd += '-out %sMerged_AllTissues_m5_gsnap.bam ' % bamDir

    #SORT
    cmd += '&& samtools sort %sMerged_AllTissues_m5_gsnap.bam %sMerged_AllTissues_m5_gsnap.sorted ' % (bamDir,bamDir)
    
    #REMOVE UNSORTED
    cmd += '&& rm %sMerged_AllTissues_m5_gsnap.bam' % bamDir
    print(cmd)
    

bamtools merge -in /home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/zoey-2.3_Rcorrected_n1/SRR388734_m5_gsnap_sorted.bam -in /home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/zoey-2.3_Rcorrected_n1/SRR388735_m5_gsnap_sorted.bam -in /home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/zoey-2.3_Rcorrected_n1/SRR388747_m5_gsnap_sorted.bam -in /home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/zoey-2.3_Rcorrected_n1/SRR388736_m5_gsnap_sorted.bam -in /home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/zoey-2.3_Rcorrected_n1/SRR388741_m5_gsnap_sorted.bam -in /home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/zoey-2.3_Rcorrected_n1/SRR388752_m5_gsnap_sorted.bam -in /home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_

## Cuffmerge by tissue

In [92]:
inDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/'
scriptDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/scripts/'
taskCount = 0 

#Write commands file
for genome in genomes:
    cmdfile = scriptDir + genome + '_Cuffmerge_n1.cmds'
    cmdFile = open(cmdfile, 'w')
    cufflinksDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Cufflinks_GSNAPAlignments/%s_Rcorrected_n1/' % genome
    for tissue in sampleDict.keys():
        for sample in sampleDict[tissue]:
            print(sample)
        sys.exit()
        bamDir = inDir + '%s_Rcorrected_n1/' % genome
        taskCount += 1
        cmd = 'mkdir -p %s%s && cufflinks -o %s%s/ %s%s_m5_gsnap_sorted.bam' % (cufflinksDir,sample,cufflinksDir,sample,bamDir,sample)
        cmdFile.write('%s\n' % cmd)
    cmdFile.close()

    #Write PBS file
    pbsfile = cmdfile.replace('.cmds','.pbs')
    pbsFile = open(pbsfile,'w')
    mem='6'
    write_pbs_task_lines(scriptsDir,'%s' % genome,pbsFile,inDir,'cuff_n1',mem,taskCount,cmdfile)
    
    pbsFile.close()
    


SRR388749
SRR388753
SRR388757
Blood


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [3]:
cmdFile = open('/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/VALOR/scripts/' + 'CANFAM_calc_insertsize.cmds','w')
fosmidPath = '/home/ampend/links/kidd-lab/hoffmakl-projects/zoeyRef_valor/input/'
outDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/VALOR/input/canfam_alignments/insert_size_stats/'

for i in range(1,193):
    #Make the pool number always be three digits long
    i = str(i)
    if len(i) < 2:
        i = '00' + i 
    if len(i) == 2:
        i = '0' + i
    #picard command:
    cmd = 'java -jar $CLASSPATH/picard.jar CollectInsertSizeMetrics I=%sZoey_%s.markdup.bam O=%sZoey_%s_insert_size.txt H=%sZoey_%s_insert_size_histogram.pdf M=0.5' % (fosmidPath, i, outDir, i, outDir, i)
    cmdFile.write('%s\n' % cmd)

cmdFile.close()

In [4]:
outDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/VALOR/input/canfam_alignments/insert_size_stats/'

for i in range(1,193):
    #Make the pool number always be three digits long
    i = str(i)
    if len(i) < 2:
        i = '00' + i 
    if len(i) == 2:
        i = '0' + i
    
    file = outDir + 'Zoey_184_insert_size.txt'
    for line in open(file,'r'):
        line=line.rstrip()
        print(line)
        sys.exit()

## htsjdk.samtools.metrics.StringHeader


NameError: name 'sys' is not defined

In [5]:
import sys

In [10]:
#Set up empty table / dict 
counts = {}
#0 = left raw reads, #1 = right raw reads
#2 = filtered/formatted left reads,  #3 = filtered/formatted right reads, 
#4 = completed, sorted Zoey BAM, #5 = completed, sorted canFam BAM, 

numberOfIndices = 5

for sample in samples:
    counts[sample] = ['','','','','','']
print('Counts dictionary has %i samples' % len(counts.keys()))    

Counts dictionary has 42 samples


## Checking that the Raw FASTQ files have equal L and R reads prior to filtration

In [26]:
fastqDir = '/home/ampend/links/kidd-lab/ampend-projects/Novel_Sequence_Analysis/rna-seq/input/fastq/'
rawDir = fastqDir + 'RawFASTQ/'
fastqcDir = fastqDir + 'fastqc_files/' #outdir for fastqc to write its info

countFile = open(fastqDir + 'ReadCounts_RawFASTQ.txt','w')
for sample in samples:
    for i in range(1,3): #files _1 and _2 == F and R, respectively
        file = rawDir + sample + '_%i.fastq.gz' % (i)
        
        cmd = 'echo "%s_%i" >> %sRawReadCounts.txt' % (sample,i,fastqDir)
        runCMD(cmd)
        
        cmd = 'zcat %s | wc -l >> %sRawReadCounts.txt' % (file,fastqDir)
        runCMD(cmd)
    countFile.close()

In [53]:
for line in open('/home/ampend/links/kidd-lab/ampend-projects/Novel_Sequence_Analysis/rna-seq/input/fastq/ReadCounts_RawFASTQ.txt','r'):
    line=line.rstrip().split()
    if 'SRR' in line[0]:
        sample = line[0].split('_')[0]
        continue
    if sample not in counts.keys():
        counts[sample][0] = int(line[0])/4
        continue
    else:
        counts[sample][1] = int(line[0])/4
for sample in counts.keys():
    read1,read2 = counts[sample][0],counts[sample][0]
    if read1 != read2:
        print(sample,counts[sample])
###ALL GOOD


## Calculating the number of reads left after filtration

In [None]:
filtDir = fastqDir + 'FormattedFastool_FilteredFASTQ_Trimmomatic/'

countFile = open(fastqDir + 'ReadCounts_FilteredFormatted.txt','w')
for sample in samples:
    for i in range(1,3): #files _1 and _2 == F and R, respectively
        file = filtDir + sample + '_filtered_formatted_%iP.fq' % (i)
        
        cmd = 'echo "%s_%i" >> %sReadCounts_FilteredFormatted.txt' % (sample,i,fastqDir)
        runCMD(cmd)
        
        cmd = 'wc -l %s >> %sReadCounts_FilteredFormatted.txt' % (file,fastqDir)
        runCMD(cmd)
countFile.close()

In [7]:
counts = {}
for line in open('/home/ampend/links/kidd-lab/ampend-projects/Novel_Sequence_Analysis/rna-seq/input/fastq/ReadCounts_RawFASTQ.txt','r'):
    line=line.rstrip().split()
    if 'SRR' in line[0]:
        sample = line[0].split('_')[0]
        continue
    if '_1' in line[0]:
        counts[sample][2] = int(line[0])/4
    if '_2' in line[0]:
        counts[sample][3] = int(line[0])/4
for sample in counts.keys():
    read1,read2 = counts[sample][2],counts[sample][3]
    if read1 != read2:
        print(sample,counts[sample])
###ALL GOOD


## Checking RNA-Seq libraries that need to be aligned with GSNAP still

In [11]:
alignmentDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Alignments_GSNAP/'

for genome in genomes:
    for sample in samples:
        for file in glob.glob(alignmentDir + genome + '_reformatted/' + '*sorted.bam*'):
            if sample in file:
                if 'zoey' in genome:
                    counts[sample][4] = True
                else:
                    counts[sample][5] = True

In [16]:
#WHAT ZOEY COMMAND FILES SHOULD WE SUBMIT
count=0
for sample in counts.keys():
    if counts[sample][4] != True:
        print('zoey-2.3_%s_GSNAP-processing.cmds' %sample)
        count+=1
print(count)

#WHAT CANFAM COMMAND FILES SHOULD WE SUBMIT
count=0
for sample in counts.keys():
    if counts[sample][5] != True:
        print('*_%s_GSNAP-processing.cmds' %sample)
        count+=1
print(count)

zoey-2.3_SRR543734_GSNAP-processing.cmds
zoey-2.3_SRR388760_GSNAP-processing.cmds
2
*_SRR388734_GSNAP-processing.cmds
*_SRR543734_GSNAP-processing.cmds
*_SRR543732_GSNAP-processing.cmds
*_SRR388738_GSNAP-processing.cmds
*_SRR543735_GSNAP-processing.cmds
*_SRR388751_GSNAP-processing.cmds
*_SRR536881_GSNAP-processing.cmds
*_SRR388760_GSNAP-processing.cmds
*_SRR536885_GSNAP-processing.cmds
*_SRR536884_GSNAP-processing.cmds
10


In [17]:
###########################################################

# Trinity Stats

In [18]:
trinDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/Trinity_RNA-Seq_Analysis/trinity_alignments/no-genome-guided/'

In [None]:
for sample in samples:
    for file in glob.glob(trinDir + sample + '/' + )

In [25]:
scriptsDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/scripts/'

### Fix Rcorrector fastq files

In [54]:
fqDir = '/home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/input/RawFASTQ/'

cmdFile = open(scriptsDir + 'FixRcorrFastq_commands.cmds','w')
mem='10'
write_pbs_lines(scriptsDir,'FixRCorrector',cmdFile,fqDir,'total',mem)

for sample in samples:
    cmd = 'python /home/ampend/links/kidd-lab-scratch/ampend-projects/Zoey_Genome_Project/rna-seq/scripts/filterUncorrectabledPEfastq.py -1 %s%s.cor.fq.gz -2 %s%s.cor.fq.gz -o fixed 2>&1' % (fqDir,sample,fqDir,sample)
    cmdFile.write(cmd + '\n')    
    
cmdFile.close()    
    