In [1]:
import os
import subprocess

import pandas as pd

In [3]:
# Turn off pandas' chained-assignment check (None = no SettingWithCopy warning or error).
pd.options.mode.chained_assignment = None

---------------------------

In [4]:
# Folder holding the input reads (.bz2 archives and .fastq / .fastqsanger files).
source_dir = '/data/parastou/RNAdeg/data/AllRNA/'

In [13]:
# Root folder for all results; the 'bams', 'tagged_bams' and 'xp_data' sub-folders live under it.
out_dir = '/data/parastou/RNAdeg/results/RipRna/'

--------------

## Workflow

- Unzip fastq files.
- Remove 0-length (empty) reads from fastq files (optional).
- Run STAR aligner.
- Index .bam files.
- Remove rRNA reads from .bam and add 'GE' tag to reads which map to genes (RemrRNA).
- Calculate and save gene counts table for all given samples from .bam files.
- TPM-normalize gene counts table and save.

---------

### 1 - Prepare output folders

In [14]:
# One sub-folder of out_dir per stage: alignments, gene-tagged alignments, expression tables.
out_bam, out_tagged, xp_data = (
    os.path.join(out_dir, subfolder)
    for subfolder in ('bams', 'tagged_bams', 'xp_data')
)

In [7]:
# Create the alignment output folder. os.makedirs also creates any missing
# parent folder and needs no shell; the isdir guard keeps the cell re-runnable
# on Python 2, where makedirs has no exist_ok argument.
if not os.path.isdir(out_bam):
    os.makedirs(out_bam)

In [8]:
# Create the folder for gene-tagged .bam files. os.makedirs also creates any
# missing parent folder and needs no shell; the isdir guard keeps the cell
# re-runnable on Python 2, where makedirs has no exist_ok argument.
if not os.path.isdir(out_tagged):
    os.makedirs(out_tagged)

In [9]:
# Create the folder for the expression tables. os.makedirs also creates any
# missing parent folder and needs no shell; the isdir guard keeps the cell
# re-runnable on Python 2, where makedirs has no exist_ok argument.
if not os.path.isdir(xp_data):
    os.makedirs(xp_data)

-----------------------

### 2 - Unzip fastq files.

In [13]:
for filename in os.listdir(source_dir):
    if filename.endswith('bz2'):
        
        filepath = os.path.join(source_dir, filename)
        !bzip2 -d $filepath

### Optional step: remove zero-length reads from fastq files.

In [None]:
# Write a '<name>.ztr.fastq' copy of every fastq file, keeping only reads whose
# sequence is non-empty.
for filename in os.listdir(source_dir):
    # Skip anything that is not a fastq file, and skip files this step already
    # produced so a re-run does not create '.ztr.ztr.fastq' copies.
    if not filename.endswith('.fastq') or filename.endswith('.ztr.fastq'):
        continue

    filepath = os.path.join(source_dir, filename)
    outfilepath = os.path.join(source_dir, filename[:-len('.fastq')] + '.ztr.fastq')
    print(outfilepath)

    # bioawk is started directly (no shell, no IPython $/{} expansion), so the
    # awk program reaches it verbatim: $seq/$name/$qual stay bioawk field names
    # and "\n" is interpreted by awk itself.
    awk_program = r'length($seq) > 0 {print "@"$name"\n"$seq"\n+\n"$qual}'

    # Mode 'w' (not append) so re-running the cell does not duplicate reads.
    # check_call raises CalledProcessError if bioawk exits with an error.
    with open(outfilepath, 'w') as out_handle:
        subprocess.check_call(['bioawk', '-cfastx', awk_program, filepath],
                              stdout=out_handle)

    # The unfiltered input file is kept; delete it by hand if it is no longer needed.

### 3 - Align fastq files

In [10]:
# Set STAR runtime parameters.
def star_command(in_file, n_threads=20,
                 star='/home/parastou/star_2.5.1b',
                 genome_dir='/data/parastou/RNAdeg/genomes/spombe/staridx/'):
    """Build the STAR command line for one read file.

    Parameters
    ----------
    in_file : str
        Name of a read file inside ``source_dir`` (e.g. ``'sample.fastq'``).
    n_threads : int, optional
        Value passed to ``--runThreadN``.
    star : str, optional
        Path of the STAR executable.
    genome_dir : str, optional
        Value passed to ``--genomeDir``.

    Returns
    -------
    str
        The complete command line; nothing is executed here.
    """
    # os.path.join works whether or not source_dir ends with a path separator.
    in_path = os.path.join(source_dir, in_file)

    # Output prefix = file name up to (not including) the LAST 'fastq', so the
    # trailing '.' is kept ('sample.fastq' -> 'sample.') and a name that
    # merely contains 'fastq' earlier on is not cut short.
    out_prefix = os.path.join(out_bam, in_file.rsplit('fastq', 1)[0])

    return ' '.join([
        star,
        '--runThreadN', str(n_threads),
        '--genomeDir', genome_dir,
        '--readFilesIn', in_path,
        '--outFileNamePrefix', out_prefix,
        '--outSAMtype BAM SortedByCoordinate',
    ])

In [18]:
# Run STAR
for filename in os.listdir(source_dir):
    
    if filename.endswith(('.fastq', '.fastqsanger')):
        print(filename)
        command = star_command(filename)
        print command
        !$command

63_S2ChIPp.ztr.fastq
/home/parastou/star_2.5.1b --runThreadN 20 --genomeDir /data/parastou/RNAdeg/genomes/spombe/staridx/ --readFilesIn /data/parastou/RNAdeg/data/AllChIP/63_S2ChIPp.ztr.fastq --outFileNamePrefix /data/parastou/RNAdeg/results/RipChip/bams/63_S2ChIPp.ztr. --outSAMtype BAM SortedByCoordinate
Oct 05 10:08:35 ..... Started STAR run
Oct 05 10:08:35 ..... Loading genome
Oct 05 10:08:35 ..... Started mapping
Oct 05 10:10:42 ..... Started sorting BAM
Oct 05 10:10:46 ..... Finished successfully


In [19]:
# Index alignment files.
for filename in os.listdir(out_bam):
    
    if filename.endswith('.Aligned.sortedByCoord.out.bam'):
        filepath = os.path.join(out_bam, filename)
        !samtools index $filepath

samtools index: "/data/parastou/RNAdeg/results/RipChip/bams/63_S2Ph_ChIPp.Aligned.sortedByCoord.out.bam" is in a format that cannot be usefully indexed


### 4 - Tag bam files with gene IDs

In [20]:
# Set RemrRNA command runtime parameters.
def remrrna_command():
    """Return the command line that invokes RemrRNA.py.

    The script is called directly (no interpreter prefix) with ``out_bam`` as
    ``-d``, the annotation table as ``-g`` and ``out_tagged`` as ``-o``.
    Nothing is executed here.
    """
    script = '/data/parastou/RNAdeg/pyRNAdeg/RemrRNA.py'
    gene_table = '/data/parastou/RNAdeg/annotation/schizosaccharomyces_pombe.chr.extended.csv'

    return ' '.join([script, '-d', out_bam, '-g', gene_table, '-o', out_tagged])

In [None]:
# Run RemrRNA
remrrna = remrrna_command()
!$remrrna

### 5 - Compute raw and TPM-normalized gene count tables.

In [15]:
# Set GeneExpressionTable runtime parameters.
def gxt_command():
    """Return the command line that runs GeneExpressionTable.py through ``python``.

    ``out_tagged`` is passed as ``-d``, the annotation table as ``-g`` and
    ``xp_data`` as ``-o``. Nothing is executed here.
    """
    script = '/data/parastou/RNAdeg/pyRNAdeg/GeneExpressionTable.py'
    gene_table = '/data/parastou/RNAdeg/annotation/schizosaccharomyces_pombe.chr.extended.csv'

    return ' '.join(['python', script, '-d', out_tagged, '-g', gene_table, '-o', xp_data])

In [16]:
# Run GeneExpressionTable
gxt = gxt_command()
!$gxt

Call to GeneExpressionTable module....

This module calculates gene expression in given alignment file(s).
Input: folder containing .bam file(s)
Output: raw and tpm gene counts data (.csv) files
----------------------------------------
Input bam: /data/parastou/RNAdeg/results/RipRna/tagged_bams/63_RNA_pA_3.bam
Input bam: /data/parastou/RNAdeg/results/RipRna/tagged_bams/63_RNA_pA_4.bam
Input bam: /data/parastou/RNAdeg/results/RipRna/tagged_bams/301_S2RIP_3.bam
Input bam: /data/parastou/RNAdeg/results/RipRna/tagged_bams/302S2RIP_1.bam
Input bam: /data/parastou/RNAdeg/results/RipRna/tagged_bams/530S2RIP_1.bam
Input bam: /data/parastou/RNAdeg/results/RipRna/tagged_bams/63.bam
Input bam: /data/parastou/RNAdeg/results/RipRna/tagged_bams/65.bam
Input bam: /data/parastou/RNAdeg/results/RipRna/tagged_bams/80_RNA_pA.bam
Input bam: /data/parastou/RNAdeg/results/RipRna/tagged_bams/504S2RIP_2.bam
Input bam: /data/parastou/RNAdeg/results/RipRna/tagged_bams/80pARNA_2.bam
Input bam: /data/parastou/RNA

----------------