In [1]:
import os
import subprocess

import pandas as pd

In [3]:
# Turn off pandas' chained-assignment (SettingWithCopy) check for this session.
pd.set_option('mode.chained_assignment', None)

---------------------------

In [4]:
# Folder that is scanned for the input read files (.bz2 / .fastq / .fastqsanger).
source_dir = "/data/parastou/RNAdeg/data/AllChIP/"

In [5]:
# Root folder under which the 'bams' and 'xp_data' result folders are created.
out_dir = "/data/parastou/RNAdeg/results/RipChip/"

--------------

## Workflow

- Unzip fastq files.
- Remove zero-length reads from fastq files (optional).
- Run STAR aligner.
- Index .bam files.
- Calculate and save gene counts table for all given samples from .bam files.
- TPM-normalize gene counts table and save.

---------

### 1 - Prepare output folders

In [6]:
# Result sub-folders: alignments go to 'bams', count tables go to 'xp_data'.
out_bam, xp_data = (
    os.path.join(out_dir, 'bams'),
    os.path.join(out_dir, 'xp_data'),
)

In [7]:
# Create the BAM output folder if it is missing.
# os.makedirs also creates missing parent folders (a plain `mkdir` fails when
# `out_dir` itself does not exist yet) and is not affected by spaces in the path.
if not os.path.isdir(out_bam):
    os.makedirs(out_bam)

In [10]:
# Create the count-table output folder if it is missing.
# os.makedirs also creates missing parent folders (a plain `mkdir` fails when
# `out_dir` itself does not exist yet) and is not affected by spaces in the path.
if not os.path.isdir(xp_data):
    os.makedirs(xp_data)

-----------------------

### 2 - Unzip fastq files.

In [13]:
for filename in os.listdir(source_dir):
    if filename.endswith('bz2'):
        
        filepath = os.path.join(source_dir, filename)
        !bzip2 -d $filepath

### Optional step: remove zero-length (empty) reads from fastq files.

In [None]:
# Write a filtered copy (<name>.ztr.fastq) of every FASTQ file that keeps only
# reads with a non-empty sequence, then delete the unfiltered original.
#
# bioawk is called through subprocess with an argument list, so neither the
# shell nor IPython's `$var` / `{expr}` expansion touches the awk program.
# (The previous `!bioawk ...` line had its closing quote at the end of the
# line, so the input path and the redirect were part of the awk program text.)
for filename in os.listdir(source_dir):
    # Skip non-FASTQ files and files that were already filtered in an earlier
    # run; otherwise re-running this cell would create '<name>.ztr.ztr.fastq'.
    if not filename.endswith('.fastq') or filename.endswith('.ztr.fastq'):
        continue

    filepath = os.path.join(source_dir, filename)
    outfilepath = os.path.join(source_dir, filename[:-len('.fastq')] + '.ztr.fastq')
    print(outfilepath)

    # Raw string: awk must receive the two characters '\n', not a real newline.
    awk_program = r'length($seq) > 0 {print "@"$name"\n"$seq"\n+\n"$qual}'

    # Mode 'w' (not append) so that a re-run after a failure cannot duplicate reads.
    with open(outfilepath, 'w') as out_handle:
        # check_call raises CalledProcessError on a non-zero exit status, so the
        # original file below is only removed after a successful filter run.
        subprocess.check_call(['bioawk', '-cfastx', awk_program, filepath], stdout=out_handle)

    os.remove(filepath)

### 3 - Align fastq files

In [12]:
# Set STAR runtime parameters.
def star_command(in_file):
    """Build the STAR command line that aligns a single FASTQ file.

    Parameters
    ----------
    in_file : str
        File name (not a full path) of a read file located in the notebook-level
        `source_dir` folder.

    Returns
    -------
    str
        The complete command string. Output files are prefixed with
        ``<out_bam>/<in_file up to the last 'fastq'>``, e.g. ``sample.fastq``
        yields the prefix ``<out_bam>/sample.``.

    Notes
    -----
    Reads the notebook-level variables `source_dir` and `out_bam`.
    """
    star = '/home/parastou/star_2.5.1b'
    n_threads = 4
    genome_dir = '/data/parastou/RNAdeg/genomes/spombe/star_nogtf_idx/'

    # os.path.join works whether or not `source_dir` ends with a path separator.
    read_path = os.path.join(source_dir, in_file)

    # Cut at the LAST 'fastq' so that a file name that itself contains 'fastq'
    # (e.g. 'fastq_run1.fastq') keeps its stem instead of yielding an empty prefix.
    out_prefix = os.path.join(out_bam, in_file.rsplit('fastq', 1)[0])

    # NOTE(review): '--alignIntronMax 1' and '--alignEndsType EndToEnd' presumably
    # disable spliced and soft-clipped alignments for ChIP reads; confirm in the STAR manual.
    command = ' '.join([
        star,
        '--runThreadN', str(n_threads),
        '--genomeDir', genome_dir,
        '--readFilesIn', read_path,
        '--outFileNamePrefix', out_prefix,
        '--outSAMtype BAM SortedByCoordinate',
        '--alignIntronMax 1',
        '--alignEndsType EndToEnd',
    ])

    return command

In [18]:
# Run STAR
for filename in os.listdir(source_dir):
    
    if filename.endswith(('.fastq', '.fastqsanger')):
        print(filename)
        command = star_command(filename)
        !$command

63_S2ChIPp.ztr.fastq
Jan 22 09:32:40 ..... Started STAR run
Jan 22 09:32:40 ..... Loading genome
Jan 22 09:32:48 ..... Started mapping
Jan 22 09:33:37 ..... Started sorting BAM
Jan 22 09:33:43 ..... Finished successfully
63_S2ChIP_2.ztr.fastq
Jan 22 09:33:43 ..... Started STAR run
Jan 22 09:33:43 ..... Loading genome
Jan 22 09:33:44 ..... Started mapping
Jan 22 09:34:33 ..... Started sorting BAM
Jan 22 09:34:38 ..... Finished successfully
65_S2ChIP.fastq
Jan 22 09:34:39 ..... Started STAR run
Jan 22 09:34:39 ..... Loading genome
Jan 22 09:34:39 ..... Started mapping
Jan 22 09:35:57 ..... Started sorting BAM
Jan 22 09:36:05 ..... Finished successfully
80_S2ChIP_2.fastq
Jan 22 09:36:05 ..... Started STAR run
Jan 22 09:36:05 ..... Loading genome
Jan 22 09:36:06 ..... Started mapping
Jan 22 09:37:07 ..... Started sorting BAM
Jan 22 09:37:15 ..... Finished successfully
80_S2ChIP.fastq
Jan 22 09:37:15 ..... Started STAR run
Jan 22 09:37:15 ..... Loading genome
Jan 22 09:37:15 ..... Started m

In [20]:
# Index alignment files.
for filename in os.listdir(out_bam):
    
    if filename.endswith('.Aligned.sortedByCoord.out.bam'):
        filepath = os.path.join(out_bam, filename)
        !samtools index $filepath

### 4 - Compute raw and tpm-normalized gene count tables.

In [7]:
# Set GeneExpressionTable runtime parameters.
def gxt_command():
    """Build the command line that runs the GeneExpressionTableChIP script.

    Returns
    -------
    str
        Command string that passes the notebook-level `out_bam` folder as ``-d``,
        the annotation file as ``-g``, the notebook-level `xp_data` folder as
        ``-o`` and the literal ``chip_`` as ``-x``.
    """
    script = '/data/parastou/RNAdeg/pyRNAdeg/GeneExpressionTableChIP.py'
    annotation_csv = '/data/parastou/RNAdeg/annotation/schizosaccharomyces_pombe.chr.extended.csv'

    # NOTE(review): '-x chip_' is presumably an output file-name prefix; confirm in the script.
    arguments = [
        'python', script,
        '-d', out_bam,
        '-g', annotation_csv,
        '-o', xp_data,
        '-x', 'chip_',
    ]

    return ' '.join(arguments)

In [8]:
# Run GeneExpressionTable
gxt = gxt_command()
!$gxt

Call to GeneExpressionTable module....

This module calculates gene expression in given alignment file(s).
Input: folder containing .bam file(s)
Output: raw and tpm gene counts data (.csv) files
----------------------------------------
Input bam: /data/parastou/RNAdeg/results/RipChip/bams/63_S2ChIPp.ztr.Aligned.sortedByCoord.out.bam
Total number of spliced alignments : 0
Input bam: /data/parastou/RNAdeg/results/RipChip/bams/63_S2ChIP_2.ztr.Aligned.sortedByCoord.out.bam
Total number of spliced alignments : 0
Input bam: /data/parastou/RNAdeg/results/RipChip/bams/65_S2ChIP.Aligned.sortedByCoord.out.bam
Total number of spliced alignments : 0
Input bam: /data/parastou/RNAdeg/results/RipChip/bams/80_S2ChIP_2.Aligned.sortedByCoord.out.bam
Total number of spliced alignments : 0
Input bam: /data/parastou/RNAdeg/results/RipChip/bams/80_S2ChIP.Aligned.sortedByCoord.out.bam
Total number of spliced alignments : 0
Input bam: /data/parastou/RNAdeg/results/RipChip/bams/80_S2Ph_ChIP.Aligned.sortedByCoo

----------------