In [1]:
import pandas as pd

In [2]:
import os

In [3]:
import sys

In [4]:
# Turn off pandas' chained-assignment check, so no SettingWithCopyWarning is emitted.
pd.options.mode.chained_assignment = None

---------------------------

In [5]:
# Root of the Python installation running this kernel, and its 'bin' directory.
# NOTE(review): sys.base_prefix differs from sys.prefix inside a virtual environment —
# confirm this is the root the tools are actually installed under.
env = sys.base_prefix
env_bin = os.path.join(env, 'bin')

In [6]:
# Interpreter running this kernel; used further down to launch the gene expression
# script with the same Python instead of whatever `python` is first on PATH.
python = sys.executable

In [7]:
# Expected location of the bioawk executable inside env_bin
# (referenced by the optional read-filtering step).
bioawk = os.path.join(env_bin, "bioawk")
#bioawk

---------------------------

In [8]:
# Root directory holding the project's data, genome index and annotation files.
# Machine-specific absolute path: switch to the commented line when running from /data.
#project_data_dir = '/data/pablo/RNAdeg' # algbio /data
project_data_dir = '/gcm-lfs1/pablo/data/RNAdeg' # algbio /gcm-lfs1

In [9]:
# Checkout of the project code; the pyRNAdeg scripts are looked up under this directory.
project_dir = '/home/pmonteagudo/workspace/RNAdeg'

In [10]:
# Name of the data batch; selects the sub-directory of <project_data_dir>/data to process.
data_batch = 'ChIP'

---------------------------

In [11]:
# Input directory of this batch: <project_data_dir>/data/<data_batch>.
# The commented lines are earlier locations, kept for reference.
#source_dir = '/data/parastou/RNAdeg/data/AllChIP/'
#source_dir = os.path.join(project_data_dir, 'data/ChIP')
#source_dir = os.path.join(project_data_dir, 'data/sequencing_new/ChIP')
source_dir = os.path.join(project_data_dir, 'data', data_batch)

In [12]:
# Output directory of this batch (BAM files and expression tables are written below it).
# NOTE(review): this resolves to the same path as source_dir (under 'data'), whereas the
# commented alternatives point to a 'results' tree — confirm this is intended.
#out_dir = '/data/parastou/RNAdeg/results/RipChip/'
#out_dir = os.path.join(project_data_dir, 'results/ChIP')
#out_dir = os.path.join(project_data_dir, 'results/sequencing_new/ChIP')
out_dir = os.path.join(project_data_dir, 'data', data_batch)

--------------

## Workflow

- Unzip fastq files.
- Remove zero-length reads from fastq files (optional).
- Run STAR aligner.
- Index .bam files.
- Calculate and save gene counts table for all given samples from .bam files.
- TPM-normalize gene counts table and save.

---------

### Samples to exclude from the analysis

In [13]:
# File-name prefixes of the samples to skip. The value is passed to str.startswith(),
# which accepts a str or a tuple of str (not a list); an empty tuple excludes nothing.
ignore_files = () ## need to be str or a tuple of str, not list.

---------

### __1__ - Prepare output folders

In [14]:
# raw_dir holds the compressed input files; fastq_dir receives the decompressed read files.
raw_dir = os.path.join(source_dir, 'raw_data')
fastq_dir = os.path.join(source_dir, 'fastq')

In [15]:
# Create the fastq directory (and any missing parents); does nothing when it already exists.
# Done in Python rather than via `!mkdir -p` so the path never goes through the shell.
os.makedirs(fastq_dir, exist_ok=True)

In [16]:
# out_bam: parent directory of the per-sample alignment folders.
# xp_data: directory handed to the gene expression table script as its output location.
out_bam = os.path.join(out_dir, 'bams')
xp_data = os.path.join(out_dir, 'xp_data')

In [17]:
# Create the BAM output directory (and any missing parents); does nothing when it already exists.
# Done in Python rather than via `!mkdir -p` so the path never goes through the shell.
os.makedirs(out_bam, exist_ok=True)

In [18]:
# Create the expression-data directory (and any missing parents); does nothing when it already exists.
# Done in Python rather than via `!mkdir -p` so the path never goes through the shell.
os.makedirs(xp_data, exist_ok=True)

-----------------------

### __2__ - Unzip fastq files.

Decompress the `.bz2` archives found in the raw_data folder and move the resulting .fastq files to the fastq folder. Archives whose decompressed file is already present in the fastq folder are skipped.

In [19]:
#os.listdir(raw_dir)

In [20]:
for filename in os.listdir(raw_dir):
    
    #if filename.endswith('.bz2'):
    if filename.endswith('.bz2') and not filename.startswith(ignore_files):
        
        ## compressed file
        raw_file = os.path.join(raw_dir, filename)
        
        ## final .fastq file in /fastq Directory
        fastq_file = os.path.join(fastq_dir, os.path.splitext(filename)[0])

        ## check if it's has been uncompressed already
        if not os.path.isfile(os.path.join(fastq_dir, fastq_file)):

            print('Decompressing file (may take a while) ...\n')
            !bzip2 -kdv $raw_file
            print('\nDone.\n')

            fastq_file = os.path.splitext(raw_file)[0]
            ## move uncompressed .fastq file to /fastq dir
            !mv $fastq_file $fastq_dir

        else:
            pass
            

### Optional step: remove zero-length reads from fastq files.

In [21]:
# Optional step, disabled by default. For every selected .fastq file it keeps the reads
# whose sequence is not empty, appends them to <name>.ztr.fastq and then DELETES the
# original file. Only enable it when the compressed originals are still available.
#for filename in os.listdir(fastq_dir):
#    #if filename.endswith('.fastq'):
#    if filename.endswith('.fastq') and not filename.startswith(ignore_files):
#        
#        filepath = os.path.join(fastq_dir, filename)
#        outfilepath = os.path.join(fastq_dir, filename.split('.fastq')[0] + '.ztr.fastq')
#        print(filepath, '...')
#        print(outfilepath, '...\n')
#
#        !echo $outfilepath
#        #!bioawk -cfastx 'length($seq) > 0 {print "@"$name"\n"$seq"\n+\n"$qual} $filepath  >> $outfilepath'
#        ## the awk program must be closed with its quote BEFORE the input file and the redirect
#        !$bioawk -cfastx 'length($seq) > 0 {print "@"$name"\n"$seq"\n+\n"$qual}' $filepath  >> $outfilepath
#        ## careful: this deletes the input file!
#        !rm $filepath

### __3__ - Align fastq files

In [22]:
# Set STAR runtime parameters.
def star_command(filename, n_threads=4):
    """Build the STAR command line that aligns one read file.

    As a side effect the sample's output directory ``<out_bam>/<sample>`` is
    created when missing; STAR output files get the prefix
    ``<out_bam>/<sample>/<sample>.``.

    Parameters
    ----------
    filename : str
        Name of a read file inside ``fastq_dir``. Its extension is dropped to
        obtain the sample name.
    n_threads : int, optional
        Value passed to ``--runThreadN`` (default 4).

    Returns
    -------
    str
        The complete command as one string, meant to be run through a shell
        (e.g. ``!$command``).
    """
    import shlex  # local import: only needed here, to quote paths for the shell

    #star = '/home/parastou/star_2.5.1b'
    #star = '/home/pmonteagudo/Software/STAR/bin/Linux_x86_64/STAR' ## STAR_2.6.0a
    star = 'STAR' ## STAR_2.5.4b

    #genome_dir = '/data/parastou/RNAdeg/genomes/spombe/star_nogtf_idx/'
    genome_dir = os.path.join(project_data_dir, 'genomes/spombe/star_nogtf_idx')

    ## reads to align
    fastq_file = os.path.join(fastq_dir, filename)

    ## create bam file inside individual folder for each sample
    sample_prefix = os.path.splitext(filename)[0]
    bam_dir = os.path.join(out_bam, sample_prefix)
    os.makedirs(bam_dir, exist_ok=True)
    bam_prefix = os.path.join(bam_dir, sample_prefix + '.')

    ## Paths are shell-quoted so that a name containing spaces or shell
    ## metacharacters stays a single argument; ordinary paths are left unchanged.
    ## different with RNAseq (no splicing): --alignIntronMax 1 --alignEndsType EndToEnd
    command = ' '.join([
        star,
        '--runThreadN', str(n_threads),
        '--genomeDir', shlex.quote(genome_dir),
        '--readFilesIn', shlex.quote(fastq_file),
        '--outFileNamePrefix', shlex.quote(bam_prefix),
        '--outSAMtype BAM SortedByCoordinate --alignIntronMax 1 --alignEndsType EndToEnd',
    ])

    return command

- __Run STAR__


In [23]:
# os.listdir() returns entries in arbitrary order; sort them so the samples are
# processed (and numbered in the progress messages) in a reproducible order.
list_fastq = sorted(os.listdir(fastq_dir))
# Uncomment to try the alignment on a single file first.
#list_fastq = [list_fastq[0]]

In [24]:
i=1
for filename in list_fastq:
    
    #if filename.endswith(('.fastq', '.fastqsanger')):
    if filename.endswith(('.fastq', '.fastqsanger')) and not filename.startswith(ignore_files):
        
        print("\n({}/{}). Aligning (reads) .fastq files: {} ... ".format(i, len(list_fastq), filename), '\n')
        
        command = star_command(filename)
        print(command)
        !$command
        print("Done.\n")
        
        #import pdb
        #pdb.set_trace()
        
    i+=1


(1/30). Aligning (reads) .fastq files: 1168_S2ChIP.fastq ...  

STAR --runThreadN 4 --genomeDir /gcm-lfs1/pablo/data/RNAdeg/genomes/spombe/star_nogtf_idx --readFilesIn /gcm-lfs1/pablo/data/RNAdeg/data/ChIP/fastq/1168_S2ChIP.fastq --outFileNamePrefix /gcm-lfs1/pablo/data/RNAdeg/data/ChIP/bams/1168_S2ChIP/1168_S2ChIP. --outSAMtype BAM SortedByCoordinate --alignIntronMax 1 --alignEndsType EndToEnd
Dec 05 15:46:29 ..... started STAR run
Dec 05 15:46:29 ..... loading genome
Dec 05 15:46:30 ..... started mapping
Dec 05 15:47:15 ..... started sorting BAM
Dec 05 15:47:18 ..... finished successfully
Done.


(2/30). Aligning (reads) .fastq files: 301_S2_ChIP.fastq ...  

STAR --runThreadN 4 --genomeDir /gcm-lfs1/pablo/data/RNAdeg/genomes/spombe/star_nogtf_idx --readFilesIn /gcm-lfs1/pablo/data/RNAdeg/data/ChIP/fastq/301_S2_ChIP.fastq --outFileNamePrefix /gcm-lfs1/pablo/data/RNAdeg/data/ChIP/bams/301_S2_ChIP/301_S2_ChIP. --outSAMtype BAM SortedByCoordinate --alignIntronMax 1 --alignEndsType EndT

- __Index alignment files__

In [25]:
## Each sample's alignment files live in their own sub-directory of out_bam, so the whole
## tree is walked and only the coordinate-sorted BAM of every sample is kept (full path).
list_bams = [
    os.path.join(folder, bam_name)
    for folder, _subdirs, bam_names in os.walk(out_bam, topdown=True)
    for bam_name in bam_names
    if bam_name.endswith('.Aligned.sortedByCoord.out.bam') and not bam_name.startswith(ignore_files)
]

In [26]:
#list_bams

In [27]:
i=1
for filename in list_bams:
    
    #if filename.endswith('.Aligned.sortedByCoord.out.bam'):
    #if filename.endswith('.Aligned.sortedByCoord.out.bam') and not filename.startswith(ignore_files):

    filepath = os.path.join(out_bam, filename)
    print("\n({}/{}). Indexing (alignned reads) .bam files ...".format(i, len(list_bams)))
    print(" {}".format(filepath))
    
    !samtools index $filepath
    print(" Done.\n")
    
    i+=1


(1/27). Indexing (alignned reads) .bam files ...
 /gcm-lfs1/pablo/data/RNAdeg/data/ChIP/bams/1168_S2ChIP/1168_S2ChIP.Aligned.sortedByCoord.out.bam
 Done.


(2/27). Indexing (alignned reads) .bam files ...
 /gcm-lfs1/pablo/data/RNAdeg/data/ChIP/bams/301_S2_ChIP/301_S2_ChIP.Aligned.sortedByCoord.out.bam
 Done.


(3/27). Indexing (alignned reads) .bam files ...
 /gcm-lfs1/pablo/data/RNAdeg/data/ChIP/bams/301_S2ChIP/301_S2ChIP.Aligned.sortedByCoord.out.bam
 Done.


(4/27). Indexing (alignned reads) .bam files ...
 /gcm-lfs1/pablo/data/RNAdeg/data/ChIP/bams/302_S2_ChIP/302_S2_ChIP.Aligned.sortedByCoord.out.bam
 Done.


(5/27). Indexing (alignned reads) .bam files ...
 /gcm-lfs1/pablo/data/RNAdeg/data/ChIP/bams/302_S2ChIP/302_S2ChIP.Aligned.sortedByCoord.out.bam
 Done.


(6/27). Indexing (alignned reads) .bam files ...
 /gcm-lfs1/pablo/data/RNAdeg/data/ChIP/bams/324_S2_ChIP/324_S2_ChIP.Aligned.sortedByCoord.out.bam
 Done.


(7/27). Indexing (alignned reads) .bam files ...
 /gcm-lfs1/pablo/d

### __4__ - Compute raw and tpm-normalized gene count tables.

In [28]:
# Set GeneExpressionTable runtime parameters.
def gxt_command():
    """Assemble the command line that runs the gene expression table script.

    The script ``pyRNAdeg/gene_expression_table.py`` (under ``project_dir``) is
    launched with the interpreter stored in ``python`` and receives ``out_bam``
    via ``-d``, the annotation CSV via ``-g``, ``xp_data`` via ``-o`` and the
    literal ``chip_`` via ``-x``.

    Returns
    -------
    str
        The command as a single space-separated string.
    """
    ## gene expression table script (same script as the former 'GeneExpressionTableChIP.py')
    script_path = os.path.join(project_dir, 'pyRNAdeg/gene_expression_table.py')

    ## annotation table handed to the script
    annotation_csv = os.path.join(project_data_dir, 'annotation/schizosaccharomyces_pombe.chr.extended.csv')

    ## `python` is used instead of a bare 'python' so the script runs with the
    ## current kernel/env rather than the default executable
    argv = [python, script_path, '-d', out_bam, '-g', annotation_csv, '-o', xp_data, '-x', 'chip_']

    return ' '.join(argv)

In [29]:
# Run GeneExpressionTable - Problem: logging is not output
#gxt = gxt_command()
#%run $gxt

In [30]:
# Run GeneExpressionTable 
# The command string is built once and executed through the shell (IPython `!`),
# so whatever the script prints appears below this cell.
gxt = gxt_command()
!$gxt

Call to GeneExpressionTable module....

This module calculates gene expression in given alignment file(s).
Input: folder containing .bam file(s)
Output: raw and tpm gene counts data (.csv) files
----------------------------------------
Input bam: /gcm-lfs1/pablo/data/RNAdeg/data/ChIP/bams/1168_S2ChIP/1168_S2ChIP.Aligned.sortedByCoord.out.bam
Total number of alignments : 2785741
Spliced alignments : 0
Non spliced : 2785741
Input bam: /gcm-lfs1/pablo/data/RNAdeg/data/ChIP/bams/301_S2_ChIP/301_S2_ChIP.Aligned.sortedByCoord.out.bam
Total number of alignments : 5250369
Spliced alignments : 0
Non spliced : 5250369
Input bam: /gcm-lfs1/pablo/data/RNAdeg/data/ChIP/bams/301_S2ChIP/301_S2ChIP.Aligned.sortedByCoord.out.bam
Total number of alignments : 5413008
Spliced alignments : 0
Non spliced : 5413008
Input bam: /gcm-lfs1/pablo/data/RNAdeg/data/ChIP/bams/302_S2_ChIP/302_S2_ChIP.Aligned.sortedByCoord.out.bam
Total number of alignments : 5656101
Spliced alignments : 0
Non spliced : 5656101
Input 

----------------