In [1]:
import pandas as pd

In [2]:
import os

In [3]:
from functools import reduce

In [4]:
# Disable pandas' chained-assignment (SettingWithCopy) warning for this session.
pd.options.mode.chained_assignment = None

---------------------------

In [23]:
# Input directory. Keep the trailing '/': later cells build file paths
# with plain string concatenation (`source_dir + '<file name>'`).
source_dir = '/data/parastou/RNAdeg/data/AllRNA/'
#source_dir = '/data/RNAdeg/fastq/RNA/'

In [24]:
# Root directory for all results; the per-step output folders are created below it.
#out_dir = '/data/parastou/RNAdeg/results/RipRna/'
out_dir = '/data/pablo/RNAdeg/results/RipRna/'

--------------

## Workflow

- Unzip fastq files.
- Truncate 0-length reads from fastq files.
- Run STAR aligner.
- Index .bam files.
- Remove rRNA reads from .bam and add 'GE' tag to reads which map to genes (RemrRNA).
- Calculate and save gene counts table for all given samples from .bam files.
- TPM-normalize gene counts table and save.

---------

In [25]:
def process_sample_names(names, col_name=None):
    """Build a table of sample file names together with their prefix.

    Parameters
    ----------
    names : list-like of str or pandas.DataFrame
        The file names, either as a plain sequence or as a dataframe that
        holds them in column `col_name`.
    col_name : str, optional
        Name of the column holding the file names. When omitted, the only
        column of a single-column dataframe is used; in every other case
        it defaults to "sample_name".

    Returns
    -------
    pandas.DataFrame
        Rows sorted by `col_name`, with an extra "prefix" column (the text
        before the first "." of each name). Every row whose prefix occurs
        more than once is dropped, and a warning is printed when that happens.
    """

    ## select col_name
    if col_name is None:
        ## a single-column dataframe is unambiguous: use that column.
        ## (check the number of columns `shape[1]`, not the number of rows)
        if isinstance(names, pd.DataFrame) and names.shape[1] == 1:
            col_name = names.columns[0]
        else:
            col_name = "sample_name"

    ## get dataframe
    if isinstance(names, pd.DataFrame):
        ## already a dataframe
        df = names
    else:
        ## create dataframe from list
        df = pd.DataFrame(data=names, columns=[col_name])

    ## sort dataframe by col_name (returns a new frame, the input is not modified)
    df = df.sort_values(by=[col_name]).reset_index(drop=True)

    ## get sample's prefix
    df["prefix"] = df[col_name].map(lambda x: x.split(".")[0])

    ## check for "duplicated" files (share same prefix) within the same directory.
    ## one should be careful and wonder why is that?
    n_samples = df.shape[0]
    df = df[~df.duplicated(subset="prefix", keep=False)] ## remove entries with duplicated prefix

    if n_samples != df.shape[0]:
        print("\n{}".format("-"*99))
        print(" Warning! Duplicated files (share same prefix) within the same directory. \n Will be ignored for now!")
        print("{}\n".format("-"*99))

    return df

## 0 - Investigate Samples

---------

### Import Data

A. __Load `valid_samples.txt` file__ 

In [27]:
# Read the list of valid samples (one name per line, no header) and derive each prefix.
valid_samples = process_sample_names(
    pd.read_csv(source_dir + 'valid_samples.txt', header=None, names=["valid_sample"]),
    col_name="valid_sample",
)
print("Number of valid samples (as given by Parastou): {}".format(len(valid_samples)))

Number of valid samples (as given by Parastou): 54


In [33]:
#valid_samples

B. __Load `remote_RNA_file_names.txt` file__ 

In [34]:
# Remote folder named in the report below.
remote_dir = "/data/cryohalic01/home/ag_halic/share/Conny/fastq_for_Stefan_RNAdeg"
# Read the list of remote file names (one per line, no header) and derive each prefix.
remote_samples = process_sample_names(
    pd.read_csv(source_dir + 'remote_RNA_file_names.txt', header=None, names=["remote_sample"]),
    col_name="remote_sample",
)
print("Number of remote samples (as present in {}): {}".format(remote_dir, len(remote_samples)))

Number of remote samples (as present in /data/cryohalic01/home/ag_halic/share/Conny/fastq_for_Stefan_RNAdeg): 32


In [35]:
#remote_samples

C. __Check in `source_dir` for sample files ('bz2', '.fastq', '.fastqsanger')__

In [36]:
# Suffixes that identify a sample file in `source_dir`.
file_formats = ('bz2', '.fastq', '.fastqsanger')
# Keep only the directory entries whose name ends with one of those suffixes.
source_samples_names = list(
    filter(lambda entry: entry.endswith(file_formats), os.listdir(source_dir))
)
#source_samples_names = pd.read_csv(source_dir + 'sample_names.txt', header=None, names = ["source_sample"])
source_samples = process_sample_names(source_samples_names, col_name="source_sample")
print("Number of sample files in `source_dir` ({}): {}".format(source_dir, len(source_samples)))

Number of sample files in `source_dir` (/data/parastou/RNAdeg/data/AllRNA/): 66


In [37]:
#source_samples

- If the check below fails, there are __"duplicated" files__ (sharing the same prefix) within the same directory. One should be careful and ask __why that is__.

In [38]:
# No file may have been dropped as a duplicate: rows kept == files listed.
assert source_samples.shape[0] == len(source_samples_names)

In [39]:
# Files listed in `source_dir` that did not survive the duplicate-prefix filter.
duplicated_files = sorted(set(source_samples_names) - set(source_samples.source_sample))
duplicated_files

[]

D. __Merge (Samples) DataFrames: [`source_dir`, `valid_samples.txt`, `remote_RNA_file_names.txt`]__

In [45]:
# Dataframes to merge, in merge order (left to right).
# Each one comes from `process_sample_names`, so each carries a "prefix" column.
data_frames = [valid_samples, remote_samples, source_samples]

In [46]:
# Successively outer-join all dataframes on "prefix", so a prefix found in any list is kept.
df_merged = reduce(
    lambda left, right: left.merge(right, on=['prefix'], how='outer'),
    data_frames,
)

In [47]:
# Sort by prefix, put the columns in a fixed order, then use "prefix" as the index.
df_merged = (
    df_merged
    .sort_values("prefix")
    .reset_index(drop=True)
    .loc[:, ["prefix", "valid_sample", "remote_sample", "source_sample"]]
    .set_index("prefix")
)

In [48]:
df_merged.shape[0]

67

In [49]:
#df_merged

---------

### __Compare  `source_dir` vs `valid_samples.txt`__

In [57]:
# Restrict the merged table to the two columns being compared.
select_cols = ["valid_sample", "source_sample"]
source_vs_valid_samples = df_merged.loc[:, select_cols]

These are samples that are either __not valid__ or __missing__

In [51]:
# Keep rows where exactly one of the two columns is missing (XOR of the NaN masks).
source_vs_valid_samples = source_vs_valid_samples.loc[
    source_vs_valid_samples["valid_sample"].isna()
    ^ source_vs_valid_samples["source_sample"].isna()
]
len(source_vs_valid_samples)

14

- __Not valid__ samples (present in `source_dir` but not in `valid_samples.txt`)

In [52]:
# Rows with no entry in the `valid_sample` column.
not_valid_samples = source_vs_valid_samples.loc[source_vs_valid_samples["valid_sample"].isna()]
print("Number of NOT valid samples:", not_valid_samples.shape[0])

Number of NOT valid samples: 13


In [56]:
#not_valid_samples

- __Missing__ samples (present in `valid_samples.txt` but not in `source_dir`): Ideally this should be 0

In [54]:
# Rows with no entry in the `source_sample` column.
missing_samples = source_vs_valid_samples.loc[source_vs_valid_samples["source_sample"].isna()]
print("Number of missing samples (e.g. valid but are not in the `source_dir`:", missing_samples.shape[0])

Number of missing samples (e.g. valid but are not in the `source_dir`: 1


- We see that some of the `valid_sample`s (7) seem to be missing an "A" at the end.
- (1) `valid_sample` "302_S2RIP_2" is also missing, but this is the sample that we ignore because it raises an error.

In [55]:
missing_samples

Unnamed: 0_level_0,valid_sample,source_sample
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1
302_S2RIP_2,302_S2RIP_2,


### __Compare  `source_dir` vs `remote_RNA_file_names.txt`__

In [58]:
# Restrict the merged table to the two columns being compared.
select_cols = ["source_sample", "remote_sample"]
source_vs_remote_samples = df_merged.loc[:, select_cols]

These are samples that are either __missing from source_dir__ or __missing from remote_dir__

In [59]:
# Keep rows where exactly one of the two columns is missing (XOR of the NaN masks).
source_vs_remote_samples = source_vs_remote_samples.loc[
    source_vs_remote_samples["source_sample"].isna()
    ^ source_vs_remote_samples["remote_sample"].isna()
]
len(source_vs_remote_samples)

36

In [61]:
#source_vs_remote_samples

- __Missing from remote_dir__ samples (present in `source_dir` but not in `remote_RNA_file_names.txt`)

In [63]:
# Rows with no `remote_sample` entry, i.e. missing from the remote listing.
missing_from_remote = source_vs_remote_samples[source_vs_remote_samples.remote_sample.isna()]
# The label names the remote listing, since that is the column being tested above.
print("Number of samples missing from the remote_dir:", len(missing_from_remote))

Number of samples missing in the source_dir: 35


In [71]:
missing_from_remote.index.tolist()

['1167_S5RIP_2',
 '283_RNA_pA_4',
 '301_RNA_pA_3',
 '301_S2RIP_3',
 '302_S2RIP_3',
 '324S2RIP_1',
 '324_RNA_pA_3',
 '324_S2RIP_3',
 '491S2RIP_1',
 '491_S2RIP_3',
 '504S2RIP_1',
 '504S2RIP_2',
 '504_RNA_pA_1',
 '504_RNA_pA_2',
 '530S2RIP_1',
 '530S2RIP_2',
 '530_RNA_pA_1',
 '530_RNA_pA_2',
 '591_S5RIP_1',
 '63',
 '638S2RIP_1',
 '638S2RIP_2',
 '638_RNA_pA_1',
 '638_RNA_pA_2',
 '63_RIPS5P',
 '63_RNA_pA_3',
 '63_RNA_pA_4',
 '63_S2PRIP',
 '63_S2Ph_RIP',
 '63_S2RIP_2',
 '63_S5Ph_RIP',
 '65',
 '80S2RIP_1',
 '80S2RIP_2',
 '80pARNA_2']

- __Missing from source_dir__ samples (present in `remote_RNA_file_names.txt` but not in `source_dir`)

In [66]:
# Rows with no `source_sample` entry, i.e. listed remotely but missing from `source_dir`.
missing_from_source = source_vs_remote_samples[source_vs_remote_samples.source_sample.isna()]
# The label describes the remote-vs-source comparison made in this section.
print("Number of missing samples (i.e. in the remote list but not in the `source_dir`):", len(missing_from_source))

Number of missing samples (e.g. valid but are not in the `source_dir`: 1


- (1) `remote_sample` "302_S2RIP_2" is missing, but this is the sample that we ignore because it raises an error.

In [68]:
missing_from_source

Unnamed: 0_level_0,source_sample,remote_sample
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1
302_S2RIP_2,,302_S2RIP_2.fastq.bz2


---------

### Samples used in Analysis

__Errors__:
-  302_S2RIP_2.fastq
-  63_RNA_pA_3.fastq

__Ignore samples that are not valid, also ignore samples that give errors__

In [44]:
#ignore_files = set(["302_S2RIP_2", "63_RNA_pA_3"])
ignore_files = set(["302_S2RIP_2"]) ## we already removed the file from directory
## union of the not-valid prefixes and the hand-picked ones.
## (use an explicit set union: `Index | set` depends on pandas treating `|`
##  as a set operation, which is deprecated/removed in recent pandas)
ignore_files = set(not_valid_samples.index).union(ignore_files)

In [45]:
# Turn each prefix into "<prefix>." so that e.g. "63" only matches "63.fastq"
# and not "63_RNA_pA_3.fastq".
# - Stored as a tuple because `str.startswith` accepts a str or a tuple of str, not a list.
# - The trailing "." is only added when absent, so re-running this cell is harmless.
ignore_files = tuple(sorted(ff if ff.endswith(".") else ff + "." for ff in ignore_files))
print("Number of samples that will be ignored:", len(ignore_files))

Number of samples that will be ignored: 22


In [46]:
#ignore_files

__Samples used in Analysis are present in `source_dir` and `valid_samples.txt`__

In [47]:
# Prefixes that have both a file in `source_dir` and an entry in `valid_samples.txt`.
in_source_and_valid = df_merged.source_sample.notna() & df_merged.valid_sample.notna()
prefix_files = df_merged[in_source_and_valid].index.tolist()
print("Total number of samples that will be analyzed:", len(prefix_files))

Total number of samples that will be analyzed: 46


In [48]:
#prefix_files

---------

### 1 - Prepare output folders

In [12]:
# Output sub-folders, all located under `out_dir`.
out_bam = os.path.join(out_dir, 'bams')  # written by STAR, read by samtools index and RemrRNA
out_tagged = os.path.join(out_dir, 'tagged_bams')  # RemrRNA output, input of the gene expression table
xp_data = os.path.join(out_dir, 'xp_data')  # output folder passed to the gene expression table script

In [13]:
# Create the folder and any missing parents; does nothing if it already exists.
# Done in Python rather than via `!mkdir -p` so paths with spaces are handled correctly.
os.makedirs(out_bam, exist_ok=True)

In [14]:
# Create the folder and any missing parents; does nothing if it already exists.
# Done in Python rather than via `!mkdir -p` so paths with spaces are handled correctly.
os.makedirs(out_tagged, exist_ok=True)

In [15]:
# Create the folder and any missing parents; does nothing if it already exists.
# Done in Python rather than via `!mkdir -p` so paths with spaces are handled correctly.
os.makedirs(xp_data, exist_ok=True)

-----------------------

### 2 - Unzip fastq files.

In [14]:
for filename in os.listdir(source_dir):
    if filename.endswith('bz2'):
        cat('Decompressing file ...\n', filename)
        filepath = os.path.join(source_dir, filename)
        !bzip2 -d $filepath

### Optional step: remove truncated reads from fastq files.

In [15]:
# for filename in os.listdir(source_dir):
#     if filename.endswith('.fastq'):
        
#         filepath = os.path.join(source_dir, filename)
#         outfilepath = os.path.join(source_dir, filename.split('.fastq')[0] + '.ztr.fastq')
#         !echo $outfilepath
#         !bioawk -cfastx 'length($seq) > 0 {print "@"$name"\n"$seq"\n+\n"$qual} $filepath  >> $outfilepath'
#         #!rm $filepath

### 3 - Align fastq files

In [16]:
# Set STAR runtime parameters.
def star_command(in_file):
    """Build the STAR command line that aligns one fastq file.

    Parameters
    ----------
    in_file : str
        File name (not a full path) of a fastq file inside `source_dir`.

    Returns
    -------
    str
        The shell command. Output files are written to `out_bam` with the
        part of `in_file` before the first 'fastq' as name prefix
        (e.g. '63.fastq' -> '<out_bam>/63.').
    """

    star = '/home/parastou/star_2.5.1b'
    n_threads = 20
    genome_dir = '/data/parastou/RNAdeg/genomes/spombe/staridx/'

    # Join with os.path.join so the path is correct whether or not
    # `source_dir` ends with a path separator.
    read_file = os.path.join(source_dir, in_file)
    out_prefix = os.path.join(out_bam, in_file.split('fastq')[0])

    command = ' '.join([
        star,
        '--runThreadN', str(n_threads),
        '--genomeDir', genome_dir,
        '--readFilesIn', read_file,
        '--outFileNamePrefix', out_prefix,
        '--outSAMtype BAM SortedByCoordinate',
    ])

    return command

__Errors__:
-  302_S2RIP_2.fastq:
    - EXITING because of fatal ERROR: not enough memory for BAM sorting: 
        - SOLUTION: re-run STAR with at least --limitBAMsortRAM 2821915860
-  63_RNA_pA_3.fastq
    - EXITING because of FATAL ERROR in reads input: short read sequence line: 1 
        - Read Name=@L183:414:CCGY1ANXX:4:1311:17580:12058:1:N:0:0:CACGATAT
        - Read Sequence====
        - DEF_readNameLengthMax=50000
        - DEF_readSeqLengthMax=500


In [18]:
# Run STAR
for filename in os.listdir(source_dir):
    
    #if filename.endswith(('.fastq', '.fastqsanger')):
    if filename.endswith(('.fastq', '.fastqsanger')) and not filename.startswith(ignore_files):
   
        print(filename)
        command = star_command(filename)
        print(command)
        !$command

1167_pA_2.fastq
/home/parastou/star_2.5.1b --runThreadN 20 --genomeDir /data/parastou/RNAdeg/genomes/spombe/staridx/ --readFilesIn /data/parastou/RNAdeg/data/AllRNA/1167_pA_2.fastq --outFileNamePrefix /data/pablo/RNAdeg/results/RipRna/bams/1167_pA_2. --outSAMtype BAM SortedByCoordinate
Nov 20 11:31:14 ..... Started STAR run
Nov 20 11:31:14 ..... Loading genome
Nov 20 11:31:25 ..... Started mapping
Nov 20 11:32:27 ..... Started sorting BAM
Nov 20 11:32:41 ..... Finished successfully
63.fastq
/home/parastou/star_2.5.1b --runThreadN 20 --genomeDir /data/parastou/RNAdeg/genomes/spombe/staridx/ --readFilesIn /data/parastou/RNAdeg/data/AllRNA/63.fastq --outFileNamePrefix /data/pablo/RNAdeg/results/RipRna/bams/63. --outSAMtype BAM SortedByCoordinate
Nov 20 11:32:42 ..... Started STAR run
Nov 20 11:32:42 ..... Loading genome
Nov 20 11:32:43 ..... Started mapping
Nov 20 11:33:33 ..... Started sorting BAM
Nov 20 11:33:49 ..... Finished successfully
65.fastq
/home/parastou/star_2.5.1b --runThread

__Errors__:
-  302_S2RIP_2.fastq:
    - samtools index: "/data/pablo/RNAdeg/results/RipRna/bams/302_S2RIP_2.Aligned.sortedByCoord.out.bam" is in a format that cannot be usefully indexed

-  63_RNA_pA_3.fastq
    - samtools index: "/data/pablo/RNAdeg/results/RipRna/bams/63_RNA_pA_3.Aligned.sortedByCoord.out.bam" is in a format that cannot be usefully indexed

In [19]:
# Index alignment files.
for filename in os.listdir(out_bam):
 
    #if filename.endswith('.Aligned.sortedByCoord.out.bam'):
    if filename.endswith('.Aligned.sortedByCoord.out.bam') and not filename.startswith(ignore_files):

        filepath = os.path.join(out_bam, filename)
        !samtools index $filepath

samtools index: "/data/pablo/RNAdeg/results/RipRna/bams/302_S2RIP_2.Aligned.sortedByCoord.out.bam" is in a format that cannot be usefully indexed
samtools index: "/data/pablo/RNAdeg/results/RipRna/bams/63_RNA_pA_3.Aligned.sortedByCoord.out.bam" is in a format that cannot be usefully indexed


### 4 - Tag bam files with gene IDs

In [16]:
# Set RemrRNA command runtime parameters.
def remrrna_command():
    """Return the shell command that runs RemrRNA.py on `out_bam`.

    The command passes `out_bam` as `-d`, the annotation table as `-g`
    and `out_tagged` as `-o`.
    """
    script_path = '/data/parastou/RNAdeg/pyRNAdeg/RemrRNA.py'
    annotation_csv = '/data/parastou/RNAdeg/annotation/schizosaccharomyces_pombe.chr.extended.csv'

    return '{} -d {} -g {} -o {}'.format(script_path, out_bam, annotation_csv, out_tagged)

In [17]:
# Run RemrRNA
remrrna = remrrna_command()
!$remrrna

INFO:__main__:Call to RemrRNA module.
Call to RemrRNA module.
INFO:__main__:

INFO:__main__:This module removes ribosomal-RNA related reads from given .bam files
This module removes ribosomal-RNA related reads from given .bam files
INFO:__main__:It produces a gene-tagged .bam file as output.
It produces a gene-tagged .bam file as output.
INFO:__main__:----------------------------------------
----------------------------------------
INFO:__main__:Filtering ribosomal-RNA reads and tagging started.
Filtering ribosomal-RNA reads and tagging started.
INFO:__main__:Source directory:	/data/pablo/RNAdeg/results/RipRna/bams
Source directory:	/data/pablo/RNAdeg/results/RipRna/bams
INFO:__main__:--------------------------------------------------
--------------------------------------------------
INFO:__main__:Sample name:	1167_pA_2.Aligned.sortedByCoord.out.bam
Sample name:	1167_pA_2.Aligned.sortedByCoord.out.bam
INFO:__main__:-------------------------
-------------------------
INFO:__main__:Tota

### 5 - Compute raw and tpm-normalized gene count tables.

In [20]:
# Set GeneExpressionTable runtime parameters.
def gxt_command():
    """Return the shell command that runs the gene expression table script.

    The command passes `out_tagged` as `-d`, the annotation table as `-g`
    and `xp_data` as `-o`.
    """
    #script_path = '/data/parastou/RNAdeg/pyRNAdeg/GeneExpressionTable.py' ## doesn't exist
    ## not sure which one to use:
    script_path = '/data/parastou/RNAdeg/pyRNAdeg/gene_expression_table.py'
    #script_path = '/data/parastou/RNAdeg/pyRNAdeg/gene_expression_table_extended.py' ## strand-specific

    annotation_csv = '/data/parastou/RNAdeg/annotation/schizosaccharomyces_pombe.chr.extended.csv'

    parts = ['python', script_path, '-d', out_tagged, '-g', annotation_csv, '-o', xp_data]
    return ' '.join(parts)

In [21]:
# Run GeneExpressionTable
gxt = gxt_command()
!$gxt

Call to GeneExpressionTable module....

This module calculates gene expression in given alignment file(s).
Input: folder containing .bam file(s)
Output: raw and tpm gene counts data (.csv) files
----------------------------------------
Input bam: /data/pablo/RNAdeg/results/RipRna/tagged_bams/1113_pA.Aligned.sortedByCoord.out.tagged.bam
Total number of alignments : 29738993
Spliced alignments : 0
Non spliced : 29738993
Input bam: /data/pablo/RNAdeg/results/RipRna/tagged_bams/301_S2RIP_2.Aligned.sortedByCoord.out.tagged.bam
Total number of alignments : 3236804
Spliced alignments : 0
Non spliced : 3236804
Input bam: /data/pablo/RNAdeg/results/RipRna/tagged_bams/302_S2RIP.Aligned.sortedByCoord.out.tagged.bam
Total number of alignments : 4597762
Spliced alignments : 0
Non spliced : 4597762
Input bam: /data/pablo/RNAdeg/results/RipRna/tagged_bams/491_S2RIP_3.ztr.Aligned.sortedByCoord.out.tagged.bam
Total number of alignments : 3071513
Spliced alignments : 0
Non spliced : 3071513
Input bam: /

----------------