In [None]:
import pandas as pd
import os

# Configure the number of CPU cores to use
import multiprocessing
try:
    # Linux-only: counts the CPUs this process is actually allowed to run on, which can be
    # fewer than the machine total reported by multiprocessing.cpu_count()
    cpuThreads = len(os.sched_getaffinity(0))
except (AttributeError, OSError):
    # os.sched_getaffinity is missing on this platform (AttributeError) or the call failed;
    # fall back to the total number of CPUs in the machine
    cpuThreads = multiprocessing.cpu_count()

# 1. Create the hg38 RNAdb for tRAX

In this notebook, we will create the RNAdb for tRAX. We will use GRCh38.p14/hg38 as the reference genome with the following files.

In [None]:
gtRNAdb_URL="http://gtrnadb.ucsc.edu/GtRNAdb2/genomes/eukaryota/Hsapi38/hg38-tRNAs.tar.gz"
genome_URL="http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/p14/hg38.p14.fa.gz"

# Create the RNAdb directory first: checking its contents before it exists
# raises FileNotFoundError on a fresh checkout
os.makedirs("../rnadb", exist_ok=True)

genome_fasta = '../rnadb/hg38.p14.fa'
if not os.path.exists(genome_fasta):
    print("Downloading the genome and tRNAdb files")
    # Download and unpack the gtRNAdb files, then remove the archive
    trnadb_status = os.system(f"wget -q -O ../rnadb/tse.tar.gz {gtRNAdb_URL}")
    if trnadb_status == 0:
        trnadb_status = os.system("tar xzf ../rnadb/tse.tar.gz -C ../rnadb")
    os.system("rm ../rnadb/tse.tar.gz")
    if trnadb_status != 0:
        print(f"WARNING: fetching the gtRNAdb files returned non-zero status {trnadb_status}")
    # Download the genome under a temporary name and rename it only when the
    # download and decompression succeeded, so an interrupted download is never
    # mistaken for a complete genome by the existence check above
    genome_partial = genome_fasta + '.part'
    genome_status = os.system(f"wget -q -O - {genome_URL} | gzip -cd > {genome_partial}")
    if genome_status == 0 and os.path.getsize(genome_partial) > 0:
        os.replace(genome_partial, genome_fasta)
    else:
        if os.path.exists(genome_partial):
            os.remove(genome_partial)
        print(f"WARNING: genome download returned status {genome_status}; "
              f"{genome_fasta} was not created")
else:
    print("Genome and tRNAdb files already exist")

# The presence of this index file is used as the marker that the tRNAdb was already built
if not os.path.exists('../rnadb/hg38.p14-tRNAgenome.1.bt2l'):
    # Create the tRNAdb
    print("Creating the tRNAdb")
    # stdout and stderr both go to the log so that error messages are not lost
    maketrnadb_status = os.system(
        "../tRAX/maketrnadb.py"
        " --databasename='../rnadb/hg38.p14'"
        " --genomefile='../rnadb/hg38.p14.fa'"
        " --trnascanfile='../rnadb/hg38-tRNAs-detailed.out'"
        " --namemapfile='../rnadb/hg38-tRNAs_name_map.txt'"
        " > ../rnadb/maketrnadb.log 2>&1"
    )
    # os.system returns 0 only when the command succeeded; do not report "Done" otherwise
    if maketrnadb_status == 0:
        print("Done")
    else:
        print(f"WARNING: maketrnadb.py returned non-zero status {maketrnadb_status}; "
              "see ../rnadb/maketrnadb.log")
else:
    print("tRNAdb already exists")

# 2. Trim the raw fastq reads

The reads are trimmed and merged using `SeqPrep`, as described in the tRAX documentation. This is done by calling the `trimadapters.py` script in tRAX, with the raw reads located in `rnaseq/fastq_raw` and the trimmed reads written to `rnaseq/fastq_processed`.

In [None]:
# Load the tab-separated, header-less run file that is passed to trimadapters.py
df_run = pd.read_table('config/runfile.tsv', header=None)
# Label the three columns for display
df_run.columns = ['fastq_output', 'Read 1', 'Read 2']
df_run

In [None]:
# Run the trimadapters.py script unless the log of a previous run is already present
trim_log = 'rnaseq/trimadapter.log'
if os.path.exists(trim_log):
    print("trimadapter.log already exists")
    print("Done.")
else:
    print("Running trimadapters.py...")
    trim_status = os.system(
        "../tRAX/trimadapters.py"
        " --runname='trimming'"
        " --runfile='config/runfile.tsv'"
        f" --cores={cpuThreads}"
        f" > {trim_log} 2>&1"
    )
    # Collect the report files expected in the working directory; skip any that were not produced
    print("Moving log files to rnaseq directory...")
    for trim_report in ('trimindex.txt', 'trimming_log.txt', 'trimming_manifest.txt',
                        'trimming_sp.pdf', 'trimming_sp.txt'):
        if os.path.exists(trim_report):
            os.system(f"mv {trim_report} rnaseq/{trim_report}")
    if trim_status == 0:
        print("Done.")
    else:
        # The shell redirect creates the log even when the script fails. Rename it so the
        # existence check above does not treat a failed run as complete on the next execution.
        failed_log = 'rnaseq/trimadapter.failed.log'
        if os.path.exists(trim_log):
            os.replace(trim_log, failed_log)
        print(f"WARNING: trimadapters.py returned non-zero status {trim_status}; "
              f"see {failed_log} and re-run this cell after fixing the problem")

# 3. Run tRAX on the trimmed reads

In [None]:
# Make directory for tRAX output
os.makedirs('trax', exist_ok=True)

def run_trax(experimentname, samplefile, exppairs, gtffile, makehub=False):
    """Run tRAX's processsamples.py for one experiment and collect its output under trax/.

    The run is skipped when trax/<experimentname>.log already exists. Otherwise
    processsamples.py is executed from the current directory with stdout and
    stderr captured in <experimentname>.log. On success, the extra files expected
    in the current directory are moved into the <experimentname> directory, which
    then replaces trax/<experimentname>, and the log is moved to trax/. On failure
    nothing is moved or deleted, so previous results are kept and the next call
    retries the run.

    Parameters
    ----------
    experimentname : str
        Value for --experimentname; also the name of the output directory and log file.
    samplefile : str
        Path passed to --samplefile.
    exppairs : str
        Path passed to --exppairs.
    gtffile : str
        Path passed to --ensemblgtf.
    makehub : bool, optional
        When true, --makehub is added to the command line. Defaults to False.

    Returns
    -------
    None
    """
    # Keep the function usable even if the output directory was removed after the cell ran
    os.makedirs('trax', exist_ok=True)

    log_name = f'{experimentname}.log'
    if os.path.exists(os.path.join('trax', log_name)):
        print(f"{experimentname}.log already exists")
        print("Done.")
        return

    hub_flag = "--makehub" if makehub else ""
    command = (
        "../tRAX/processsamples.py"
        f" --experimentname='{experimentname}'"
        " --databasename='../rnadb/hg38.p14'"
        f" --ensemblgtf='{gtffile}'"
        f" --samplefile='{samplefile}'"
        f" --exppairs='{exppairs}'"
        f" --cores={cpuThreads}"
        f" {hub_flag}"
        " --lazyremap"
        f" > {log_name} 2>&1"
    )
    # Run the processsamples.py script
    print("Running processsamples.py...")
    status = os.system(command)
    if status != 0:
        # Leave everything in place: the log stays in the current directory for inspection,
        # earlier results in trax/ are not deleted, and the experiment is not marked complete
        print(f"WARNING: processsamples.py returned non-zero status {status}; see {log_name}")
        return

    # Move the extra files from the current directory into the experiment directory
    for extra_file in ('mismatchcompare.txt', 'positiondeletions.txt', 'positionmismatches.txt',
                       f'Rlog-{experimentname}.txt', 'Rplots.pdf'):
        if os.path.exists(extra_file):
            os.system(f"mv {extra_file} {experimentname}/{extra_file}")
    # Replace any previous results only when there is a new experiment directory to put in their place
    if os.path.isdir(experimentname):
        os.system(f"rm -rf trax/{experimentname}")
        os.system(f"mv {experimentname} trax/{experimentname}")
    # Move the log file to the trax directory; its presence there marks the run as complete
    os.system(f"mv {log_name} trax/{log_name}")
    print("Done.")

Run tRAX on the ARM-seq data.

In [None]:
# Load the tab-separated, header-less ARM-seq sample sheet
df_samples = pd.read_table('config/samples.arm.tsv', header=None)
# Label the three columns for display
df_samples.columns = ['sample', 'group', 'fastq']
df_samples

In [None]:
# Load the tab-separated, header-less list of group pairs for the ARM-seq data
df_pairs = pd.read_table('config/pairs.arm.tsv', header=None)
# Label the two columns for display
df_pairs.columns = ['group1', 'group2']
df_pairs

In [None]:
# Run with the small ncRNA annotation; this is the run that also builds the track hub
run_trax(
    experimentname='arm.smallncRNAs',
    samplefile='config/samples.arm.tsv',
    exppairs='config/pairs.arm.tsv',
    gtffile='supplemental/hg38_small_ncRNAs.gtf',
    makehub=True,
)
# Run on the same samples and pairs with an empty GTF
run_trax(
    experimentname='arm.tRNAsonly',
    samplefile='config/samples.arm.tsv',
    exppairs='config/pairs.arm.tsv',
    gtffile='supplemental/empty.gtf',
)

# 4. Cleanup

- Move the BAM files to `rnaseq/bam` and the tRAX bigWig files to `rnaseq/bigwig`
  - If you need to regenerate tRAX files you should move the bam files back to the root of this project directory as they can take a long time to generate
- Generate the MD5 checksums of the raw FASTQ files and the bigWig files for GEO

In [None]:
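# Optional: uncomment to copy the BAM files back to the project root before regenerating the tRAX output
# os.system("cp -r rnaseq/bam/* .")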

In [None]:
print("Moving files to rnaseq directory...")
# Collect the *.bam and *.bai files from the project root in rnaseq/bam
os.makedirs('rnaseq/bam', exist_ok=True)
for alignment_ext in ('bam', 'bai'):
    os.system(f"mv *.{alignment_ext} rnaseq/bam/.")
# Recreate rnaseq/bigwig from scratch, then copy in the bigWig files from the track hub
os.system("rm -rf rnaseq/bigwig")
os.makedirs('rnaseq/bigwig', exist_ok=True)
os.system("cp -r trax/arm.smallncRNAs/trackhub/*.bw rnaseq/bigwig/.")

In [None]:
print("Creating md5sums...")
# One checksum list per source directory: (directory under rnaseq/, suffix of the md5sums file)
for checksum_dir, checksum_label in (('fastq_raw', 'raw'), ('bigwig', 'bigwig')):
    os.system(f'md5sum rnaseq/{checksum_dir}/* > rnaseq/md5sums_{checksum_label}.txt')