In [1]:
# Run the converter with no arguments so it prints its synopsis and options
! gtf2gff3.pl



Synopsis:

gtf2gff3 --cfg gtf2gff3_MY_CONFIG.cfg gtf_file > gff3_file

gtf2gff3 --help # for a more detailed help message.

Description:

This script will convert GTF formatted files to valid GFF3 formatted
files.  It will map the column 3 ("type" column) to valid SO, but
because any non standard term may appear in that column in GTF files,
you may edit the config file to provide your own GTF feature to SO
mapping.  The script will also build gene models from exons, CDSs and
other features given in the GTF file.  It is currently tested on Ensemble
and Twinscan GTF, and it should work on any other files that follow the
same specification.  It does not work on GTF from the UCSC table browser
because those files use the same ID for gene and transcript, so it is
impossible to group multiple transcripts to a gene.  See the README that
came with the script for more info.

Options:

  --cfg   Provide the filename for a config file.  See the configuration
	  file pro

In [None]:
# NOTE(review): this cell has no execution count, so it appears never to have been run.
# The input is already a .gff3 file, whereas the gtf2gff3 synopsis expects a GTF file — confirm intent.
! gtf2gff3.pl /projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.exon.gff3

The GFF3 from Gencode does not follow the GFF3 spec, so every occurrence of `gene_name` needs to be changed to `NAME`.

In [5]:
# Gencode v19 exon annotation as distributed, and the destination for the
# copy whose `gene_name` attribute has been renamed
gff3_original = '/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.exon.gff3'
gff3_proper = gff3_original + '_proper'

In [6]:
! sed s/gene_name/NAME/g < $gff3_original > $gff3_proper

In [7]:
import os

def write_majiq_config(filename, read_length, bam_path, genome, genome_fasta_path,
                       experiments, experiment_type='strand-specific'):
    """Write a MAJIQ configuration file to ``filename``.

    The file contains an ``[info]`` section (read length, bam directory,
    genome name, genome fasta directory, experiment type) followed by an
    ``[experiments]`` section with one ``group=file1,file2,...`` line per group.

    Parameters
    ----------
    filename : str
        Path of the config file to write; it is opened in ``'w'`` mode, so
        an existing file is overwritten
    read_length : int
        Length of the reads, written as ``readlen``
    bam_path : str
        Location of the bam files, written as ``samdir`` after ``~``
        expansion and conversion to an absolute path
    genome : str
        Genome build, e.g. 'hg19' or 'mm10'
    genome_fasta_path : str
        Location of the genome fasta files, written as ``genome_path`` after
        ``~`` expansion and conversion to an absolute path
    experiments : dict
        A mapping of {group_name: [bam1, bam2, bam3, ...]} of the group
        names and the bam filenames in `bam_path`
    experiment_type : 'strand-specific' | None, optional
        Written verbatim after ``type=``; passing None therefore writes the
        literal text ``None``

    """
    def _normalize(path):
        # Expand a leading "~" and resolve the result to an absolute path
        return os.path.abspath(os.path.expanduser(path))

    fasta_dir = _normalize(genome_fasta_path)
    bam_dir = _normalize(bam_path)

    # One "group=file1,file2,..." line per experimental group
    group_lines = []
    for group, files in experiments.items():
        group_lines.append('{0}={1}'.format(group, ','.join(files)))
    experiments_block = '\n'.join(group_lines)

    # The leading blank line is part of the emitted file
    template = ('\n'
                '[info]\n'
                'readlen={0}\n'
                'samdir={1}\n'
                'genome={2}\n'
                'genome_path={3}\n'
                'type={4}\n'
                '\n'
                '[experiments]\n'
                '{5}\n')
    contents = template.format(read_length, bam_dir, genome, fasta_dir,
                               experiment_type, experiments_block)

    with open(filename, 'w') as handle:
        handle.write(contents)

In [8]:
ls /home/obotvinnik/projects/singlecell_pnms/analysis/all_bams/

[0m[01;34m$PWD[0m/
1-51.py
[01;36mCVN_01_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg.bam[0m@
CVN_01_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg.bam.bai
CVN_01_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg.bam.miso.sh
[01;36mCVN_02_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg.bam[0m@
CVN_02_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg.bam.bai
CVN_02_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg.bam.miso.sh
[01;36mCVN_03_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg.bam[0m@
CVN_03_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg.bam.bai
CVN_03_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg.bam.miso.sh
[01;36mCVN_04_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg.bam[0m@
CVN_04_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg.bam.bai
CVN_04_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg.bam.miso.sh
[01;36mCVN_05_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg.bam[0m@
CVN_05_R1.fastq.gz.polyATrim.adapterTrim.rmRep.so

In [9]:
import pandas as pd

# Table with one row per sample; the preview below shows `bam` and `sample_id` columns
bams_csv = '/home/obotvinnik/projects/singlecell_pnms/analysis/all_bams/all_valid_bams_with_stressed.csv'
bams = pd.read_csv(bams_csv)
bams.head()

Unnamed: 0,bam,sample_id
0,CVN_01_R1.fastq.gz.polyATrim.adapterTrim.rmRep...,CVN_01
1,CVN_02_R1.fastq.gz.polyATrim.adapterTrim.rmRep...,CVN_02
2,CVN_03_R1.fastq.gz.polyATrim.adapterTrim.rmRep...,CVN_03
3,CVN_04_R1.fastq.gz.polyATrim.adapterTrim.rmRep...,CVN_04
4,CVN_05_R1.fastq.gz.polyATrim.adapterTrim.rmRep...,CVN_05


In [10]:
# Reduce the table to a Series of bam filenames keyed by sample_id.
# NOTE: this rebinds `bams` from a DataFrame to a Series, so the cell cannot be
# re-run on its own without first re-running the cell that loads the CSV.
bams = bams.set_index('sample_id')['bam']
bams.head()

sample_id
CVN_01    CVN_01_R1.fastq.gz.polyATrim.adapterTrim.rmRep...
CVN_02    CVN_02_R1.fastq.gz.polyATrim.adapterTrim.rmRep...
CVN_03    CVN_03_R1.fastq.gz.polyATrim.adapterTrim.rmRep...
CVN_04    CVN_04_R1.fastq.gz.polyATrim.adapterTrim.rmRep...
CVN_05    CVN_05_R1.fastq.gz.polyATrim.adapterTrim.rmRep...
Name: bam, dtype: object

In [11]:
# Per-sample metadata, indexed by the first CSV column (the sample ids)
metadata_csv = '/home/obotvinnik/flotilla_projects/singlecell_pnms_post_kmeans_outliers/metadata.csv.gz'
metadata = pd.read_csv(metadata_csv, index_col=0)
metadata.head()

Unnamed: 0,Notes,craig_venter,differentiation_batch,neuron,outlier,phenotype,pooled,progenitor,single,split,stressed
CVN_01,,True,,False,False,NPC,False,True,True,,False
CVN_02,,True,,False,False,NPC,False,True,True,,False
CVN_03,,True,,False,False,NPC,False,True,True,,False
CVN_04,,True,,False,False,NPC,False,True,True,,False
CVN_05,,True,,False,False,NPC,False,True,True,,False


In [12]:
def _strip_bam_suffix(bam_filename):
    """Return the part of the filename that precedes the first '.bam'."""
    return bam_filename.split('.bam')[0]

# Group the bam filenames by each sample's phenotype and drop the '.bam' suffix
experiments = {
    phenotype: filenames.map(_strip_bam_suffix)
    for phenotype, filenames in bams.groupby(metadata.phenotype)
}
experiments

{'MN': sample_id
 M1_01      M1_01_R1.fastq.gz.polyATrim.adapterTrim.rmRep....
 M1_02      M1_02_R1.fastq.gz.polyATrim.adapterTrim.rmRep....
 M1_03      M1_03_R1.fastq.gz.polyATrim.adapterTrim.rmRep....
 M1_04      M1_04_R1.fastq.gz.polyATrim.adapterTrim.rmRep....
 M1_05      M1_05_R1.fastq.gz.polyATrim.adapterTrim.rmRep....
 M1_06      M1_06_R1.fastq.gz.polyATrim.adapterTrim.rmRep....
 M1_07      M1_07_R1.fastq.gz.polyATrim.adapterTrim.rmRep....
 M1_08      M1_08_R1.fastq.gz.polyATrim.adapterTrim.rmRep....
 M1_09      M1_09_R1.fastq.gz.polyATrim.adapterTrim.rmRep....
 M1_10      M1_10_R1.fastq.gz.polyATrim.adapterTrim.rmRep....
 M1_11      M1_11_R1.fastq.gz.polyATrim.adapterTrim.rmRep....
 M1_12      M1_12_R1.fastq.gz.polyATrim.adapterTrim.rmRep....
 M2_01      M2_01_R1.fastq.gz.polyATrim.adapterTrim.rmRep....
 M2_02      M2_02_R1.fastq.gz.polyATrim.adapterTrim.rmRep....
 M2_03      M2_03_R1.fastq.gz.polyATrim.adapterTrim.rmRep....
 M2_04      M2_04_R1.fastq.gz.polyATrim.adapterTrim.r

In [13]:
# Inputs for the MAJIQ configuration file
configuration = '/home/obotvinnik/projects/singlecell_pnms/analysis/all_bams/majiq_config.txt'
read_length = 92
bam_path = '/home/obotvinnik/projects/singlecell_pnms/analysis/all_bams/'
genome = 'hg19'
genome_fasta_path = '/projects/ps-yeolab/genomes/hg19/chromosomes/'
# None overrides the function's 'strand-specific' default
experiment_type = None

write_majiq_config(
    filename=configuration,
    read_length=read_length,
    bam_path=bam_path,
    genome=genome,
    genome_fasta_path=genome_fasta_path,
    experiments=experiments,
    experiment_type=experiment_type,
)

In [14]:
cat $configuration


[info]
readlen=92
samdir=/home/obotvinnik/projects/singlecell_pnms/analysis/all_bams
genome=hg19
genome_path=/projects/ps-yeolab/genomes/hg19/chromosomes
type=None

[experiments]
iPSC=M2nd_33_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg,M2nd_34_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg,P1_01_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg,P1_02_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg,P1_03_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg,P1_04_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg,P1_05_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg,P1_06_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg,P1_07_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg,P1_08_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg,P1_09_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg,P1_10_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg,P1_11_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg,P1_12_R1.fastq.gz.polyATrim.adapterTrim.rmRep.sorted.rg,P2_01_R1.fastq.gz.

In [15]:
# Run majiq with no arguments to see its usage line and available subcommands
! majiq

usage: majiq [-h] [-v] {build,psi,deltapsi} ...
majiq: error: too few arguments


In [16]:
# Run the MAJIQ build step on the renamed GFF3 with the generated config, using 2 threads.
# NOTE(review): the captured run was interrupted (KeyboardInterrupt) after the log reported
# exons that "don't have valid mRNA" parents; presumably the exon-only GFF3 lacks
# transcript records — confirm the annotation before re-running.
! majiq build $gff3_proper -conf $configuration --nthreads 2 --output /home/obotvinnik/projects/singlecell_pnms/analysis/all_bams/

2015-09-08 18:11:00,949 (PID:51508) - INFO - 
2015-09-08 18:11:00,949 (PID:51508) - INFO - Command: Namespace(conf='/home/obotvinnik/projects/singlecell_pnms/analysis/all_bams/majiq_config.txt', debug=0, func=<function builder at 0x2b7512294578>, gff_output='lsvs.gff', logger=None, min_intronic_cov=0.5, minpos=2, minreads=3, non_denovo=False, nthreads=2, only_rna=False, onlygather=False, output='/home/obotvinnik/projects/singlecell_pnms/analysis/all_bams/', pcr_filename=None, permissive=False, plotpath=None, prefix='', silent=False, tmp='/tmp/', transcripts='/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.exon.gff3_proper')
2015-09-08 18:11:00,949 (PID:51508) - INFO - ... waiting gff3 parsing
START child, Process-1
2015-09-08 18:11:00,960 (PID:51509) - INFO - Error, incorrect gff. exon exon:ENST00000456328.2:1 doesn't have valid mRNA ENST00000456328.2
2015-09-08 18:11:00,961 (PID:51509) - INFO - Error, incorrect gff. exon exon:ENST00000456328.2:2 doesn't have valid 

KeyboardInterrupt: 