## Genemark
```
cd ~/eckertlab/gypsy_indiv/raw_demult/analysis/maker 
> cat run_gmes.q

#$ -S /bin/sh
#$ -N gmes
#$ -V
#$ -cwd
#$ -pe smp 50
#$ -o gmes.out
#$ -e gmes.err
$HOME/g/perlbrew/perls/perl-5.16.3/bin/perl \
~/g/src/gm_et_linux_64/gmes_petap/gmes_petap.pl \
--v \
--ES \
--min_contig=10000 \
--cores 30 \
--sequence assembly.fasta
```

In [0]:
# Source genome assembly (FASTA); it is re-written below with sequential contig ids.
assembly = "/home/cfriedline/eckertlab/projects/gypsy_moth/assemblies/masurca3/CA/10-gapclose/genome.ctg.fasta"
# Working directory that receives the renamed assembly and the id map.
analysis_dir = "/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/maker"

In [0]:
# Path to the MAKER executable; interpolated into the shell call `!$maker -CTL` further down.
maker = "/gpfs_fs/home/cfriedline/src/maker/bin/maker"

In [0]:
from Bio import SeqIO
import sys
import os
import pandas as pd
import numpy as np
from subprocess import Popen, PIPE, STDOUT, check_call, check_output, run
from glob import glob
from ipyparallel import Client

In [0]:
# Re-write the assembly with sequential "contig_<i>" ids and record the
# index -> original id mapping so the original names can be recovered later.
map_path = os.path.join(analysis_dir, "assembly_map.txt")
renamed_path = os.path.join(analysis_dir, "assembly.fasta")
with open(map_path, "w") as m, open(renamed_path, "w") as a:
    for i, rec in enumerate(SeqIO.parse(assembly, "fasta")):
        # Log the original id before it is overwritten.
        m.write("{}\t{}\n".format(i, rec.id))
        rec.id = "contig_{}".format(i)
        # Blank name/description so only the new id ends up in the FASTA header.
        rec.name = ""
        rec.description = ""
        SeqIO.write(rec, a, "fasta")

In [0]:
cd $analysis_dir

In [0]:
def read_ctl(f):
    """Parse a MAKER control (``.ctl``) file into a two-column DataFrame.

    Each ``key=value`` line becomes one row: column 0 holds the key and
    column 1 the value. Everything from a ``#`` to the end of a line is
    discarded, so comment-only lines produce no row.

    Values are read as strings (``dtype=str``) instead of letting pandas infer
    numeric types. Without this, a file whose values are all numeric would be
    coerced to floats and a setting such as ``40`` would be written back as
    ``40.0``. Empty values (``key=``) are still returned as NaN.

    Parameters
    ----------
    f : str or file-like
        Path to (or open handle on) the control file.

    Returns
    -------
    pandas.DataFrame
        One row per setting, with integer column labels 0 (key) and 1 (value).
    """
    return pd.read_csv(f, sep="=", comment="#", header=None, dtype=str)

In [0]:
# Parse each MAKER control file found in the current directory, keyed by file name.
ctl_names = ("maker_opts.ctl", "maker_bopts.ctl", "maker_exe.ctl")
ctl_data = {name: read_ctl(name) for name in ctl_names}

In [0]:
# Create the per-contig output directory. exist_ok makes the cell safe to
# re-run and avoids the check-then-create race of a separate exists() test.
os.makedirs("split", exist_ok=True)

In [0]:
cd ..

In [0]:
# Write every contig of the renamed assembly to its own FASTA file under
# split/, leaving files from a previous run untouched.
for rec in SeqIO.parse("assembly.fasta", "fasta"):
    out = os.path.join("split", "{}.fasta".format(rec.name))
    if os.path.exists(out):
        continue
    with open(out, "w") as o:
        SeqIO.write(rec, o, "fasta")
        fasta_path = os.path.abspath(o.name)

In [0]:
# Write each parsed control table back out as bare key=value lines inside split/.
for ctl_name, ctl_table in ctl_data.items():
    ctl_table.to_csv(os.path.join("split", ctl_name), sep="=", header=False, index=False)

In [0]:
# Connect an ipyparallel client using the "sge" profile.
rc = Client(profile="sge")

In [0]:
# Two views onto the client's engines: a load-balanced view and a direct view over all of them.
lv = rc.load_balanced_view()
dv = rc[:]
# Display how many engines the direct view covers.
len(dv)

In [0]:
# Perform these imports inside the direct view's sync_imports() context so the
# same names are available on the engines (presumably mirrored there — confirm
# against the ipyparallel docs).
with dv.sync_imports():
    import os, socket, sys
    from subprocess import Popen, PIPE, STDOUT, check_call, check_output, run, call

In [0]:
# Point every engine at the split/ directory (trailing ';' suppresses the result display).
dv.apply_sync(os.chdir, os.path.join(analysis_dir, "split"));

In [0]:
# Move the local kernel into split/ using the absolute path, matching what the
# engines were given above. A relative chdir depends on the current directory
# and raises if the cell is run a second time.
os.chdir(os.path.join(analysis_dir, "split"))

In [0]:
# Per-contig FASTA files in the current directory, sorted as strings
# (so "contig_10.fasta" sorts before "contig_2.fasta").
fasta_files = sorted(glob("contig*.fasta"))

In [0]:
fasta_files[0]

In [0]:
# Sample shell command for one split contig.
# NOTE(review): `cmd` is reassigned in the following cell before it is ever used.
cmd = "bash run.sh \"{}\"".format(fasta_files[1])

In [0]:
from pprint import pprint
def run_cmd(cmd):
    """Run ``cmd`` through the shell and return its captured output.

    Parameters
    ----------
    cmd : str
        Command line handed to the shell (``shell=True``).

    Returns
    -------
    tuple
        ``(stdout, stderr)`` as returned by ``Popen.communicate()``.
    """
    proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
    # communicate() waits for the process to exit and returns the (stdout, stderr) pair.
    return proc.communicate()
# Assign run_cmd through the direct view (presumably this publishes it to the
# engines' namespaces — confirm against the ipyparallel docs).
dv['run_cmd'] = run_cmd
# Build and show a sample command for one split contig; index 345 is a
# hard-coded pick and requires at least 346 FASTA files.
cmd = "bash run.sh \"{}\"".format(fasta_files[345])
pprint(cmd)
#pprint(run_cmd(cmd))

In [0]:
!$maker -CTL

# MAKER

## Install

* installed in `/gpfs_fs/home/cfriedline/src/maker/bin`
* MPI enabled
* edited `locations` for `exonerate` and `augustus`  
* installed maker manually, setting `LIBRARY_PATH=/usr/lib/x86_64-redhat-linux5E/lib64/`

**Note**: configuring `MPI` and `PSM` was a nightmare. Installed `PSM` from `https://github.com/01org/psm` and `OpenMPI` 1.10.2.

---

## Config files

### maker_opts.ctl
```
#-----Genome (these are always required)
#genome=/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/maker/genome.ctg.fasta #(fasta file or fasta embeded in GFF3 file)
genome=/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/maker/assembly.fasta
organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic

#-----Re-annotation Using MAKER Derived GFF3
maker_gff= #MAKER derived GFF3 file
est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
other_pass=0 #passthrough anyything else in maker_gff: 1 = yes, 0 = no

#-----EST Evidence (for best results provide a file for at least one)
est= #set of ESTs or assembled mRNA-seq in fasta format
altest=/home/cfriedline/eckertlab/genomes/Hmel2/annotation/Hmel2_cds.fa #EST/cDNA sequence file in fasta format from an alternate organism
est_gff= #aligned ESTs or mRNA-seq from an external GFF3 file
altest_gff= #aligned ESTs from a closly relate species in GFF3 format

#-----Protein Homology Evidence (for best results provide a file for at least one)
protein=/home/cfriedline/eckertlab/genomes/Hmel2/annotation/Hmel2_proteins.fa  #protein sequence file in fasta format (i.e. from mutiple oransisms)
protein_gff=  #aligned protein homology evidence from an external GFF3 file

#-----Repeat Masking (leave values blank to skip repeat masking)
model_org=all #select a model organism for RepBase masking in RepeatMasker
rmlib= #provide an organism specific repeat library in fasta format for RepeatMasker
repeat_protein=/gpfs_fs/home/cfriedline/src/maker/data/te_proteins.fasta #provide a fasta file of transposable element proteins for RepeatRunner
rm_gff= #pre-identified repeat elements from an external GFF3 file
prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)

#-----Gene Prediction
snaphmm= #SNAP HMM file
gmhmm= #GeneMark HMM file
augustus_species=heliconius_melpomene1 #Augustus gene prediction species model
fgenesh_par_file= #FGENESH parameter file
pred_gff= #ab-initio predictions from an external GFF3 file
model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
est2genome=1 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
protein2genome=0 #infer predictions from protein homology, 1 = yes, 0 = no
trna=1 #find tRNAs with tRNAscan, 1 = yes, 0 = no
snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no

#-----Other Annotation Feature Types (features MAKER doesn't recognize)
other_gff= #extra features to pass-through to final MAKER generated GFF3 file

#-----External Application Behavior Options
alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
cpus=1 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)

#-----MAKER Behavior Options
max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
min_contig=1 #skip genome contigs below this length (under 10kb are often useless)

pred_flank=200 #flank for extending evidence clusters sent to gene predictors
pred_stats=0 #report AED and QI statistics for all predictions as well as models
AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
min_protein=0 #require at least this many amino acids in predicted proteins
alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
keep_preds=0 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)

split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
single_length=250 #min length required for single exon ESTs if 'single_exon is enabled'
correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes

tries=5 #number of times to try a contig if there is a failure for some reason
clean_try=1 #remove all data from previous run before retrying, 1 = yes, 0 = no
clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
TMP=/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/maker/tmp #specify a directory other than the system default temporary directory for temporary files
```

### maker_exe.ctl

```
#-----Location of Executables Used by MAKER/EVALUATOR
makeblastdb=/gpfs_fs/home/cfriedline/src/maker/bin/../exe/blast/bin/makeblastdb #location of NCBI+ makeblastdb executable
blastn=/gpfs_fs/home/cfriedline/src/maker/bin/../exe/blast/bin/blastn #location of NCBI+ blastn executable
blastx=/gpfs_fs/home/cfriedline/src/maker/bin/../exe/blast/bin/blastx #location of NCBI+ blastx executable
tblastx=/gpfs_fs/home/cfriedline/src/maker/bin/../exe/blast/bin/tblastx #location of NCBI+ tblastx executable
formatdb= #location of NCBI formatdb executable
blastall= #location of NCBI blastall executable
xdformat= #location of WUBLAST xdformat executable
blasta= #location of WUBLAST blasta executable
RepeatMasker=/gpfs_fs/home/cfriedline/src/maker/bin/../exe/RepeatMasker/RepeatMasker #location of RepeatMasker executable
exonerate=/gpfs_fs/home/cfriedline/src/maker/bin/../exe/exonerate/bin/exonerate #location of exonerate executable

#-----Ab-initio Gene Prediction Algorithms
snap=/gpfs_fs/home/cfriedline/src/maker/bin/../exe/snap/snap #location of snap executable
gmhmme3=/gpfs_fs/home/cfriedline/src/gm_et_linux_64/gmes_petap/gmhmme3 #location of eukaryotic genemark executable
gmhmmp= #location of prokaryotic genemark executable
augustus=/home/cfriedline/g/src/augustus-3.2.2/bin/augustus #location of augustus executable
fgenesh= #location of fgenesh executable
tRNAscan-SE=/home/cfriedline/g/src/tRNAscan-SE-1.3.1/tRNAscan-SE #location of trnascan executable
snoscan= #location of snoscan executable

#-----Other Algorithms
probuild=/gpfs_fs/home/cfriedline/g/src/gm_et_linux_64/gmes_petap/probuild #location of probuild executable (required for genemark)
```

### maker_bopts.ctl

```
#-----BLAST and Exonerate Statistics Thresholds
blast_type=ncbi+ #set to 'ncbi+', 'ncbi' or 'wublast'

pcov_blastn=0.8 #Blastn Percent Coverage Threhold EST-Genome Alignments
pid_blastn=0.85 #Blastn Percent Identity Threshold EST-Genome Aligments
eval_blastn=1e-10 #Blastn eval cutoff
bit_blastn=40 #Blastn bit cutoff
depth_blastn=0 #Blastn depth cutoff (0 to disable cutoff)

pcov_blastx=0.5 #Blastx Percent Coverage Threhold Protein-Genome Alignments
pid_blastx=0.4 #Blastx Percent Identity Threshold Protein-Genome Aligments
eval_blastx=1e-06 #Blastx eval cutoff
bit_blastx=30 #Blastx bit cutoff
depth_blastx=0 #Blastx depth cutoff (0 to disable cutoff)

pcov_tblastx=0.8 #tBlastx Percent Coverage Threhold alt-EST-Genome Alignments
pid_tblastx=0.85 #tBlastx Percent Identity Threshold alt-EST-Genome Aligments
eval_tblastx=1e-10 #tBlastx eval cutoff
bit_tblastx=40 #tBlastx bit cutoff
depth_tblastx=0 #tBlastx depth cutoff (0 to disable cutoff)

pcov_rm_blastx=0.5 #Blastx Percent Coverage Threhold For Transposable Element Masking
pid_rm_blastx=0.4 #Blastx Percent Identity Threshold For Transposbale Element Masking
eval_rm_blastx=1e-06 #Blastx eval cutoff for transposable element masking
bit_rm_blastx=30 #Blastx bit cutoff for transposable element masking

ep_score_limit=20 #Exonerate protein percent of maximal score threshold
en_score_limit=20 #Exonerate nucleotide percent of maximal score threshold

```

---

## Execute

### run.q
```
#$ -S /bin/sh
#$ -N maker
#$ -V
#$ -cwd
#$ -pe jgl_ompi 185
#$ -o maker.out
#$ -e maker.err
export LD_PRELOAD=$HOME/g/opt/mpi/1.10.2/lib/libmpi.so
export LD_LIBRARY_PATH=$HOME/g/opt/boost159/lib:$HOME/g/opt/mpi/1.10.2/lib:/home/cfriedline/g/opt/mpi/1.10.2/lib/openmpi:$HOME/g/opt/psm/usr/lib64:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/home/cfriedline/g/gcc54/lib64:$LD_LIBRARY_PATH
export ZOE=$HOME/g/src/maker/exe/snap/Zoe
export AUGUSTUS_CONFIG_PATH=/home/cfriedline/g/src/augustus-3.2.2/config
export OMPI_MCA_mpi_warn_on_fork=0
export PERL5LIB=/gpfs_fs/home/cfriedline/src/tRNAscan-SE-1.3.1:$PERL5LIB
export PATH=/gpfs_fs/home/cfriedline/src/tRNAscan-SE-1.3.1:$PATH

$HOME/g/opt/mpi/1.10.2/bin/mpiexec \
-x LD_PRELOAD \
-x ZOE \
-x AUGUSTUS_CONFIG_PATH \
-x LD_LIBRARY_PATH \
-x PATH \
-n $NSLOTS \
--mca orte_base_help_aggregate 0 \
--mca btl tcp,self \
$HOME/g/src/maker/bin/maker
```

Execute: `qsub run.q`

---

## Iterations

1. only ab initio predictions and tRNA; no altest or protein, est2genome=0
1. add altest and protein, change est2genome=1
1. run genemark: `perl ~/g/src/gm_et_linux_64/gmes_petap/gmes_petap.pl --v --ES --min_contig=10000 --sequence assembly.fasta`
1. change est2genome to `0`, add gmhmm.mod to `maker_opts.ctl`

---

## collect gff files

```
    cd ~/eckertlab/gypsy_indiv/raw_demult/analysis/maker && find assembly.maker.output -name "*.gff" | grep -v Void > gff_files
    cd assembly.maker.output
    ~/g/src/maker/bin/gff_merge -d assembly_master_datastore_index.log
    ~/g/src/maker/bin/fasta_merge -d assembly_master_datastore_index.log
```

In [0]:
pwd

In [0]:
# Read the list of per-contig GFF paths. The path is anchored on analysis_dir
# so the cell does not depend on the kernel's current directory, and the
# `with` block closes the file handle that a bare open() would leak.
with open(os.path.join(analysis_dir, "gff_files")) as gff_list:
    gff_files = gff_list.readlines()

In [0]:
# Resolve each listed path against analysis_dir instead of the current working
# directory (os.path.abspath uses the cwd, which may have changed by now).
# Already-absolute entries pass through unchanged, so re-running is harmless;
# blank lines are skipped.
gff_files = [
    os.path.normpath(os.path.join(analysis_dir, x.strip()))
    for x in gff_files
    if x.strip()
]

In [0]:
# Write one maker2jbrowse command line per GFF file into the jobs file.
m2jb_template = "bin/maker2jbrowse {} -o /home/cfriedline/g/opt/apollo/gypsy_moth --no_names_index\n"
with open("/home/cfriedline/g/src/Apollo-2.0.4/m2jb.jobs", "w") as o:
    o.writelines(m2jb_template.format(g) for g in gff_files)

### run on godel:
```
qrsh -pe smp 60 -N m2jb
cd /home/cfriedline/g/src/Apollo-2.0.4/
parallel --bar -j 60 -a m2jb.jobs

bin/generate-names.pl /home/cfriedline/g/opt/apollo/gypsy_moth

# Supply the Apollo password through the APOLLO_PASSWORD environment variable; do not store it in the notebook.
groovy /home/cfriedline/g/src/Apollo-2.0.4/docs/web_services/examples/groovy/add_organism.groovy -name gypsy_moth -url http://localhost:8080/apollo -directory /home/cfriedline/g/opt/apollo/gypsy_moth -username cfriedline@vcu.edu -password "$APOLLO_PASSWORD" -public
```



In [0]:
!mkdir /home/cfriedline/g/opt/apollo/gypsy_moth_snps

In [0]:
def read_df(dirname, fname):
    """Load ``<dirname>/<fname>.txt`` as a tab-separated table.

    Parameters
    ----------
    dirname : str
        Directory containing the file.
    fname : str
        File name without the ``.txt`` extension.

    Returns
    -------
    pandas.DataFrame
        The table, with its first column used as the index.
    """
    return pd.read_csv(os.path.join(dirname, "%s.txt" % fname), sep="\t", index_col=0)

In [0]:
# Load the tab-separated "z12_swapped.txt" table (first column becomes the index).
z12_swapped = read_df("/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/ni", "z12_swapped")

In [0]:
# Unique prefixes of the column names: the text before the first "_" in each.
contigs_with_snps = {col.split("_")[0] for col in z12_swapped.columns}

In [0]:
# Load the index -> original sequence id table that was written when the assembly was renamed.
assembly_map = pd.read_csv(os.path.join(analysis_dir, "assembly_map.txt"), sep="\t", header=None, names=['idx', 'name'])

In [0]:
# Rebuild the "contig_<idx>" labels used in the renamed assembly and index the table by them.
assembly_map['contig'] = ["contig_{}".format(idx) for idx in assembly_map['idx']]
assembly_map.index = assembly_map['contig']

In [0]:
# Lookup from the table's index ("contig_<idx>") to the original sequence name.
assembly_dict = assembly_map['name'].to_dict()

In [0]:
len(contigs_with_snps)

In [0]:
# Write maker2jbrowse commands only for GFF files whose contig maps to a
# sequence listed in contigs_with_snps, printing a progress count every 1000 hits.
snp_job_template = "bin/maker2jbrowse {} -o /home/cfriedline/g/opt/apollo/gypsy_moth_snp_contigs --no_names_index\n"
with open("/home/cfriedline/g/src/Apollo-2.0.4/m2jb_snps.jobs", "w") as o:
    found = 0
    for g in gff_files:
        # The contig label is the name of the directory that holds the GFF file.
        contig = os.path.basename(os.path.dirname(g))
        seq = assembly_dict[contig]
        if seq not in contigs_with_snps:
            continue
        found += 1
        if found % 1000 == 0:
            print(found)
        o.write(snp_job_template.format(g))

In [0]:
!wc -l /home/cfriedline/g/src/Apollo-2.0.4/m2jb_snps.jobs

### run on godel

```
qrsh -pe smp 60 -N m2jb
cd /home/cfriedline/g/src/Apollo-2.0.4/
parallel --bar -j 60 -a m2jb_snps.jobs

bin/generate-names.pl /home/cfriedline/g/opt/apollo/gypsy_moth_snp_contigs

# Supply the Apollo password through the APOLLO_PASSWORD environment variable; do not store it in the notebook.
groovy /home/cfriedline/g/src/Apollo-2.0.4/docs/web_services/examples/groovy/add_organism.groovy -name gypsy_moth_snp_contigs -url http://localhost:8080/apollo -directory /home/cfriedline/g/opt/apollo/gypsy_moth_snp_contigs -username cfriedline@vcu.edu -password "$APOLLO_PASSWORD" -public
```

In [0]:
# For contigs whose original sequence name is in contigs_with_snps, remember
# the GFF path (keyed by original name) and the contig label.
snp_gff = {}
keep_contigs = set()
for g in gff_files:
    contig = os.path.basename(os.path.dirname(g))
    seq = assembly_dict[contig]
    if seq not in contigs_with_snps:
        continue
    snp_gff[seq] = g
    keep_contigs.add(contig)

In [0]:
import gffutils

In [0]:
# Build the gffutils database once and reopen it on later runs: create_db
# fails when the database file already exists (unless forced), which made this
# cell break on re-execution. Paths are anchored on analysis_dir so the cell
# does not depend on the kernel's current directory.
# NOTE(review): delete gff.db by hand if a previous build was interrupted.
gff_path = os.path.join(analysis_dir, "assembly.maker.output", "assembly.all.gff")
gff_db_path = os.path.join(analysis_dir, "gff.db")
if os.path.exists(gff_db_path):
    db = gffutils.FeatureDB(gff_db_path)
else:
    db = gffutils.create_db(gff_path, gff_db_path)

In [0]:
# Group "match" features whose source mentions augustus by contig, keeping
# only the contigs collected in keep_contigs.
found = {}
for feat in db.features_of_type("match"):
    if 'augustus' in feat.source and feat.chrom in keep_contigs:
        found.setdefault(feat.chrom, []).append(feat)

In [0]:
# Count the contigs that carry more than one collected feature and print each such feature.
multi_hit_lists = [hits for hits in found.values() if len(hits) > 1]
multiple_genes = len(multi_hit_lists)
for hits in multi_hit_lists:
    for elem in hits:
        print(elem.astuple())

In [0]:
multiple_genes