# Gnocis - Figures for manuscript
Bjørn André Bredesen, 2020

## Data sources
 * ModENCODE: http://www.modencode.org/
    - Citation: https://www.nature.com/articles/459927a
 * Ensembl: ftp://ftp.ensembl.org/pub/release-100/gtf/drosophila_melanogaster/
    - Citation: https://doi.org/10.1093/nar/gkx1098
 * FlyBase: https://flybase.org/
    - Citation: https://academic.oup.com/nar/article/41/D1/D751/1051942
 * Kahn *et al.* 2014: https://doi.org/10.1371/journal.pgen.1004495.s010
    - Citation: https://doi.org/10.1371/journal.pgen.1004495

### Data preparation and preliminary analysis

In [1]:
import gnocis as nc # Load Gnocis
# Fix Gnocis' random seed (presumably so that sampled/dummy data is repeatable between runs)
nc.setSeed(12345) # Set random seed
# NOTE(review): the core count is hard-coded to 12 — adjust to the machine running the notebook
nc.setNCores(12) # Enable multi-core processing

In [2]:
# Genome object for D. melanogaster R5: sequences come from the gzipped FASTA
# file, gene annotation from the gzipped Ensembl GTF file.
Dmel = (
    nc.genome('D. melanogaster R5')
    .streamFASTAGZ('DmelR5.fasta.gz')
    .loadEnsemblAnnotationGTFGZ('./ensembl_Drosophila_melanogaster.BDGP5.77.gtf.gz')
)

In [None]:
# Region set around the invected (inv) gene; PREs overlapping it are held out
# of the PRE set so the locus can be used for validation.
# NOTE(review): confirm whether recenter(100000) gives a 100 kb window in total
# or 100 kb to each side of the gene centre.
invectedProximal = (
    Dmel.gene('inv')
    .region
    .singleton()
    .recenter(100000)
)

In [None]:
# PRE regions from Kahn et al. 2014
Kahn2014Rgn = nc.loadGFF('Kahn2014.GFF')
# Drop regions overlapping the invected-proximal set, recenter the rest with
# size 3000, and pull the corresponding sequences out of the genome
Kahn2014Seq = (
    Kahn2014Rgn
    .nonOverlap(invectedProximal)
    .recenter(3000)
    .extract(Dmel)
)

In [None]:
# Dummy PREs: sequences sampled from a degree-4 Markov chain trained on the
# real PRE sequences. 100 dummies are generated per real PRE, each with the
# length of the first PRE sequence.
MCPRE = nc.MarkovChain(trainingSequences = Kahn2014Seq, degree = 4)
DummyPREs = MCPRE.generateSet(
    n = 100 * len(Kahn2014Seq),
    length = len(Kahn2014Seq[0]),
)

In [None]:
# Dummy genomic sequences: sampled from a degree-4 Markov chain trained on the
# genome. Same count and length as the dummy PREs (100 per real PRE, length of
# the first PRE sequence).
MCGenome = nc.MarkovChain(trainingSequences = Dmel, degree = 4)
DummyGenomic = MCGenome.generateSet(
    n = 100 * len(Kahn2014Seq),
    length = len(Kahn2014Seq[0]),
)

In [None]:
# Coding sequence negatives: take the flattened CDS annotation, extract the
# sequences, join them into one long 'CDS' sequence, and cut that into
# windows of size 3000 with step 3000 (i.e. non-overlapping fragments).
CDSSeq = (
    Dmel.getCDS()
    .flatten()
    .extract(Dmel)
)
CDSFragments = nc.sequence(
    'CDS',
    ''.join(cds.seq for cds in CDSSeq),
).windows(3000, 3000)

In [None]:
# PcG biomarker set. The Pc, Psc and dRING region sets are passed through
# deltaResize(1000) after loading; the H3K27me3 set is used as loaded.
PcG = nc.biomarkers('PcG', [
    nc.loadGFFGZ(path).deltaResize(1000)
    for path in ('Pc.gff3.gz', 'Psc.gff3.gz', 'dRING.gff3.gz')
] + [
    nc.loadGFFGZ('H3K27me3.gff3.gz'),
])

In [None]:
# Sliding windows over the genome (size 1000, step 100), used below as the
# candidate regions for biomarker-enrichment calls
gwWin = Dmel.windowRegions(size = 1000, step = 100)

In [None]:
# Extract highly biomarker-enriched (HBME) regions
# NOTE(review): threshold = 3 presumably means "at least 3 of the 4 PcG biomarkers" — confirm against the Gnocis docs
HBMERgn = PcG.HBMEs(gwWin, threshold = 3)

In [None]:
# Lowly biomarker-enriched (LBME) regions, keeping only those whose length is
# at least 3000
LBMERgn = (
    PcG.LBMEs(gwWin, threshold = 1)
    .filter('', lambda rgn: len(rgn) >= 3000)
)
# Sequences of the LBME regions, cut into windows of size 3000 with step 3000
LBMESeq = (
    LBMERgn
    .extract(Dmel)
    .windows(3000, 3000)
)

In [None]:
# Print summary statistics for the LBME sequence fragments
LBMESeq.printStatistics()

In [None]:
# Display the overlap between the LBME regions and the Kahn et al. PREs
# (presumably a sanity check of how many known PREs fall in LBME regions)
LBMERgn.overlap(Kahn2014Rgn)

In [None]:
# Display the Kahn et al. 2014 PRE region set as cell output
Kahn2014Rgn

In [None]:
# Motif set: Ringrose et al. 2003 motifs plus GTGT.
# Each entry is (name, IUPAC pattern, third IUPACMotif argument); the third
# argument is presumably the number of allowed mismatches — confirm.
motifs = nc.motifs('Ringrose et al. 2003 + GTGT', [
    nc.IUPACMotif(name, pattern, mismatches)
    for name, pattern, mismatches in (
        ('En', 'GSNMACGCCCC', 1),
        ('G10', 'GAGAGAGAGA', 1),
        ('GAF', 'GAGAG', 0),
        ('PF', 'GCCATHWY', 0),
        ('PM', 'CNGCCATNDNND', 0),
        ('PS', 'GCCAT', 0),
        ('Z', 'YGAGYG', 0),
        ('GTGT', 'GTGT', 0),
    )
])

In [None]:
# Motif-pair features with a distance cutoff of 220: compare PREs against
# dummy PREs and list the differential summary sorted by descending KLD(A||B)
(
    motifs.pairFreq(220)
    .diffsummary(Kahn2014Seq, DummyPREs)
    .sort('KLD(A||B)', ascending = False)
)

In [None]:
# Motif-pair features with a distance cutoff of 220: summary statistics over
# the PRE sequences only, sorted by descending 'Mean'
(
    motifs.pairFreq(220)
    .summary(Kahn2014Seq)
    .sort('Mean', ascending = False)
)

In [None]:
# Same summary as for the full motif-pair feature set, but restricted with
# filter([ 0, 1, 2, 3 ]) (presumably the features at indices 0-3 — confirm),
# sorted by descending 'Mean'
(
    motifs.pairFreq(220)
    .filter([ 0, 1, 2, 3 ])
    .summary(Kahn2014Seq)
    .sort('Mean', ascending = False)
)

### Cross-validation

In [None]:
# Three distinct negative classes are used for training; each gets its own
# label (name, value) so that multiclass models can tell them apart.
labelDummyGenomic = nc.sequenceLabel('Dummy genomic', -1)
labelDummyPRE = nc.sequenceLabel('Dummy PRE', -2)
labelCDS = nc.sequenceLabel('CDS', -3)
# Training set: positives (real PREs) followed by the three labelled negative sets
trainingSet = (
    Kahn2014Seq.label(nc.positive)
    + DummyGenomic.label(labelDummyGenomic)
    + DummyPREs.label(labelDummyPRE)
    + CDSFragments.label(labelCDS)
)

In [None]:
# PyPREdictor: log-odds model over motif-pair features.
# The motif set is (re)defined here with the same content as in the
# preliminary analysis. Each entry is (name, IUPAC pattern, third IUPACMotif
# argument — presumably the number of allowed mismatches, confirm).
motifs = nc.motifs('Ringrose et al. 2003 + GTGT', [
    nc.IUPACMotif(name, pattern, mismatches)
    for name, pattern, mismatches in (
        ('En', 'GSNMACGCCCC', 1),
        ('G10', 'GAGAGAGAGA', 1),
        ('GAF', 'GAGAG', 0),
        ('PF', 'GCCATHWY', 0),
        ('PM', 'CNGCCATNDNND', 0),
        ('PS', 'GCCAT', 0),
        ('Z', 'YGAGYG', 0),
        ('GTGT', 'GTGT', 0),
    )
])
# Pair features (distance cutoff 219) -> log-odds of positives versus dummy
# PREs -> sequence model scoring windows of size 500 with step 250
PyPREdictor = (
    motifs
    .pairFreq(distCut = 219)
    .model(nc.logOdds(labelPositive = nc.positive, labelNegative = labelDummyPRE))
    .sequenceModel(
        name = 'PyPREdictor (M2003+GTGT)',
        windowSize = 500,
        windowStep = 250,
    )
)

# Quadratic 5-spectrum kernel SVM
import gnocis.sklearnCUDAModels as sklcunc
# 5-spectrum features over windows (size 500, step 250), scaled, fed to a
# CUDA SVM with kDegree = 2 and C = 1 that separates positives from dummy PREs
spectrumSVM = (
    nc.kSpectrum(5)
    .window(size = 500, step = 250)
    .scale()
    .model(sklcunc.CUDASVM(
        kDegree = 2,
        C = 1,
        labelPositive = nc.positive,
        labelNegative = labelDummyPRE,
    ))
    .sequenceModel(name = 'SVM (quadratic, 5-spectrum)')
)

# Quadratic 5-spectrum mismatch kernel SVM: same pipeline as the plain
# 5-spectrum SVM, but built on the kSpectrumMM(5) feature set
mismatchSVM = (
    nc.kSpectrumMM(5)
    .window(size = 500, step = 250)
    .scale()
    .model(sklcunc.CUDASVM(
        kDegree = 2,
        C = 1,
        labelPositive = nc.positive,
        labelNegative = labelDummyPRE,
    ))
    .sequenceModel(name = 'SVM (quadratic, 5-spectrum mismatch)')
)

# CNN
import gnocis.tfModels as tfnc
# Seed the TensorFlow model module with the same value as the Gnocis seed
tfnc.setSeed(12345)
# Multiclass CNN over windows (size 500, step 250): trained on all four
# labels, with the positive label as the target
CNN = tfnc.sequenceModelMultiCNN(
    name = 'Convolutional Neural Network',
    windowSize = 500,
    windowStep = 250,
    nConv = 15,
    convLen = 10,
    epochs = 250,
    targetLabel = nc.positive,
    labels = [ nc.positive, labelDummyPRE, labelDummyGenomic, labelCDS ],
)

In [None]:
# Optional: SVM-MOCCA - requires separate installation
from gnocis.MOCCAModels import sequenceModelSVMMOCCA
# SVM-MOCCA wrapper model. MOCCAPath is a placeholder and must be replaced
# with the path of the separately installed MOCCA before running.
# All three negative classes are used as negatives.
SVMMOCCA = sequenceModelSVMMOCCA(
    'SVM-MOCCA',
    MOCCAPath = 'INSERT_MOCCA_PATH_HERE',
    motifs = motifs,
    windowSize = 3000,
    windowStep = 1000,
    kDegree = 2,
    labelsPositive = [ nc.positive ],
    labelsNegative = [ labelDummyGenomic, labelDummyPRE, labelCDS ],
)

#### PREs versus dummy-PREs

In [None]:
# Cross-validation of all models: PREs (positives) versus dummy PREs (negatives).
# SVMMOCCA is kept last in the model list; the genome-wide prediction section
# retrieves it with cvPREvsDummyPRE.models[-1].
cvPREvsDummyPRE = nc.crossvalidate(
    [
        PyPREdictor,
        spectrumSVM,
        mismatchSVM,
        CNN,
        SVMMOCCA,
    ],
    trainingSet = trainingSet,
    validationSet = Kahn2014Seq.label(nc.positive) + DummyPREs.label(nc.negative),
    # 0.6875 gives the same train/test ratio as in Bredesen et al. 2019,
    # for generalization comparable with Figure 1C
    ratioTrainTest = 0.6875,
    ratioNegPos = 100.,
)
cvPREvsDummyPRE

In [None]:
# Plot the PRC (presumably precision/recall curves) for PREs vs. dummy PREs and save it as a PDF
cvPREvsDummyPRE.plotPRC(outpath = './fig3APRCPREsVsDummyPREs.pdf')

#### PREs versus coding sequences

In [None]:
# Cross-validation of the same models: PREs (positives) versus coding
# sequence fragments (negatives)
valSet = Kahn2014Seq.label(nc.positive) + CDSFragments.label(nc.negative)
cvPREvsCDS = nc.crossvalidate(
    cvPREvsDummyPRE.models,
    trainingSet = trainingSet,
    validationSet = valSet,
    # 0.6875 gives the same train/test ratio as in Bredesen et al. 2019,
    # for generalization comparable with Figure 1C
    ratioTrainTest = 0.6875,
)
cvPREvsCDS

In [None]:
# Plot the PRC (presumably precision/recall curves) for PREs vs. coding sequences and save it as a PDF
cvPREvsCDS.plotPRC(outpath = './fig3BPRCPREsVsCDS.pdf')

#### PREs versus dummy genomic

In [None]:
# Optional for the figures, but NOTE(review): the SVM runtime benchmark
# further down trains on cvPREvsDummyGenomic.cvtrain[0], so this cell has to
# be run before that one.
# Cross-validation of the same models: PREs (positives) versus dummy genomic
# sequences (negatives)
valSet = Kahn2014Seq.label(nc.positive) + DummyGenomic.label(nc.negative)
cvPREvsDummyGenomic = nc.crossvalidate(
    cvPREvsDummyPRE.models,
    trainingSet = trainingSet,
    validationSet = valSet,
    # 0.6875 gives the same train/test ratio as in Bredesen et al. 2019,
    # for generalization comparable with Figure 1C
    ratioTrainTest = 0.6875,
)
cvPREvsDummyGenomic

In [None]:
# Plot the PRC (presumably precision/recall curves) for PREs vs. dummy genomic sequences and save it as a PDF
cvPREvsDummyGenomic.plotPRC(outpath = './figExtraPRCPREsVsDummyGenomic.pdf')

### SVM runtime benchmarks

In [None]:
import gnocis.sklearnModels as sklnc
import time
import datetime

# Three copies of the quadratic 5-spectrum mismatch SVM, differing only in
# backend, so that their scoring times can be compared.
# NOTE(review): all three are trained on cvPREvsDummyGenomic.cvtrain[0];
# cvPREvsDummyGenomic is created in the cell marked "Optional", which
# therefore has to be run first.

# scikit-learn SVM, multiprocessing switched off (after training)
mismatchSVM_SingleCPU = (
    nc.kSpectrumMM(5)
    .window(size = 500, step = 250)
    .scale()
    .model(sklnc.SVM(
        kDegree = 2,
        C = 1,
        labelPositive = nc.positive,
        labelNegative = labelDummyPRE,
    ))
    .sequenceModel(name = 'Single')
)
mismatchSVM_SingleCPU = mismatchSVM_SingleCPU.getTrainer()(cvPREvsDummyGenomic.cvtrain[0])
mismatchSVM_SingleCPU.enableMultiprocessing = False

# scikit-learn SVM, enableMultiprocessing left untouched
mismatchSVM_MultiCPU = (
    nc.kSpectrumMM(5)
    .window(size = 500, step = 250)
    .scale()
    .model(sklnc.SVM(
        kDegree = 2,
        C = 1,
        labelPositive = nc.positive,
        labelNegative = labelDummyPRE,
    ))
    .sequenceModel(name = 'Multiprocessing')
)
mismatchSVM_MultiCPU = mismatchSVM_MultiCPU.getTrainer()(cvPREvsDummyGenomic.cvtrain[0])

# CUDA SVM
mismatchSVM_CUDA = (
    nc.kSpectrumMM(5)
    .window(size = 500, step = 250)
    .scale()
    .model(sklcunc.CUDASVM(
        kDegree = 2,
        C = 1,
        labelPositive = nc.positive,
        labelNegative = labelDummyPRE,
    ))
    .sequenceModel(name = 'CUDA')
)
mismatchSVM_CUDA = mismatchSVM_CUDA.getTrainer()(cvPREvsDummyGenomic.cvtrain[0])

def timerun(func):
    """Call `func` once and return the elapsed time as a string.

    func: zero-argument callable to time. Its return value is discarded.
    Returns: str(datetime.timedelta) of the elapsed seconds,
        e.g. '0:00:01.234567'.
    Any exception raised by `func` propagates to the caller.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock is
    # adjusted while the benchmark is running.
    t0 = time.perf_counter()
    func()
    return str(datetime.timedelta(seconds = time.perf_counter() - t0))

# Table of scoring times: each trained model scores the DummyGenomic set
# once, timed with timerun. The lambda is invoked immediately inside timerun,
# so it sees the current `model` of the comprehension.
nc.nctable(
    'Running times',
    [
        {
            'Name': model.name,
            'Time': str(timerun( lambda: model.getSequenceScores(DummyGenomic) )),
        }
        for model in (
            mismatchSVM_SingleCPU,
            mismatchSVM_MultiCPU,
            mismatchSVM_CUDA,
        )
    ],
)

In [None]:
# Number of sequences scored in the runtime benchmark
len(DummyGenomic)

### Genome-wide prediction

In [None]:
# Index into cvPREvsDummyPRE.cvtrain / cvval used for genome-wide prediction
rep = 0
# Train every cross-validated model on that training set, leaving out
# SVM-MOCCA (it is handled separately via core-PRE prediction)
trainedModels = [
    model.getTrainer()(cvPREvsDummyPRE.cvtrain[rep])
    for model in cvPREvsDummyPRE.models
    if not model.name == 'SVM-MOCCA'
]

In [None]:
# Positives of the validation set belonging to the chosen repeat
vpos = cvPREvsDummyPRE.cvval[rep].withLabel(nc.positive)
# Calibrate each trained model for genome-wide prediction at precision 0.8.
# factor is 1 minus the ratioTrainTest value (0.6875) used in cross-validation
# (presumably the held-out fraction of positives — confirm).
for mdl in trainedModels:
    print('Calibrating model: %s'%mdl.name)
    mdl.calibrateGenomewidePrecision(
        positives = vpos,
        genome = Dmel,
        factor = 1. - 0.6875,
        precision = 0.8,
        bgModelOrder = 4,
    )

In [None]:
# Genome-wide predictions, one entry per trained model, in the same order as
# trainedModels (the plotting cells rely on this order when zipping)
modelPredictions = []

for mdl in trainedModels:
    # Progress message, since prediction over the whole genome may take a while
    print('Predicting with model: %s'%mdl.name)
    modelPredictions.append( mdl.predict(Dmel.sequences) )

In [None]:
# Optional: train SVM-MOCCA (the last model in the cross-validation list) on
# the chosen repeat and append its core-PRE predictions, using the same
# factor / precision / background model order as for the other models
SVMMOCCA = cvPREvsDummyPRE.models[-1].getTrainer()(cvPREvsDummyPRE.cvtrain[rep])
modelPredictions.append(
    SVMMOCCA.predictCore(
        vpos = vpos,
        genome = Dmel,
        factor = 1. - 0.6875,
        precision = 0.8,
        bgModelOrder = 4,
    )
)

In [None]:
# Append SVM-MOCCA predictions loaded from pred.gff under the model's tmpPath.
# NOTE(review): the preceding cell also appends an SVM-MOCCA prediction; if
# both cells are run, modelPredictions holds two SVM-MOCCA entries and the
# zip(...) calls in the plotting cells silently drop the extra one —
# presumably only one of the two cells is meant to be run.
modelPredictions += [
    nc.loadGFF(SVMMOCCA.tmpPath + 'pred.gff')
]

In [None]:
# Overlap sensitivity of each model's predictions against HBME regions and
# Kahn et al. PREs (displayed inline). SVM-MOCCA predictions are used
# directly; for the other models .regions() is taken first.
nc.overlapSensitivityBarplot(
    regionSets = [
        HBMERgn.rename('HBME'),
        Kahn2014Rgn.rename('Kahn 2014'),
    ],
    predictionSets = [
        pred.rename(mdl.name) if mdl.name == 'SVM-MOCCA'
        else pred.regions().rename(mdl.name)
        for mdl, pred in zip(cvPREvsDummyPRE.models, modelPredictions)
    ],
)

In [None]:
# Same overlap sensitivity barplot, written to fig4AOverlapSensitivity.pdf.
# SVM-MOCCA predictions are used directly; for the other models .regions()
# is taken first.
nc.overlapSensitivityBarplot(
    regionSets = [
        HBMERgn.rename('HBME'),
        Kahn2014Rgn.rename('Kahn 2014'),
    ],
    predictionSets = [
        pred.rename(mdl.name) if mdl.name == 'SVM-MOCCA'
        else pred.regions().rename(mdl.name)
        for mdl, pred in zip(cvPREvsDummyPRE.models, modelPredictions)
    ],
    outpath = "fig4AOverlapSensitivity.pdf",
)

In [None]:
# Nucleotide precision of each model's predictions (displayed inline). The
# reference sets are passed through deltaResize(1000) and flatten() first.
# SVM-MOCCA predictions are used directly; for the other models .regions()
# is taken first.
nc.nucleotidePrecisionBarplot(
    regionSets = [
        HBMERgn.deltaResize(1000).flatten().rename('HBME'),
        Kahn2014Rgn.deltaResize(1000).flatten().rename('Kahn 2014'),
    ],
    predictionSets = [
        pred.rename(mdl.name) if mdl.name == 'SVM-MOCCA'
        else pred.regions().rename(mdl.name)
        for mdl, pred in zip(cvPREvsDummyPRE.models, modelPredictions)
    ],
)

In [None]:
# Same nucleotide precision barplot, written to fig4BNucleotidePrecision.pdf.
# The reference sets are passed through deltaResize(1000) and flatten() first.
# SVM-MOCCA predictions are used directly; for the other models .regions()
# is taken first.
nc.nucleotidePrecisionBarplot(
    regionSets = [
        HBMERgn.deltaResize(1000).flatten().rename('HBME'),
        Kahn2014Rgn.deltaResize(1000).flatten().rename('Kahn 2014'),
    ],
    predictionSets = [
        pred.rename(mdl.name) if mdl.name == 'SVM-MOCCA'
        else pred.regions().rename(mdl.name)
        for mdl, pred in zip(cvPREvsDummyPRE.models, modelPredictions)
    ],
    outpath = "fig4BNucleotidePrecision.pdf",
)

In [None]:
# Overlap precision of each model's predictions against HBME regions and
# Kahn et al. PREs (displayed inline). SVM-MOCCA predictions are used
# directly; for the other models .regions() is taken first.
nc.overlapPrecisionBarplot(
    regionSets = [
        HBMERgn.rename('HBME'),
        Kahn2014Rgn.rename('Kahn 2014'),
    ],
    predictionSets = [
        pred.rename(mdl.name) if mdl.name == 'SVM-MOCCA'
        else pred.regions().rename(mdl.name)
        for mdl, pred in zip(cvPREvsDummyPRE.models, modelPredictions)
    ],
)

In [None]:
# Same overlap precision barplot, written to figExtraOverlapPrecision.pdf.
# SVM-MOCCA predictions are used directly; for the other models .regions()
# is taken first.
nc.overlapPrecisionBarplot(
    regionSets = [
        HBMERgn.rename('HBME'),
        Kahn2014Rgn.rename('Kahn 2014'),
    ],
    predictionSets = [
        pred.rename(mdl.name) if mdl.name == 'SVM-MOCCA'
        else pred.regions().rename(mdl.name)
        for mdl, pred in zip(cvPREvsDummyPRE.models, modelPredictions)
    ],
    outpath = "figExtraOverlapPrecision.pdf",
)

In [None]:
# Genome tracks over the invected-proximal region (displayed inline): the
# genome, HBME regions, Kahn et al. PREs, then one track per model prediction,
# named after the models in the cross-validation list.
nc.plotGenomeTracks(
    [
        Dmel,
        HBMERgn.rename('HBME'),
        Kahn2014Rgn.rename('Kahn 2014'),
    ] + [
        track.rename(model.name)
        for model, track in zip(cvPREvsDummyPRE.models, modelPredictions)
    ],
    invectedProximal[0].seq,
    invectedProximal[0].start,
    invectedProximal[0].end,
)

In [None]:
# Genome tracks over the invected-proximal region, written to
# fig5InvectedLocus.pdf: the genome, HBME regions, Kahn et al. PREs, then one
# track per trained model.
# NOTE(review): this zips with trainedModels (SVM-MOCCA excluded), whereas
# the inline version of this plot zips with cvPREvsDummyPRE.models; if an
# SVM-MOCCA prediction was appended to modelPredictions it is dropped from
# the saved figure — confirm that this is intended.
nc.plotGenomeTracks(
    [
        Dmel,
        HBMERgn.rename('HBME'),
        Kahn2014Rgn.rename('Kahn 2014'),
    ] + [
        track.rename(model.name)
        for model, track in zip(trainedModels, modelPredictions)
    ],
    invectedProximal[0].seq,
    invectedProximal[0].start,
    invectedProximal[0].end,
    outpath = "fig5InvectedLocus.pdf",
)