In [2]:
# Standard library imports
import sys
import os
from pathlib import Path
## Lets us import modules from dev: register the resolved parent directory
## on the module search path, but only if it is not listed there already
path = Path('..').resolve()
if sys.path.count(str(path)) == 0:
    sys.path.append(str(path))

# External imports
import numpy as np
import pandas as pd
import holoviews as hv
from holoviews import dim         ## Requires python 3.7; not 3.5
from holoviews import opts

# Internal imports
from dev.src import io
from src import kmer
from src import ml

In [3]:
## **********
## *** The following parameters SHOULD NOT be changed
## ***
## *** All three locations are read from environment variables of the same
## *** name and wrapped in Path objects. A variable that is not set makes
## *** the lookup raise KeyError, so this cell fails before any analysis runs.
## **********
OUTPUT_DIR           = Path(os.environ['OUTPUT_DIR'])
FREQUENCY_OUTPUT_DIR = Path(os.environ['FREQUENCY_OUTPUT_DIR'])
DATA_DIR             = Path(os.environ['DATA_DIR'])

## For running notebooks locally
## (comment out the environment lookups above and enable the lines below)
## NOTE(review): no local value for DATA_DIR is provided here -- confirm
## whether one is needed when running without the environment variables.
# OUTPUT_DIR           = Path('../../output')
# FREQUENCY_OUTPUT_DIR = Path(OUTPUT_DIR, 'frequency_tables')

In [4]:
## **********
## *** The following parameters CAN be changed
## ***
## *** Every entry is a directory prefix below FREQUENCY_OUTPUT_DIR; the
## *** analysis cells append the Kmer suffix (and, for windowed data, the
## *** window size suffix) before reading.
## **********
## Kmer frequencies of whole genomes (WG)
DR_WG_DIR = FREQUENCY_OUTPUT_DIR.joinpath('default/fasta/ref_genomes/danio_rerio')
DM_WG_DIR = FREQUENCY_OUTPUT_DIR.joinpath('default/fasta/ref_genomes/drosophila_melanogaster')
HS_WG_DIR = FREQUENCY_OUTPUT_DIR.joinpath('default/fasta/ref_genomes/homo_sapiens')
MM_WG_DIR = FREQUENCY_OUTPUT_DIR.joinpath('default/fasta/ref_genomes/mus_musculus')
SC_WG_DIR = FREQUENCY_OUTPUT_DIR.joinpath('default/fasta/ref_genomes/saccharomyces_cerevisiae')
CG_WG_DIR = FREQUENCY_OUTPUT_DIR.joinpath('default/fasta/cas_genes/cas_genes')

## Kmer frequencies of randomly placed genome windows (RWG)
SPCAS_RWG_DIR = FREQUENCY_OUTPUT_DIR.joinpath('randomisation/fastq/ref_genomes_cas_genes/homo_sapiens_10_SpCas9')

## Kmer frequencies of sequential genome windows (SWG): none defined yet

In [5]:
## **********
## *** Investigate the effect of different Kmer parameters
## *** by analysing the whole genome Kmer frequencies.
## ***
## *** For each Kmer length the frequency tables of every directory in DIRS
## *** are read, passed through feature selection and reduction, and drawn
## *** as a scatter overlay grouped by kmer.FILE_COL_NAME. The overlays are
## *** collected in a HoloMap keyed on the Kmer length.
## **********
hv.extension('bokeh')
plots = hv.HoloMap(kdims='kmer')
DIRS  = [DR_WG_DIR, DM_WG_DIR, HS_WG_DIR, MM_WG_DIR, SC_WG_DIR, CG_WG_DIR]

## Plot styling does not depend on the Kmer length, so define it once
options = [opts.Scatter(tools=['hover'], width=700, height=700, size=10)]

for i in range(1, 6):            ## Update range when necessary
    kStr     = str(i) + 'mer'
    kmerDirs = [str(p) + "_" + kStr for p in DIRS]   ## e.g. <dir>_3mer
    print(kStr)

    ## Read Kmer frequencies
    kmerDf = io.kmer.read(*kmerDirs)
    (kmerId, kmerCounts) = kmer.rotateAndSplit(kmerDf)

    ## Perform feature selection & reduction
    kmerCounts         = ml.feature.select(kmerId, kmerCounts)
    (pca, pcaColNames) = ml.feature.reduce(kmerCounts)

    ## Create the analysis table
    kmerPca = ml.joinColumns(kmerId, pcaColNames, pca)

    ## Table formatting: drop the Kmer suffix so that the group labels are the
    ## same for every Kmer length. The suffix is a literal string, not a
    ## pattern; regex=False states that explicitly instead of relying on the
    ## pandas default, which differs between major versions.
    ## (assumes kmerPca is a pandas DataFrame -- TODO confirm)
    kmerPca[kmer.FILE_COL_NAME] = kmerPca[kmer.FILE_COL_NAME].str.replace('_' + kStr, '', regex=False)

    ## Plot the figure
    d = hv.Dataset(kmerPca, ml.PCA_DATA_COL_NAMES)
    s = d.to(hv.Scatter, ml.PCA_DATA_COL_NAMES, groupby=kmer.FILE_COL_NAME).overlay()

    ## Style plots
    s.opts(options)
    plots[i] = s

## Last expression of the cell: display the collated HoloMap
plots = plots.collate()
plots

1mer
2mer
3mer
4mer
5mer


In [6]:
## **********
## *** Investigate the effect of different Kmer parameters
## *** by analysing the whole genome Kmer frequencies with random window Kmer frequencies
## ***
## *** For each (window size, Kmer length) pair the frequency tables of every
## *** directory in DIRS are read, passed through feature selection and
## *** reduction, and drawn as a scatter overlay grouped by
## *** kmer.FILE_COL_NAME. The overlays are collected in a HoloMap keyed on
## *** (windowsize, kmer).
## **********

hv.extension('bokeh')
plots = hv.HoloMap(kdims=['windowsize', 'kmer'])
DIRS  = [SPCAS_RWG_DIR]

## Plot styling does not depend on the loop variables, so define it once
options = [opts.Scatter(tools=['hover'], width=700, height=700, size=3)]

for windowSize in [250, 2500, 25000]:
    ## Depends on the window size only, so build it outside the Kmer loop
    windowStr = str(windowSize) + 'bp'

    for i in range(1, 6):            ## Update range when necessary
        kStr      = str(i) + 'mer'
        dataStr   = "_".join([windowStr, kStr])             ## e.g. 250bp_3mer
        kmerDirs  = [str(p) + "_" + dataStr for p in DIRS]
        print(dataStr)

        ## Read Kmer frequencies
        kmerDf = io.kmer.read(*kmerDirs)
        (kmerId, kmerCounts) = kmer.rotateAndSplit(kmerDf)

        ## Perform feature selection & reduction
        kmerCounts         = ml.feature.select(kmerId, kmerCounts)
        (pca, pcaColNames) = ml.feature.reduce(kmerCounts)

        ## Create the analysis table
        kmerPca = ml.joinColumns(kmerId, pcaColNames, pca)

        ## Table formatting: drop the Kmer and window size suffixes so that the
        ## group labels are the same for every parameter combination. Both
        ## suffixes are literal strings, not patterns; regex=False states that
        ## explicitly instead of relying on the pandas default, which differs
        ## between major versions.
        ## (assumes kmerPca is a pandas DataFrame -- TODO confirm)
        kmerPca[kmer.FILE_COL_NAME] = kmerPca[kmer.FILE_COL_NAME].str.replace('_' + kStr, '', regex=False)
        kmerPca[kmer.FILE_COL_NAME] = kmerPca[kmer.FILE_COL_NAME].str.replace('_' + windowStr, '', regex=False)

        ## Plot the figure
        d = hv.Dataset(kmerPca, ml.PCA_DATA_COL_NAMES)
        s = d.to(hv.Scatter, ml.PCA_DATA_COL_NAMES, groupby=kmer.FILE_COL_NAME).overlay()

        ## Style plots
        s.opts(options)
        plots[(windowSize, i)] = s

## Last expression of the cell: display the collated HoloMap
plots = plots.collate()
plots

250bp_1mer
250bp_2mer
250bp_3mer
250bp_4mer
250bp_5mer
2500bp_1mer
2500bp_2mer
2500bp_3mer
2500bp_4mer
2500bp_5mer
25000bp_1mer
25000bp_2mer
25000bp_3mer
25000bp_4mer
25000bp_5mer
