In [6]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to the imported project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# IMPORTS

In [9]:
import numpy as np

from src.features.motif import (motif_enumeration, 
                                motif_entropy, 
                                median_string,
                                most_probable_kmer,
                                compute_pr,
                                greedy_motif_search,
                                greedy_motif_search_with_pseudocounts,
                                distance_between_pattern_and_strings)
from src.data.io import read_text_from_file, print_list

# PATHS & NAMES

In [8]:
# Folder (relative path) that holds the dataset*.txt inputs read by the test cells.
RAWFILE_FOLDER = "../data/raw/bi_i/w3"
# Reports folder (relative path).
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
EDA_REPORT_FOLDER = "../reports"

# TESTS

## test 1

In [17]:
# Test 1: run motif_enumeration on dataset1 and print the result.
# param_2 is split on single spaces into a list of strings; the first two
# space-separated tokens of param_1 are parsed as integers
# (presumably k and d — verify against motif_enumeration's signature).
params = read_text_from_file(f"{RAWFILE_FOLDER}/dataset1.txt")
dna_strings = params['param_2'].split(" ")
int_tokens = params['param_1'].split(" ")
print_list(motif_enumeration(dna_strings, int(int_tokens[0]), int(int_tokens[1])))

TGTGC TGTCC TGTAC TGTTC


## test 2

In [34]:
# Test 2: run median_string on dataset2.
# param_1 is parsed as an integer; every other entry of the file is passed
# along, in file order, as the list of strings to search.
params = read_text_from_file(f"{RAWFILE_FOLDER}/dataset2.txt")
pattern_size = int(params['param_1'])
dna_strings = [value for key, value in params.items() if key != "param_1"]
median_string(pattern_size, dna_strings)

'CAGGCC'

## test 3

In [93]:
# Test 3: run most_probable_kmer on dataset3.
# param_1 is passed through as a string, param_2 is parsed as an integer, and
# every remaining entry is parsed as one row of space-separated floats
# (presumably the profile matrix rows — verify against most_probable_kmer).
params = read_text_from_file(f"{RAWFILE_FOLDER}/dataset3.txt")
text = params["param_1"]
kmer_size = int(params["param_2"])
profile_rows = [
    [float(token) for token in row.split(" ")]
    for name, row in params.items()
    if name not in ("param_1", "param_2")
]
most_probable_kmer(text, kmer_size, profile_rows)

'GGCATGAATGCGGC'

## test (compute_pr)

In [10]:
# Hand-written profile: one list of six per-position values for each nucleotide.
Profile = dict(
    A=[0.4, 0.3, 0.0, 0.1, 0.0, 0.9],
    C=[0.2, 0.3, 0.0, 0.4, 0.0, 0.1],
    G=[0.1, 0.3, 1.0, 0.1, 0.5, 0.0],
    T=[0.3, 0.1, 0.0, 0.4, 0.5, 0.0],
)

# Six-letter sequence scored against the profile above.
sequence = "CAGTGA"
compute_pr(sequence, Profile)

0.0108

## test 4

In [97]:
# Test 4: run greedy_motif_search on dataset4 and print the result.
# param_2 is split on single spaces into a list of strings; the first two
# space-separated tokens of param_1 are parsed as integers
# (presumably k and t — verify against greedy_motif_search's signature).
params = read_text_from_file(f"{RAWFILE_FOLDER}/dataset4.txt")
dna_strings = params["param_2"].split(" ")
int_tokens = params['param_1'].split(" ")
print_list(greedy_motif_search(dna_strings, int(int_tokens[0]), int(int_tokens[1])))

GGTGCGGTTGCC TTACCCGTAAAT GCGGTTGACAAA TTTCCCGTGAAG TTACCCGTGAAG TGACCCGTGAAC TGGCCCGTGAAA CCCCCTGAGTAT CACAGTGACCAC TTTCCCGTGAAC TTTCCCGTTAAA TTCCCCGTTAAC TACCCCGTAAAC TGTCCCGTAAAC TCGCCCGTCAAC TACCCCGTTAAC TAGCCCGTGAAA TACCCCGTAAAG TTACCCGTAAAA TCTCCCGTTAAG TGTCCCGTCAAT TTCCCCGTAAAT TTCCCCGTTAAT TGACCCGTGAAG TCTCCCGTAAAA


## test 5

In [102]:
# Test 5: run greedy_motif_search_with_pseudocounts on dataset5 and print the result.
# Inputs are parsed as follows: param_2 is split on single spaces into a list of
# strings, and the first two space-separated tokens of param_1 are parsed as integers.
params = read_text_from_file(f"{RAWFILE_FOLDER}/dataset5.txt")
dna_strings = params["param_2"].split(" ")
int_tokens = params['param_1'].split(" ")
print_list(
    greedy_motif_search_with_pseudocounts(
        dna_strings,
        int(int_tokens[0]),
        int(int_tokens[1]),
    )
)

AAAACATACGGG ATAAACTACGGG ATAAAATACGGG ATAAGTTGCGGG ATAACGTGCGGG AGAAGCTCCGGG ATAAGGTACGGG ATAACTTTCGGG ATAAATTCCGGG AGAATCTGCGGG AGAAGCTGCGGG ATAATCTCCGGG ATAAAATCCGGG AGAAGTTCCGGG AAAAAATTCGGG AAAATGTCCGGG AAAAACTTCGGG ATAAAATACGGG AGAAACTCCGGG ATAACGTCCGGG AAAATCTTCGGG ACAAGGTTCGGG ACAATGTCCGGG AGAACGTGCGGG AAAACTTCCGGG


## test 6

In [108]:
# Test 6: run distance_between_pattern_and_strings on dataset6.
# param_1 is passed through as the pattern string; param_2 is split on single
# spaces into the list of strings.
params = read_text_from_file(f"{RAWFILE_FOLDER}/dataset6.txt")
pattern = params["param_1"]
dna_strings = params["param_2"].split(" ")
distance_between_pattern_and_strings(pattern, dna_strings)

77

## test 7

In [96]:
# Test 7: print the result of neighbors() for the string in param_1 and the
# integer parsed from param_2.
# NOTE(review): `neighbors` is not among the names imported in this notebook's
# import cell; on a fresh kernel this call raises NameError unless it is imported
# elsewhere — TODO add the import from its defining module.
params = read_text_from_file(f"{RAWFILE_FOLDER}/dataset7.txt")
print_list(neighbors(params["param_1"], int(params["param_2"])))

GAATGCCC GTAAACCC GAATTGCC GCATCCCC GCATTGCC ATATGCCC ATATTTCC GTATCCGC GTACCCCC GTATACCA GTTATCCC GTATTGGC GTATTCCG GTATTGAC GTCTTACC CTATTCCG GTACTCCG GCATTCCG TCATTCCC GTATTGCC GAACTCCC AAATTCCC AGATTCCC GAATTACC GTGTACCC GTATTCAC GTAATCTC GTAATCCT TTACTCCC GCTTTCCC GTAGTCAC GTATTCCA GTCTTCCG GTTTTGCC GCATTTCC GTTTTCCC GTATACCT TTATTCCA GTATACGC TTCTTCCC GTGTTCCG ATATTCCT TTTTTCCC ATAATCCC GGAGTCCC GTTTGCCC CTATTTCC ATATTGCC ATATTCGC GAATACCC GTAATCCC ATGTTCCC GTCGTCCC GTTTTCCT GCACTCCC GTAGTCTC GAATTTCC GAATTCGC GCATTCCT GTGTGCCC TTATTCTC GACTTCCC GTGTTCCC GTTTTCAC GTATTACG GTAGTCCA CTATTCCA GCGTTCCC GTATACTC GGATTCCC GGATTCCA GCATTCCC TTATTGCC GTATTAAC GTATTGCT GTCTTCTC CTATTCTC TTATTCCG GGATCCCC GTGCTCCC CTATACCC GTACACCC GTATACAC GCATTCAC GTTTTTCC GTGTTCCA GTAGTCGC TTATTCCC GTATGGCC GTGGTCCC CTATCCCC GGAATCCC ATATTCTC GAGTTCCC GTATTACC ATTTTCCC GTACTCCC GTATATCC GTATTCAT ATATTCAC GTTTTCCG GAATTCCG GTATTCCC GTATTTAC GTATGCGC GTAACCCC GTATTCAA GTATTATC GTAATACC GTAATTCC TTAATCCC T

## entropy

In [None]:
# NOTE(review): numpy is already imported in the notebook's import cell; imports
# would normally all live there so a fresh Run All picks them up in order.
import numpy as np
from scipy.stats import entropy
# Logarithm base passed to entropy() by the cells that follow (base 2 -> bits).
base = 2

In [4]:
# Four-outcome distribution with two equally likely outcomes and two impossible ones.
# Mind the decimal points: writing `0,5` instead of `0.5` yields two separate
# elements (0 and 5), which entropy() would silently normalise into a different
# distribution instead of raising an error.
pk = np.array([0.5, 0, 0, 0.5])
entropy(pk, base=base)

1.0

In [5]:
# Uniform distribution over four outcomes.
pk = np.array([0.25, 0.25, 0.25, 0.25])
entropy(pk, base=base)

2.0

In [6]:
# Degenerate distribution: all probability mass on a single outcome.
pk = np.array([0, 0, 0, 1])
entropy(pk, base=base)

0.0

In [7]:
# Mixed distribution: one impossible outcome, one with probability 0.5, two with 0.25.
pk = np.array([0.25, 0, 0.5, 0.25])
entropy(pk, base=base)

1.5