# ORF Density Notebook

- status: finished
- implementation for evolution necessary
- could also be necessary for better positional k-mers
- no real knowledge gained!!
- discussing with manja again
- other idea:
    - Kozak Sequence Probability (ORF Start/non-ORF Start)
    - Frameshift Probability (Start, Stop of another Frame)
    - All Stop/Start Positions plus Frame 
    - Prediction by prior knowledge 
        - eg. 3 proteins, 2 shifts and 1 alternative init (Kozak)
        - calculate best 3 and weight probabilities by occurrence of events
        - alternative splicing completely unimportant
        - only the protein frames with shifts and alternative inits important

![ORF](ORF.svg)

In [5]:
from Bio import SeqIO
from Bio.Seq import Seq
import re

In [60]:
# Parse the file 'A.fasta' in FASTA format; records are pulled from `fasta` on demand.
fasta = SeqIO.parse("A.fasta", "fasta")

In [61]:
# Take the next record from the parser and keep its name and its sequence as a plain string.
entry = next(fasta)
header, sequence = entry.name, str(entry.seq)

In [62]:
# Case-insensitive pattern for the start codon (ATG) and the three stop codons
# (TAA, TAG, TGA); compiled once so it can be reused on the whole sequence.
regex = re.compile(r'ATG|TAA|TAG|TGA', re.IGNORECASE | re.VERBOSE)

In [63]:
# Collect every start/stop codon together with its 0-based position.
# regex.finditer() only reports non-overlapping matches, so a codon that
# begins inside the previous hit (e.g. the TGA in "ATGA", which lies in a
# different reading frame) would be skipped. Anchoring a match attempt at
# every position keeps those overlapping codons as well.
codonlist = [
    (codon.group(0), codon.start(0))
    for codon in (regex.match(sequence, pos) for pos in range(len(sequence)))
    if codon is not None
]

In [64]:
# Annotate each codon with its reading frame (position modulo 3).
# The original cell read from the name `a`, which no visible cell defines;
# the (codon, position) tuples it indexed match `codonlist`, so that list is
# used here. NOTE(review): confirm `a` was not meant to be a different list.
framelist = [(codon, pos, pos % 3) for codon, pos in codonlist]

In [70]:
# Number of entries in framelist; denominator for all rates below.
# Dividing by n raises ZeroDivisionError when framelist is empty.
n = len(framelist)

# Hand-set event counts.
# NOTE(review): presumably f = frameshifts and k = alternative (Kozak)
# initiations, following the notes at the top of the notebook; the meaning
# of p is not stated anywhere visible — confirm.
f = 2
p = 2
k = 1

# Per-codon rates derived from the counts; x feeds the transition table,
# y and z feed the emission tables of the frame HMM.
x = f/n
y = p/n
z = p/n + k/n

In [77]:
# Draft HMM over the three reading frames, parameterised by x, y and z.
observations = ("StopORF", "ORF", "StartORF", "NonORF")
states = ("Frame1", "Frame2", "Frame3")

# Uniform prior over the frames.
start_p = {frame: 1/3 for frame in states}

# Stay in the same frame with probability 1-2*x; move to either other frame with x.
trans_p = {
    src: {dst: (1-2*x if dst == src else x) for dst in states}
    for src in states
}

# Every frame gets the same pair of emission tables, keyed 0 and 1.
emit_p = {
    frame: {
        0: {"StopORF": y, "ORF": 1-y},
        1: {"StartORF": z, "NonORF": 1-z},
    }
    for frame in states
}

In [79]:
# Inspect the transition probabilities out of Frame2.
trans_p["Frame2"]

{'Frame1': 0.01639344262295082,
 'Frame2': 0.9672131147540983,
 'Frame3': 0.01639344262295082}

In [90]:
# Toy Healthy/Fever HMM used to exercise the Viterbi implementation.
# NOTE: this rebinds observations, states, start_p, trans_p and emit_p.
observations = (
    "normal", "cold", "dizzy", "normal", "normal", "cold",
    "dizzy", "cold", "dizzy", "cold", "dizzy",
)
states = ("Healthy", "Fever")
start_p = dict(Healthy=0.6, Fever=0.4)
trans_p = dict(
    Healthy=dict(Healthy=0.7, Fever=0.3),
    Fever=dict(Healthy=0.4, Fever=0.6),
)
emit_p = dict(
    Healthy=dict(normal=0.5, cold=0.4, dizzy=0.1),
    Fever=dict(normal=0.1, cold=0.3, dizzy=0.6),
)

In [91]:
def viterbi_algorithm(observations, states, start_p, trans_p, emit_p):
    """Print the Viterbi table and the most likely hidden-state path.

    Args:
        observations: Sequence of observed symbols; must be non-empty.
        states: Sequence of hidden-state names (strings, they are joined
            into the summary line).
        start_p: Mapping ``state -> initial probability``.
        trans_p: Mapping ``state -> {next_state -> transition probability}``.
        emit_p: Mapping ``state -> {symbol -> emission probability}``.

    The rows produced by ``dptable`` are printed first, followed by one
    summary line with the best path and its probability. Nothing is returned.
    """
    # V[t][state] holds the probability of the best path that ends in `state`
    # at step t, plus the predecessor state on that path.
    V = [{
        st: {"prob": start_p[st] * emit_p[st][observations[0]], "prev": None}
        for st in states
    }]

    for t in range(1, len(observations)):
        prev_column = V[t - 1]
        column = {}
        for st in states:
            candidates = [
                (prev_column[prev_st]["prob"] * trans_p[prev_st][st], prev_st)
                for prev_st in states
            ]
            # max() keeps the first maximal entry, so ties go to the state
            # listed earliest in `states`.
            max_tr_prob, prev_st_selected = max(candidates, key=lambda c: c[0])
            column[st] = {
                "prob": max_tr_prob * emit_p[st][observations[t]],
                "prev": prev_st_selected,
            }
        V.append(column)

    for line in dptable(V):
        print(line)

    # Choose the most probable final state. Taking max() over the last column
    # (rather than comparing against an initial 0.0) still selects a state
    # when every probability is 0.0, e.g. after floating-point underflow on a
    # long observation sequence; previously that left the state as None and
    # the backtrace below failed.
    last_column = V[-1]
    best_st = max(last_column, key=lambda st: last_column[st]["prob"])
    max_prob = last_column[best_st]["prob"]

    # Follow the "prev" pointers backwards, then flip to chronological order.
    opt = [best_st]
    for t in range(len(V) - 1, 0, -1):
        opt.append(V[t][opt[-1]]["prev"])
    opt.reverse()

    print("The steps of states are " + " ".join(opt) + " with highest probability of %s" % max_prob)


def dptable(V):
    """Yield the Viterbi table as text lines.

    The first line is a header of step indices (each right-aligned in 12
    columns); each following line is one state, with every probability
    rendered via "%f" and cut to 7 characters. State names are cut to 7
    characters as well.
    """
    yield " ".join(("%12d" % i) for i in range(len(V)))
    for state in V[0]:
        yield "%.7s: " % state + " ".join("%.7s" % ("%f" % v[state]["prob"]) for v in V)

In [92]:
# Run Viterbi on the current observations/states/probability tables; the
# table and the best path are printed by the function itself.
viterbi_algorithm(observations, states, start_p, trans_p, emit_p)

           0            1            2            3            4            5            6            7            8            9           10
Healthy: 0.30000 0.08400 0.00588 0.00302 0.00105 0.00029 0.00002 0.00000 0.00000 0.00000 0.00000
Fever: 0.04000 0.02700 0.01512 0.00090 0.00009 0.00009 0.00005 0.00001 0.00000 0.00000 0.00000
The steps of states are Healthy Healthy Fever Healthy Healthy Healthy Fever Fever Fever Fever Fever with highest probability of 2.2399090237439986e-07


In [2]:
import networkx as nx

In [3]:
# Undirected chain A - B - C - D - E: five single-letter nodes, each linked
# to its successor.
G = nx.Graph()
G.add_nodes_from("ABCDE")
G.add_edges_from(zip("ABCD", "BCDE"))

In [12]:
# Display the graph object.
G

<networkx.classes.graph.Graph at 0x7f443c5b3e80>

In [11]:
# `sp` is not defined in any cell shown in this notebook; rebuild it as the
# all-pairs shortest-path table so this lookup works when the cells are run
# top to bottom. dict() accepts both a plain dict and an iterator of
# (source, paths) pairs, whichever the installed networkx returns.
sp = dict(nx.all_pairs_shortest_path(G))
sp["A"]["E"]

['A', 'B', 'C', 'D', 'E']

In [8]:
# All-pairs shortest-path lengths for G.
# NOTE(review): depending on the networkx version this may be an iterator of
# (source, lengths) pairs rather than a dict — confirm before indexing it.
spl = nx.all_pairs_shortest_path_length(G)