### Imports

In [None]:
# Plotting (plotly), tabular data handling (pandas) and standard-library helpers.
import plotly.express as px 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd 
import os 
import plotly.io as pio
# Clear the MathJax setting on plotly's Kaleido (static image export) scope.
# NOTE(review): presumably done to keep MathJax artefacts out of exported figures — confirm.
pio.kaleido.scope.mathjax = None 
from math import log

# <b>VLMC Analysis</b>
This notebook is devoted to analysing variable-length Markov chains (VLMCs).

## Amount of matching kmers when comparing two VLMCs
Here we count which kmers are matched and which are not matched when two VLMCs are compared by our distance calculation. By design, the distance calculation selects the left VLMC to be the shorter of the two VLMCs.

In [None]:
iloc_x = 1000      # plot only every iloc_x-th row so the figures stay light
rolling_x = 10000  # window size (in rows) used for the rolling average


def load_miss_streaks(path):
    """Read a match file and compute the running length of each miss streak.

    The file is read as a single headerless column named ``match``. A row
    counts as a miss when ``match != 1``.

    Returns a DataFrame with the columns ``match``, ``not_match`` (bool) and
    ``in_a_row`` (number of consecutive misses ending at that row, 0 on a
    matching row).
    """
    distribution = pd.read_csv(path, sep=",", header=None)
    distribution.columns = ['match']
    distribution['not_match'] = distribution.match != 1

    # Running total of misses, computed once and reused below.
    misses_so_far = distribution['not_match'].cumsum()
    # Total misses seen at the most recent matching row (0 before the first
    # match). Subtracting it resets the counter after every match.
    misses_at_last_match = (
        misses_so_far.where(~distribution['not_match']).ffill().fillna(0).astype(int)
    )
    distribution['in_a_row'] = misses_so_far - misses_at_last_match
    return distribution


def plot_miss_streaks(left, right, left_name, right_name, step=iloc_x, window=rolling_x):
    """Show the raw and the rolling-average miss streaks of a left/right pair.

    ``left`` and ``right`` are frames produced by ``load_miss_streaks``.
    ``left_name`` and ``right_name`` only appear in the figure titles.
    Every ``step``-th row is plotted; ``window`` is the rolling-mean width.
    """
    pair_label = f"(Left = {left_name}, Right = {right_name})"
    sides = (("Left", left), ("Right", right))

    fig = go.Figure()
    for side_name, frame in sides:
        sampled = frame.iloc[::step]
        fig.add_trace(go.Scatter(x=sampled.index, y=sampled.in_a_row, name=side_name))
    fig.update_layout(title=f"Amount of kmers missed in a row for left and right VLMC {pair_label}", yaxis_title="Missed in a row", xaxis_title='Kmer')
    fig.show()

    fig = go.Figure()
    for side_name, frame in sides:
        # The rolling mean is taken over all rows and only then down-sampled,
        # so each plotted point still averages `window` consecutive rows.
        smoothed = frame.in_a_row.rolling(window).mean().iloc[::step]
        fig.add_trace(go.Scatter(x=frame.iloc[::step].index, y=smoothed, name=side_name))
    fig.update_layout(title=f"Rolling average of amount of kmers missed in a row for left and right VLMC {pair_label}", yaxis_title="Average missed in a row", xaxis_title='Kmer')
    fig.show()


# (file suffix, left label, right label) for every compared pair.
compared_pairs = [
    ("turkey_to_human", "Turkey", "human"),
    ("human_to_human", "Human", "human"),
]

for pair_suffix, left_name, right_name in compared_pairs:
    df_left = load_miss_streaks(f"./tmp/distributions/left_distribution_{pair_suffix}.txt")
    df_right = load_miss_streaks(f"./tmp/distributions/right_distribution_{pair_suffix}.txt")
    plot_miss_streaks(df_left, df_right, left_name, right_name)

## Percentage of matching kmers 
Once again, when comparing two VLMCs, this graph displays the running fraction of kmers in each VLMC that are matched with the other VLMC.

In [None]:
sample_step = 1000  # plot only every sample_step-th row


def load_match_fraction(path):
    """Read a match file and compute the running fraction of matches.

    The file is read as a single headerless column named ``match``.

    Returns a DataFrame with the columns ``match``, ``matchsum`` (cumulative
    sum of ``match``) and ``prc_matches`` (``matchsum`` divided by the number
    of rows read so far).
    """
    distribution = pd.read_csv(path, sep=",", header=None)
    distribution.columns = ['match']
    # 1, 2, 3, ... rows seen so far; replaces a scratch column of ones.
    rows_seen = pd.Series(range(1, len(distribution) + 1), index=distribution.index)
    distribution['matchsum'] = distribution['match'].cumsum()
    distribution['prc_matches'] = distribution['matchsum'] / rows_seen
    return distribution


def plot_match_fraction(left, right, left_name, right_name, step=sample_step):
    """Plot the running match fraction of the left and right VLMC in one figure.

    ``left`` and ``right`` are frames produced by ``load_match_fraction``.
    ``left_name`` and ``right_name`` only appear in the figure title.
    """
    fig = go.Figure()
    for side_name, frame in (("Left", left), ("Right", right)):
        sampled = frame.iloc[::step]
        fig.add_trace(go.Scatter(x=sampled.index, y=sampled.prc_matches, name=side_name))
    fig.update_layout(title=f"Percentage of matches (Left = {left_name}, Right = {right_name})", yaxis_title="percentage", xaxis_title="Kmer")
    fig.show()


# (file suffix, left label, right label) for every compared pair.
compared_pairs = [
    ("turkey_to_human", "Turkey", "Human"),
    ("human_to_human", "Human", "Human"),
]

for pair_suffix, left_name, right_name in compared_pairs:
    df_left = load_match_fraction(f"./tmp/distributions/left_distribution_{pair_suffix}.txt")
    df_right = load_match_fraction(f"./tmp/distributions/right_distribution_{pair_suffix}.txt")
    plot_match_fraction(df_left, df_right, left_name, right_name)

## Counting percentage of length of kmers that is filled
For a kmer of length $k$ there are at most $4^k$ distinct kmers. Here we examine what percentage of those possible kmers is present, per kmer length, for different DNA datasets.

In [None]:
def load_length_coverage(path):
    """Read a kmer-length coverage file into a tidy frame.

    Each row of the headerless file is kept in the ``input`` column and split
    on ``_``. The first three parts become ``vlmc``, ``kmer_length`` (left as
    a string) and ``cover_prc`` (converted to float).
    """
    coverage = pd.read_csv(path, sep=",", header=None)
    coverage.columns = ['input']
    # One vectorized split instead of splitting every row three times.
    parts = coverage['input'].str.split('_', expand=True)
    coverage['vlmc'] = parts[0]
    coverage['kmer_length'] = parts[1]
    coverage['cover_prc'] = parts[2].astype(float)
    return coverage


# One figure per dataset; the dataset name goes into the title so the
# otherwise identical-looking figures can be told apart.
coverage_datasets = ["human_small", "human_medium", "human_large", "human_diverse"]

for dataset in coverage_datasets:
    df_test_VLMCs = load_length_coverage(f"./tmp/length_distributions/{dataset}_kmer-distribution.txt")
    fig = px.line(df_test_VLMCs, x='kmer_length', y='cover_prc', color='vlmc',
                  title=f"Kmer coverage per kmer length ({dataset})")
    fig.show()

## Similar to above, but examining a single VLMC using its actual kmer strings

In [None]:
def load_kmer_strings(path):
    """Read a file of kmer strings and derive per-kmer features.

    Returns a DataFrame sorted by ``kmer_length`` with the columns
    ``kmer_string``, ``start_letter`` (first character), ``kmer_length`` and
    ``new_index`` (1-based position after sorting).
    """
    # Read everything as text and disable NA detection: with the defaults a
    # token such as "NA" is parsed as NaN and would later be stringified to
    # "nan", giving a bogus length and start letter.
    kmers = pd.read_csv(path, sep=",", header=None, dtype=str, keep_default_na=False)
    kmers.columns = ['kmer_string']
    kmers['start_letter'] = kmers['kmer_string'].str[0]
    kmers['kmer_length'] = kmers['kmer_string'].str.len()
    kmers = kmers.sort_values('kmer_length')
    kmers['new_index'] = range(1, len(kmers) + 1)
    return kmers


def fill_fraction_by_length(kmers):
    """Return, per kmer length present, the share of the 4**length possible kmers.

    The result is a Series indexed by ``kmer_length``. The divisor is derived
    from each group's own length, so a missing length or a different maximum
    length cannot misalign the division.
    """
    counts = kmers.groupby('kmer_length').size()
    lengths = counts.index.to_series()
    # Float base avoids int64 overflow for long kmers.
    return counts / (4.0 ** lengths)


def plot_length_by_start_letter(kmers, label, step=1000):
    """Scatter every ``step``-th kmer: sorted position vs length, coloured by first letter."""
    sampled = kmers.iloc[::step]
    fig = px.scatter(x=sampled.new_index, y=sampled.kmer_length, color=sampled.start_letter,
                     title=f"Kmer length by sorted position ({label})")
    fig.show()


df_human = load_kmer_strings("./tmp/one_vlmcs_kmer-distribution.txt")
df_turkey = load_kmer_strings("./tmp/another_vlmcs_kmer-distribution.txt")

plot_length_by_start_letter(df_human, "human")
plot_length_by_start_letter(df_turkey, "turkey")

human_fill = fill_fraction_by_length(df_human)
turkey_fill = fill_fraction_by_length(df_turkey)
filled_human = list(human_fill)
filled_turkey = list(turkey_fill)

fig = go.Figure()
# Each trace uses its own lengths as x, so the two VLMCs need not cover the
# same set of kmer lengths.
fig.add_trace(go.Scatter(x=list(human_fill.index), y=filled_human, name='human'))
fig.add_trace(go.Scatter(x=list(turkey_fill.index), y=filled_turkey, name='turkey'))
fig.update_layout(title="Fraction of possible kmers present per kmer length", xaxis_title="Kmer length", yaxis_title="Fraction filled")
fig.show()