# Progress Report on alignment-free issue 
## 16/1/23
- Implement the co-distance measurement.

In [1]:
from cogent3 import load_aligned_seqs, make_seq
import plotly.express as px
import numpy 

In [2]:
def get_kmers(seq:str, kmer:int) -> list:
    """
    Collect every overlapping window of length `kmer` from `seq`.

    The window starts at the first character and advances one position
    at a time. The windows are returned as a list, in left-to-right
    order; the list is empty when `kmer` is larger than the sequence.
    """
    n_windows = len(seq) - kmer + 1
    windows = []
    for start in range(n_windows):
        windows.append(seq[start:start + kmer])
    return windows

def context_obj_pair(seq,context_len,obj_len) -> tuple:
    """
    Split a sequence into its context and its object.

    The object is the `obj_len` characters starting at index
    `context_len`. The context is everything to the left of the object
    joined with everything to the right of it.
    Return a (context, object) tuple.
    """
    obj_end = context_len + obj_len
    left_flank = seq[:context_len]
    right_flank = seq[obj_end:]
    O_gram = seq[context_len:obj_end]
    return left_flank + right_flank, O_gram

def check_repeat_struct(S_pair, C_gram, O_gram):
    """
    Record an object under its context in the context-object dict.

    `S_pair` is indexed by context and is modified in place: each
    context maps to the list of distinct objects seen with it so far.
    An object already listed for the context is not added again.
    """
    seen_objects = S_pair.setdefault(C_gram, [])
    if O_gram not in seen_objects:
        seen_objects.append(O_gram)

def all_pairs(genome,k,context_len,obj_len) -> dict:
    """
    Generate the unambiguous context-object pairs in a genome.

    Every distinct k-mer of `genome`, and of its reverse complement, is
    split into a context and an object by `context_obj_pair`. A context
    that occurs with more than one distinct object is discarded.

    Parameters
    ----------
    genome
        the sequence to scan, as a DNA string
    k
        length of the k-mers the genome is cut into
    context_len
        number of characters to the left of the object in each k-mer
    obj_len
        number of characters that make up the object

    Returns
    -------
    A dict mapping each retained context to a one-element list that
    holds its single object.
    """
    # Reverse-complement the whole genome once. Its k-mers are exactly the
    # reverse complements of the forward k-mers, so there is no need to
    # build a sequence object for every individual k-mer.
    reversed_genome = str(make_seq(genome, moltype="dna").rc())

    S_pair = {}
    for strand in (genome, reversed_genome):
        # a set, so each distinct k-mer is processed only once per strand
        for mer in set(get_kmers(strand, kmer=k)):
            C_gram, O_gram = context_obj_pair(mer, context_len, obj_len)
            check_repeat_struct(S_pair, C_gram, O_gram)

    # keep only the contexts that determine a single object
    return {key: val for key, val in S_pair.items() if len(val) == 1}

def co_distance(genome1, genome2, k,context_len,obj_len) -> float:
    """
    Given two genomes, compute the co-distance.

    The context-object pairs of both genomes are generated with
    `all_pairs`. Among the contexts present in both genomes, the
    co-distance is the fraction whose objects differ.

    Parameters
    ----------
    genome1, genome2
        the two sequences to compare; the result does not depend on
        their order
    k, context_len, obj_len
        passed unchanged to `all_pairs`

    Returns
    -------
    A float between 0.0 and 1.0. 0.0 is also returned when the two
    genomes share no context at all, so that case cannot be told apart
    from "all shared contexts agree".
    """
    S_pair1 = all_pairs(genome1,k,context_len,obj_len)
    S_pair2 = all_pairs(genome2,k,context_len,obj_len)

    # contexts that are unambiguous in both genomes
    shared_contexts = S_pair1.keys() & S_pair2.keys()
    if not shared_contexts:
        return 0.0

    mismatches = sum(
        1 for C_gram in shared_contexts if S_pair1[C_gram] != S_pair2[C_gram]
    )
    return mismatches / len(shared_contexts)




# Wrap the pairwise calculations into matrices
def pair_dco(seq_coll:dict, pair_distance, k,context_len,obj_len) -> numpy.ndarray: 
    """
    Build the symmetric matrix of pairwise co-distances.

    Rows and columns follow the order of `pair_distance.names`; each
    name is used as a key into `seq_coll`. The diagonal is left at zero.
    `k`, `context_len` and `obj_len` are passed on to `co_distance`.
    """
    seq_names = pair_distance.names
    num_seqs = len(seq_names)
    dcos = numpy.zeros((num_seqs, num_seqs), dtype=float)

    # visit each unordered pair once and mirror the value across the diagonal
    for row, row_name in enumerate(seq_names):
        for col in range(row + 1, num_seqs):
            col_name = seq_names[col]
            dist = co_distance(
                seq_coll[row_name], seq_coll[col_name], k, context_len, obj_len
            )
            dcos[row, col] = dcos[col, row] = dist

    return dcos


def pair_len_diff(seq_coll:dict, pair_distance) -> numpy.ndarray: 
    """
    Build the symmetric matrix of absolute sequence-length differences.

    Rows and columns follow the order of `pair_distance.names`; each
    name is used as a key into `seq_coll`. The diagonal is left at zero.
    """
    seq_names = pair_distance.names
    num_seqs = len(seq_names)
    abs_diffs = numpy.zeros((num_seqs, num_seqs), dtype=float)

    # visit each unordered pair once and mirror the value across the diagonal
    for row, row_name in enumerate(seq_names):
        for col in range(row + 1, num_seqs):
            col_name = seq_names[col]
            gap = abs(len(seq_coll[row_name]) - len(seq_coll[col_name]))
            abs_diffs[row, col] = abs_diffs[col, row] = gap

    return abs_diffs

## Load sample data

In [3]:
# Load the BRCA1 alignment as DNA; the path points into a local cogent3 checkout.
aln = load_aligned_seqs("~/repos/Cogent3/tests/data/brca1.fasta", moltype="dna")
# Optional step, disabled by default (kept for the numeric test):
#aln = aln.no_degenerates() ##numeric test

In [4]:
# Pairwise distances between the aligned sequences; later cells use this as
# the "fractional difference" and take the sequence order from its names.
pair_distance = aln.distance_matrix()
# Bare expression so the notebook displays the matrix.
pair_distance

names,Aardvark,AfricanEl,Anteater,AsianElep,Bandicoot,Caenolest,Cat,Chimpanzee,Cow,Dog,DogFaced,Dugong,FalseVamp,FlyingFox,FlyingLem,FlyingSqu,FreeTaile,Galago,GiantElep,GoldenMol,Gorilla,HairyArma,Hedgehog,Hippo,Horse,HowlerMon,Human,HumpbackW,Jackrabbit,LeafNose,LesserEle,LittleBro,Llama,Madagascar,Manatee,Mole,Mouse,NineBande,OldWorld,Orangutan,Pangolin,Phascogale,Pig,Rat,Rhesus,Rhino,RockHyrax,RoundEare,Sloth,SpermWhale,Tenrec,TombBat,TreeHyrax,TreeShrew,Wombat
Aardvark,0.0000,0.1182,0.1665,0.1203,0.3989,0.4081,0.1775,0.1701,0.2060,0.1963,0.1842,0.1175,0.1860,0.1719,0.1620,0.1871,0.1706,0.2045,0.2051,0.1543,0.1697,0.1503,0.2335,0.1844,0.1633,0.1805,0.1712,0.1774,0.2082,0.1766,0.1872,0.1848,0.1724,0.2410,0.1144,0.2045,0.2955,0.1634,0.2960,0.1708,0.1788,0.3896,0.1914,0.2902,0.1775,0.1542,0.1726,0.2596,0.1529,0.1788,0.2696,0.1808,0.1741,0.1983,0.3888
AfricanEl,0.1182,0.0000,0.1496,0.0043,0.3925,0.3973,0.1675,0.1552,0.1918,0.1841,0.1729,0.0755,0.1674,0.1624,0.1481,0.1763,0.1623,0.1937,0.1986,0.1457,0.1549,0.1316,0.2175,0.1681,0.1449,0.1646,0.1574,0.1596,0.1976,0.1648,0.1756,0.1686,0.1559,0.2388,0.0753,0.1898,0.2898,0.1486,0.2782,0.1552,0.1625,0.3798,0.1806,0.2800,0.1604,0.1406,0.1414,0.2434,0.1418,0.1619,0.2559,0.1677,0.1409,0.1848,0.3758
Anteater,0.1665,0.1496,0.0000,0.1503,0.3998,0.4051,0.1590,0.1555,0.1892,0.1782,0.1674,0.1374,0.1694,0.1583,0.1405,0.1721,0.1582,0.1930,0.2333,0.1729,0.1555,0.1019,0.2167,0.1651,0.1466,0.1658,0.1591,0.1536,0.1965,0.1596,0.2168,0.1701,0.1600,0.2592,0.1363,0.1929,0.2851,0.1134,0.2774,0.1577,0.1608,0.3894,0.1767,0.2779,0.1614,0.1388,0.2020,0.2471,0.0934,0.1550,0.2816,0.1672,0.2006,0.1851,0.3806
AsianElep,0.1203,0.0043,0.1503,0.0000,0.3933,0.3978,0.1679,0.1563,0.1937,0.1842,0.1748,0.0764,0.1685,0.1636,0.1488,0.1771,0.1627,0.1942,0.1986,0.1464,0.1556,0.1330,0.2195,0.1688,0.1449,0.1653,0.1585,0.1611,0.1992,0.1655,0.1749,0.1693,0.1578,0.2395,0.0762,0.1903,0.2891,0.1504,0.2801,0.1563,0.1622,0.3798,0.1818,0.2792,0.1612,0.1403,0.1418,0.2442,0.1425,0.1635,0.2558,0.1685,0.1412,0.1867,0.3758
Bandicoot,0.3989,0.3925,0.3998,0.3933,0.0000,0.2077,0.4121,0.4007,0.4234,0.4141,0.4195,0.3878,0.4229,0.4121,0.3981,0.4067,0.4082,0.4128,0.4286,0.4076,0.4011,0.3913,0.4360,0.4192,0.3996,0.4105,0.4048,0.4140,0.4195,0.4113,0.4122,0.4076,0.4089,0.4458,0.3905,0.4175,0.4541,0.3916,0.4654,0.4028,0.4114,0.1828,0.4137,0.4515,0.4093,0.3997,0.4163,0.4497,0.3904,0.4171,0.4609,0.4062,0.4165,0.4024,0.1503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Tenrec,0.2696,0.2559,0.2816,0.2558,0.4609,0.4598,0.2894,0.2823,0.3031,0.3037,0.3061,0.2449,0.3034,0.2965,0.2737,0.3014,0.2960,0.3216,0.3198,0.2626,0.2812,0.2730,0.3335,0.2868,0.2770,0.2903,0.2845,0.2848,0.3101,0.2952,0.3009,0.2977,0.2896,0.1332,0.2466,0.3102,0.3633,0.2867,0.3694,0.2838,0.2970,0.4537,0.2941,0.3610,0.2896,0.2746,0.3009,0.3451,0.2775,0.2845,0.0000,0.2993,0.3024,0.3042,0.4385
TombBat,0.1808,0.1677,0.1672,0.1685,0.4062,0.4232,0.1482,0.1622,0.1743,0.1659,0.1361,0.1586,0.1333,0.1245,0.1450,0.1738,0.0956,0.2009,0.2396,0.2034,0.1622,0.1571,0.2051,0.1520,0.1299,0.1677,0.1640,0.1392,0.1950,0.1272,0.2245,0.1108,0.1460,0.2783,0.1585,0.1785,0.2948,0.1658,0.2886,0.1651,0.1405,0.4005,0.1566,0.2915,0.1675,0.1200,0.2145,0.1964,0.1545,0.1404,0.2993,0.0000,0.2114,0.1898,0.3926
TreeHyrax,0.1741,0.1409,0.2006,0.1412,0.4165,0.4211,0.2152,0.2041,0.2378,0.2245,0.2171,0.1302,0.2172,0.2059,0.1893,0.2168,0.2074,0.2394,0.2445,0.1949,0.2044,0.1834,0.2570,0.2213,0.1985,0.2103,0.2066,0.2106,0.2351,0.2095,0.2242,0.2172,0.2114,0.2764,0.1323,0.2347,0.3137,0.2007,0.3189,0.2037,0.2134,0.4023,0.2266,0.3079,0.2112,0.1917,0.0135,0.2875,0.1919,0.2147,0.3024,0.2114,0.0000,0.2312,0.4052
TreeShrew,0.1983,0.1848,0.1851,0.1867,0.4024,0.4070,0.1816,0.1632,0.2010,0.2016,0.1942,0.1739,0.1966,0.1814,0.1573,0.1836,0.1824,0.1953,0.2513,0.2134,0.1646,0.1675,0.2351,0.1867,0.1697,0.1798,0.1680,0.1753,0.2042,0.1914,0.2328,0.1918,0.1778,0.2854,0.1759,0.1987,0.2874,0.1827,0.3104,0.1683,0.1790,0.3872,0.2004,0.2841,0.1700,0.1604,0.2325,0.2651,0.1780,0.1772,0.3042,0.1898,0.2312,0.0000,0.3834


In [5]:
# Turn the alignment into an unaligned sequence collection, then into a plain
# dict, which the pairwise functions index by sequence name.
seq_coll = aln.degap()
data = seq_coll.to_dict()  #get unaligned sequences collection

## Plot the error between co-distance and fractional difference

In [6]:
# Length-difference and co-distance matrices for every pair of sequences.
# With k=19, context_len=9 and obj_len=1 each 19-mer is split into an
# 18-character context and a 1-character object.
length_diff = pair_len_diff(seq_coll=data, pair_distance=pair_distance)
dcos = pair_dco(data, pair_distance, k=19, context_len=9, obj_len=1)

In [7]:
num_seqs = pair_distance.shape[0]
# k=-1 selects the strictly lower triangle. The diagonal holds each sequence
# compared with itself, where all three matrices are zero, so including it
# would only add trivial (0, 0) points to the plots.
indices = numpy.tril_indices(num_seqs, k=-1)

# Flatten the three matrices with the same indices so they stay aligned
# pair by pair.
dcos = dcos[indices]
frac_diff = pair_distance.array[indices]
length_diff = length_diff[indices]
# Signed error of the co-distance relative to the fractional difference.
errors = dcos - frac_diff

In [8]:
# Scatter of the co-distance error against the absolute length difference,
# one point per pair of sequences.
fig = px.scatter(
    title="Errors v.s. absolute length difference <br>data without gaps; `length of C_gram` = 18, `length of O_gram` = 1.",
    labels={"x": "absolute difference in length", "y": "co_distance - fraction_diff"},
    x=length_diff,
    y=errors,
)
fig.show()

In [9]:
# Scatter of the co-distance against the fractional difference,
# one point per pair of sequences.
fig = px.scatter(
    title="Comparison between co_distance and fractional difference <br>data without gaps `length of C_gram` = 18, `length of O_gram` = 1.",
    labels={"x": "fractional difference", "y": "co_distance"},
    x=frac_diff,
    y=dcos,
)
fig.show()