# Progress Report on ani
## 17/1/23

- Fixed the calculation of ani.

- Checked the performance of ani with degapped data.

## Setup and Compute

In [1]:
import numpy 
from cogent3 import load_aligned_seqs
from collections import Counter
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
def kfreqs(seq, k):
    """Count every overlapping window of length ``k`` in ``seq``.

    Parameters
    ----------
    seq
        Any sliceable object with a length (a string in this notebook).
    k
        Window (k-mer) size.

    Returns
    -------
    Counter
        Maps each distinct slice ``seq[i:i + k]`` to the number of start
        positions ``i`` that produce it. Empty when ``len(seq) < k``.
    """
    # The last window starts at len(seq) - k, hence the "+ 1" in the range.
    last_start = len(seq) - k
    return Counter(seq[start:start + k] for start in range(last_start + 1))

# Modified to correct the count.
def diff_freqs(f1: Counter, f2: Counter) -> int:
    """Total the amounts by which counts in ``f1`` exceed those in ``f2``.

    Only keys present in ``f1`` are inspected; a key where ``f2`` has an equal
    or larger count contributes nothing. In this file ``ani`` passes the
    counts of the shorter sequence as ``f1``.

    Parameters
    ----------
    f1, f2
        k-mer counters. ``f2`` is indexed with keys it may not contain, so it
        must return 0 for missing keys (as ``Counter`` does).

    Returns
    -------
    int
        Sum over keys of ``f1`` of ``max(f1[key] - f2[key], 0)``.
    """
    # Clip negative differences to zero instead of branching on their sign.
    return sum(max(count - f2[kmer], 0) for kmer, count in f1.items())

def ani(s1, s2, k):
    """Estimate the dissimilarity of two sequences from their k-mer counts.

    The k-mers of the shorter sequence that have no counterpart in the longer
    one are counted with ``diff_freqs``; that count is divided by ``k`` and by
    the length of the shorter sequence. Identical sequences therefore score 0,
    i.e. despite the name the value behaves like a distance.

    Parameters
    ----------
    s1, s2
        Sequences to compare, in either order.
    k
        k-mer size.

    Returns
    -------
    float
        ``unmatched_kmers / k / len(shorter)``.

    Raises
    ------
    ZeroDivisionError
        If ``k`` is 0 or the shorter sequence is empty.
    """
    # On a length tie the second argument is treated as the "shorter" one.
    if len(s1) < len(s2):
        shorter, longer = s1, s2
    else:
        shorter, longer = s2, s1

    unmatched = diff_freqs(kfreqs(shorter, k), kfreqs(longer, k))
    return unmatched / k / len(shorter)


#update to get the ani matrix
def pair_ani(seq_coll: dict, pair_distance, k) -> numpy.ndarray:
    """Build the symmetric matrix of pairwise ``ani`` values.

    Parameters
    ----------
    seq_coll
        Maps sequence name to sequence.
    pair_distance
        Object exposing ``names``; only that attribute is read, and it fixes
        the row/column order of the result.
    k
        k-mer size forwarded to ``ani``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(names), len(names))``. The diagonal is
        never computed and stays 0.
    """
    labels = pair_distance.names
    size = len(labels)
    matrix = numpy.zeros((size, size), dtype=float)

    # Visit each unordered pair once and mirror the value across the diagonal.
    for row, first in enumerate(labels):
        for col in range(row + 1, size):
            value = ani(seq_coll[first], seq_coll[labels[col]], k)
            matrix[row, col] = matrix[col, row] = value

    return matrix

def pair_len_diff(seq_coll: dict, pair_distance) -> numpy.ndarray:
    """Build the symmetric matrix of pairwise absolute length differences.

    Parameters
    ----------
    seq_coll
        Maps sequence name to sequence (anything supporting ``len``).
    pair_distance
        Object exposing ``names``; only that attribute is read, and it fixes
        the row/column order of the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(names), len(names))`` whose ``[i, j]``
        entry is ``abs(len(seq_i) - len(seq_j))``; the diagonal is 0.
    """
    # Measure each sequence once instead of once per pair it takes part in.
    lengths = numpy.array(
        [len(seq_coll[name]) for name in pair_distance.names], dtype=float
    )
    # Column vector minus row vector broadcasts to the full pairwise table.
    return numpy.abs(lengths[:, None] - lengths[None, :])

In [3]:
# Alignment used throughout the report, read from a local Cogent3 checkout.
alignment_path = "~/repos/Cogent3/tests/data/brca1.fasta"
aln = load_aligned_seqs(alignment_path, moltype="dna")
# NOTE(review): no_degenerates() presumably drops alignment columns that hold
# gaps or ambiguity codes - confirm against the cogent3 documentation.
aln_degap = aln.no_degenerates()

In [4]:
# Pairwise distance matrix of the full alignment, using the defaults of
# distance_matrix().
pair_distance = aln.distance_matrix()
# NOTE(review): under default Jupyter settings only the last expression of a
# cell is rendered, so this bare expression likely displays nothing.
pair_distance

# Same computation for the alignment returned by no_degenerates().
pair_distance_degap = aln_degap.distance_matrix()
# Last expression of the cell - presumably the table rendered below it.
pair_distance_degap

names,Aardvark,AfricanEl,Anteater,AsianElep,Bandicoot,Caenolest,Cat,Chimpanzee,Cow,Dog,DogFaced,Dugong,FalseVamp,FlyingFox,FlyingLem,FlyingSqu,FreeTaile,Galago,GiantElep,GoldenMol,Gorilla,HairyArma,Hedgehog,Hippo,Horse,HowlerMon,Human,HumpbackW,Jackrabbit,LeafNose,LesserEle,LittleBro,Llama,Madagascar,Manatee,Mole,Mouse,NineBande,OldWorld,Orangutan,Pangolin,Phascogale,Pig,Rat,Rhesus,Rhino,RockHyrax,RoundEare,Sloth,SpermWhale,Tenrec,TombBat,TreeHyrax,TreeShrew,Wombat
Aardvark,0.0000,0.1215,0.1701,0.1230,0.3910,0.3984,0.1686,0.1620,0.2010,0.1996,0.1892,0.1193,0.1841,0.1701,0.1598,0.1804,0.1745,0.2018,0.2194,0.1642,0.1620,0.1458,0.2297,0.1922,0.1627,0.1672,0.1635,0.1694,0.1981,0.1797,0.1892,0.1907,0.1701,0.2408,0.1178,0.2069,0.2879,0.1613,0.3071,0.1620,0.1723,0.3800,0.1937,0.2776,0.1694,0.1487,0.1878,0.2629,0.1568,0.1745,0.2717,0.1797,0.1885,0.1966,0.3763
AfricanEl,0.1215,0.0000,0.1613,0.0044,0.3918,0.3940,0.1649,0.1576,0.1937,0.1856,0.1797,0.0832,0.1723,0.1591,0.1495,0.1716,0.1701,0.1996,0.2150,0.1561,0.1576,0.1252,0.2202,0.1753,0.1429,0.1598,0.1583,0.1546,0.1907,0.1672,0.1826,0.1804,0.1524,0.2401,0.0825,0.1944,0.2813,0.1473,0.2835,0.1561,0.1613,0.3756,0.1878,0.2658,0.1620,0.1458,0.1568,0.2459,0.1443,0.1598,0.2541,0.1679,0.1539,0.1878,0.3689
Anteater,0.1701,0.1613,0.0000,0.1627,0.3873,0.3910,0.1642,0.1598,0.1856,0.1892,0.1745,0.1495,0.1804,0.1620,0.1384,0.1694,0.1701,0.1973,0.2526,0.1826,0.1591,0.0987,0.2253,0.1716,0.1539,0.1554,0.1613,0.1502,0.1900,0.1664,0.2224,0.1789,0.1635,0.2658,0.1480,0.1944,0.2798,0.1163,0.2835,0.1620,0.1672,0.3822,0.1885,0.2688,0.1649,0.1392,0.2143,0.2511,0.0898,0.1554,0.2887,0.1723,0.2106,0.1900,0.3645
AsianElep,0.1230,0.0044,0.1627,0.0000,0.3918,0.3932,0.1672,0.1576,0.1951,0.1856,0.1811,0.0832,0.1730,0.1605,0.1510,0.1730,0.1708,0.2010,0.2165,0.1576,0.1568,0.1274,0.2224,0.1753,0.1436,0.1598,0.1583,0.1561,0.1922,0.1686,0.1841,0.1819,0.1539,0.2415,0.0825,0.1951,0.2828,0.1487,0.2865,0.1561,0.1627,0.3763,0.1892,0.2673,0.1613,0.1451,0.1591,0.2467,0.1458,0.1613,0.2541,0.1686,0.1561,0.1900,0.3689
Bandicoot,0.3910,0.3918,0.3873,0.3918,0.0000,0.1996,0.4006,0.3903,0.4183,0.4050,0.4116,0.3814,0.4028,0.3976,0.3814,0.4043,0.4102,0.4050,0.4323,0.4006,0.3881,0.3792,0.4330,0.4131,0.3873,0.3962,0.3932,0.4028,0.4065,0.4065,0.4168,0.4050,0.3954,0.4433,0.3851,0.4124,0.4462,0.3851,0.4624,0.3895,0.3984,0.1797,0.4006,0.4293,0.4006,0.3859,0.4205,0.4345,0.3814,0.4043,0.4580,0.3969,0.4175,0.4050,0.1429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Tenrec,0.2717,0.2541,0.2887,0.2541,0.4580,0.4492,0.2813,0.2820,0.3041,0.3078,0.3130,0.2482,0.3027,0.2997,0.2798,0.3012,0.2997,0.3233,0.3299,0.2666,0.2798,0.2732,0.3351,0.2953,0.2732,0.2820,0.2850,0.2850,0.3056,0.2982,0.3041,0.3019,0.2857,0.1325,0.2496,0.3152,0.3645,0.2901,0.3719,0.2842,0.2997,0.4433,0.2990,0.3586,0.2938,0.2739,0.3041,0.3351,0.2872,0.2865,0.0000,0.2975,0.3012,0.3137,0.4286
TombBat,0.1797,0.1679,0.1723,0.1686,0.3969,0.4131,0.1370,0.1576,0.1679,0.1635,0.1348,0.1642,0.1230,0.1186,0.1414,0.1708,0.0928,0.1915,0.2504,0.2128,0.1576,0.1532,0.2054,0.1487,0.1259,0.1532,0.1576,0.1296,0.1753,0.1200,0.2268,0.1097,0.1384,0.2739,0.1613,0.1723,0.2835,0.1686,0.3041,0.1583,0.1384,0.3932,0.1657,0.2739,0.1613,0.1127,0.2246,0.1878,0.1524,0.1348,0.2975,0.0000,0.2209,0.1863,0.3822
TreeHyrax,0.1885,0.1539,0.2106,0.1561,0.4175,0.4161,0.2187,0.2106,0.2393,0.2320,0.2297,0.1436,0.2253,0.2084,0.1900,0.2209,0.2231,0.2452,0.2577,0.2069,0.2106,0.1841,0.2651,0.2305,0.2062,0.2077,0.2106,0.2084,0.2342,0.2180,0.2312,0.2349,0.2172,0.2739,0.1451,0.2430,0.3049,0.2032,0.3211,0.2084,0.2180,0.3976,0.2386,0.2923,0.2165,0.2003,0.0140,0.2968,0.1966,0.2158,0.3012,0.2209,0.0000,0.2489,0.3947
TreeShrew,0.1966,0.1878,0.1900,0.1900,0.4050,0.4065,0.1738,0.1576,0.1937,0.2040,0.1915,0.1848,0.1929,0.1760,0.1568,0.1797,0.1841,0.2010,0.2622,0.2283,0.1583,0.1694,0.2312,0.1826,0.1672,0.1664,0.1598,0.1642,0.2025,0.1885,0.2371,0.2003,0.1723,0.2909,0.1834,0.1966,0.2879,0.1856,0.3071,0.1635,0.1775,0.3837,0.2018,0.2710,0.1657,0.1532,0.2511,0.2541,0.1811,0.1686,0.3137,0.1863,0.2489,0.0000,0.3822


In [5]:
# Strip the alignment structure from both datasets (full alignment and the
# no_degenerates() version) ...
seq_coll, seq_coll_degap = aln.degap(), aln_degap.degap()
# ... and expose each one as a dict for pair_ani / pair_len_diff, which index
# it by sequence name.
data, data_degap = seq_coll.to_dict(), seq_coll_degap.to_dict()

In [6]:
# ANI matrices for the sequences taken from the full (gapped) alignment, one
# per k-mer size.
anis_k_2, anis_k_5, anis_k_8, anis_k_10, anis_k_15 = (
    pair_ani(data, pair_distance, k=size) for size in (2, 5, 8, 10, 15)
)
# ANI matrices for the sequences taken from the no_degenerates() alignment
# (k = 15 is not computed for this dataset).
anis_k_2_degap, anis_k_5_degap, anis_k_8_degap, anis_k_10_degap = (
    pair_ani(data_degap, pair_distance_degap, k=size) for size in (2, 5, 8, 10)
)

In [7]:
# Pairwise absolute length differences, laid out like the ANI matrices.
length_diff = pair_len_diff(seq_coll=data, pair_distance=pair_distance)
length_diff_degap = pair_len_diff(
    seq_coll=data_degap, pair_distance=pair_distance_degap
)

## Plot

In [8]:
# Flatten every pairwise matrix of the gapped dataset into a vector with one
# entry per unordered pair of distinct sequences.
num_seqs = pair_distance.shape[0]
# k=-1 selects the strictly lower triangle. pair_ani and pair_len_diff never
# compute the diagonal (it stays 0), so including it would add num_seqs
# spurious self-comparison points at (0, 0) to every plot.
indices = numpy.tril_indices(num_seqs, k=-1)

# NOTE: each matrix name is rebound to its flattened vector, so this cell must
# not be run a second time without re-running the cells that build the
# matrices.
anis_k_2 = anis_k_2[indices]
anis_k_5 = anis_k_5[indices]
anis_k_8 = anis_k_8[indices]
anis_k_10 = anis_k_10[indices]
anis_k_15 = anis_k_15[indices]
# Distances from cogent3's distance_matrix(), in the same pair order.
frac_diff = pair_distance.array[indices]
length_diff = length_diff[indices]

In [9]:
# Flatten every pairwise matrix of the degapped dataset into a vector with one
# entry per unordered pair of distinct sequences.
num_seqs_degap = pair_distance_degap.shape[0]
# k=-1 selects the strictly lower triangle. pair_ani and pair_len_diff never
# compute the diagonal (it stays 0), so including it would add num_seqs_degap
# spurious self-comparison points at (0, 0) to every plot.
indices_degap = numpy.tril_indices(num_seqs_degap, k=-1)

# NOTE: each matrix name is rebound to its flattened vector, so this cell must
# not be run a second time without re-running the cells that build the
# matrices.
anis_k_2_degap = anis_k_2_degap[indices_degap]
anis_k_5_degap = anis_k_5_degap[indices_degap]
anis_k_8_degap = anis_k_8_degap[indices_degap]
anis_k_10_degap = anis_k_10_degap[indices_degap]
# Distances from cogent3's distance_matrix(), in the same pair order.
frac_diff_degap = pair_distance_degap.array[indices_degap]
length_diff_degap = length_diff_degap[indices_degap]

In [10]:
# Signed error of each ANI estimate relative to the cogent3 distance
# (gapped data), one vector per k-mer size.
error_k_2, error_k_5, error_k_8, error_k_10 = (
    estimate - frac_diff
    for estimate in (anis_k_2, anis_k_5, anis_k_8, anis_k_10)
)

In [11]:
# One stacked scatter panel per k: estimation error (ani - frac_diff) against
# the absolute length difference of each sequence pair.
panels = (
    ('k = 2', error_k_2),
    ('k = 5', error_k_5),
    ('k = 8', error_k_8),
    ('k = 10', error_k_10),
)

fig = make_subplots(
    rows=len(panels),
    cols=1,
    x_title="absolute difference in length",
    y_title="ani - frac_diff",
)

for row, (label, series) in enumerate(panels, start=1):
    marker_trace = go.Scatter(x=length_diff, y=series, mode='markers', name=label)
    fig.add_trace(marker_trace, row=row, col=1)

fig.update_layout(
    height=1000,
    width=1000,
    title_text="Comparison between ANI and fractional difference",
)
fig.show()

In [12]:
# One stacked scatter panel per k: ANI estimate against the cogent3 distance
# for the gapped dataset.
panels = (
    ('k = 2', anis_k_2),
    ('k = 5', anis_k_5),
    ('k = 8', anis_k_8),
    ('k = 10', anis_k_10),
)

fig = make_subplots(
    rows=len(panels),
    cols=1,
    x_title="fractional difference",
    y_title="ani",
)

for row, (label, series) in enumerate(panels, start=1):
    marker_trace = go.Scatter(x=frac_diff, y=series, mode='markers', name=label)
    fig.add_trace(marker_trace, row=row, col=1)

fig.update_layout(
    height=1000,
    width=1000,
    title_text="ANI v.s. fractional difference <br>for data with gaps",
)
fig.show()

In [13]:
# One stacked scatter panel per k: ANI estimate against the cogent3 distance
# for the degapped dataset.
panels = (
    ('k = 2', anis_k_2_degap),
    ('k = 5', anis_k_5_degap),
    ('k = 8', anis_k_8_degap),
    ('k = 10', anis_k_10_degap),
)

fig = make_subplots(
    rows=len(panels),
    cols=1,
    x_title="fractional difference",
    y_title="ani",
)

for row, (label, series) in enumerate(panels, start=1):
    marker_trace = go.Scatter(
        x=frac_diff_degap, y=series, mode='markers', name=label
    )
    fig.add_trace(marker_trace, row=row, col=1)

fig.update_layout(
    height=1000,
    width=1000,
    title_text="ANI v.s. fractional difference <br>data without gaps",
)
fig.show()

In [14]:
# One stacked scatter panel per k (including k = 15): ANI estimate against the
# absolute length difference of each sequence pair, gapped dataset.
panels = (
    ('k = 2', anis_k_2),
    ('k = 5', anis_k_5),
    ('k = 8', anis_k_8),
    ('k = 10', anis_k_10),
    ('k = 15', anis_k_15),
)

fig = make_subplots(
    rows=len(panels),
    cols=1,
    x_title="absolute difference in length",
    y_title="ani",
)

for row, (label, series) in enumerate(panels, start=1):
    marker_trace = go.Scatter(x=length_diff, y=series, mode='markers', name=label)
    fig.add_trace(marker_trace, row=row, col=1)

fig.update_layout(
    height=1000,
    width=1000,
    title_text="ani v.s. absolute difference in length",
)
fig.show()