# Exploring kmer features from GISAID lineages

The features used in this notebook were generated by running:
```sh
python featurize_gisaid.py  # takes about 5 minutes
```

In [None]:
import torch
import matplotlib.pyplot as plt
from pyrocov import pangolin
import matplotlib
# Raise the default figure resolution to 200 dpi for every figure created below.
matplotlib.rcParams['figure.dpi'] = 200

In [None]:
# Load the saved k-mer count results (presumably the output of featurize_gisaid.py).
# NOTE(review): torch.load can unpickle arbitrary objects depending on the torch
# version / `weights_only` setting, so only load files you generated yourself.
data = torch.load("results/gisaid.kmer_counts.pt")
print(list(data.keys()))
# Expose every entry of `data` as a notebook-level variable. Later cells rely on
# `kmer_counts`, `lineage_counts` and `lineage_ids` being defined this way; this
# only works because a notebook cell runs at module scope, where locals() is globals().
locals().update(data)
print(kmer_counts.shape)

Merge lineages with too few observations

In [None]:
# Aggregate lineages with too few observations into coarser lineages.
#
# All inputs are read from the pristine `data` dict rather than from the
# notebook-level names `lineage_ids` / `lineage_counts` / `kmer_counts`, because
# this cell rebinds those names. Reading them back would make a second run of the
# cell mix merged and unmerged tables (and fail with a KeyError in the loop).
old_lineage_ids = data["lineage_ids"]  # lineage name -> row index
old_lineage_counts = data["lineage_counts"]
old_kmer_counts = data["kmer_counts"]

lineage_counts_dict = {
    name: int(old_lineage_counts[row]) for name, row in old_lineage_ids.items()
}
# NOTE(review): presumably returns a dict mapping each merged-away lineage to the
# lineage that absorbs it -- confirm against pangolin.merge_lineages. Lineages
# missing from the mapping keep their own name (see the .get(k, k) fallbacks).
mapping = pangolin.merge_lineages(lineage_counts_dict, min_count=100)

# Re-index the surviving lineages in sorted order.
lineages = sorted(set(mapping.get(k, k) for k in old_lineage_ids))
lineage_ids = {k: i for i, k in enumerate(lineages)}
new_n = len(lineages)
old_n, p = old_kmer_counts.shape

# Sum the rows of every old lineage into the row of the lineage it maps to.
lineage_counts = old_lineage_counts.new_zeros(new_n)
kmer_counts = old_kmer_counts.new_zeros(new_n, p)
for old_lineage, old_i in old_lineage_ids.items():
    new_lineage = mapping.get(old_lineage, old_lineage)
    new_i = lineage_ids[new_lineage]
    lineage_counts[new_i] += old_lineage_counts[old_i]
    kmer_counts[new_i] += old_kmer_counts[old_i]

# Merging must conserve the total number of observations.
assert lineage_counts.sum() == old_lineage_counts.sum()
print(int(lineage_counts.sum()))
print(kmer_counts.shape)

Quantize features to `{-1,0,1}` where `-1` means absent, `1` means present, and `0` means unknown.

In [None]:
# Turn raw counts into per-lineage frequencies, then quantize: frequencies above
# `ub` become 1, frequencies below `lb` become -1, and everything else is 0.
lb, ub = 0.2, 0.8  # ad hoc thresholds
features = kmer_counts / lineage_counts.unsqueeze(-1)
is_present = features > ub
is_absent = features < lb
features = is_present.float() - is_absent.float()  # in {-1, 0, 1}

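Drop constant features, keeping only those that are absent (`-1`) in at least one lineage and present (`1`) in at least one lineage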

In [None]:
# Keep a column only if it takes the value -1 in some row and the value 1 in some row.
has_absent = (features == -1).any(dim=0)
has_present = (features == 1).any(dim=0)
mask = has_absent & has_present
features = features[:, mask]
print(features.shape)

Drop ambiguous features, keeping only those with a known (nonzero) value in more than 80% of lineages

In [None]:
# Since entries are in {-1, 0, 1}, the mean absolute value of a column is the
# fraction of rows in which that column is nonzero.
known_fraction = features.abs().mean(dim=0)
mask = known_fraction > 0.8
features = features[:, mask]
print(features.shape)

Deduplicate features by collapsing identical columns and ordering the remaining ones by their sum across lineages

In [None]:
# Collapse identical columns: hash each column as a tuple of ints, keep one copy
# of each, and order the survivors by their column sum.
columns = features.T.long().tolist()
unique = sorted({tuple(col) for col in columns}, key=sum)
features = torch.tensor([list(col) for col in unique], dtype=torch.float).T.contiguous()
features.shape

In [None]:
# Heatmap of the quantized feature matrix, drawn without any frame or background.
fig, ax = plt.subplots()
fig.patch.set_visible(False)
ax.imshow(features.numpy(), cmap='bwr', aspect='auto')
ax.set_xticks(())
ax.set_yticks(())
ax.axis("off")
# imshow lays rows out along y and columns along x, so the x axis spans the
# feature columns (size(1)) and the y axis spans the lineage rows (size(0)).
# The labels stay hidden while the axis is switched off above.
ax.set_xlabel(f"{features.size(1)} kmer features")
ax.set_ylabel(f"{features.size(0)} pango lineages")
fig.tight_layout(pad=0)