# 取り組み

Scipy.spatialの`distance.cosine`を覗いてみたところ、すでに1-cosineが計算されていることに気がついた。  
現状は`1-distance.cosine`をしており、これでは距離ではなくて類似度を求めている。  
適切な距離を定義したい。

In [1]:
# Boilerplate: put the project root directory on sys.path and make it the working directory.
import sys, os
from pathlib import Path
# Nothing to do when the notebook already runs from the project root.
if os.getcwd() != "/mnt/d/Research/DAJIN2":
    # "__file__" is a string literal here (not the __file__ variable), so abspath()
    # resolves it against the current working directory; the project root is taken
    # to be two levels above that directory.
    parent_path = str(Path(os.path.dirname(os.path.abspath("__file__"))).parent.parent)
    sys.path.append(parent_path)
    os.chdir(parent_path)
print(os.getcwd())

/mnt/d/Research/DAJIN2


# 実験

In [2]:
from scipy import stats
from scipy.spatial import distance


In [10]:
# Toy 5-position count vectors: the sample has one large spike, the control is flat.
samp = [1,1,100,1,1]
cont = [1,1,1,1,1]

In [11]:
# SciPy's cosine *distance*, i.e. 1 - cosine similarity.
print(distance.cosine(samp, cont))

0.5349908532110781


In [12]:
# Correlation distance mean-centers each vector first; the constant control
# then becomes all zeros, which is presumably why this prints nan.
print(distance.correlation(samp, cont))

nan


In [13]:
# Euclidean distance reflects the absolute size of the spike (100 vs 1).
print(distance.euclidean(samp, cont))

99.0


In [14]:
# Same layout with a much smaller spike (2 instead of 100).
samp = [1,1,2,1,1]
cont = [1,1,1,1,1]

In [15]:
# Euclidean distance for the small spike.
print(distance.euclidean(samp, cont))

1.0


In [19]:
# Two flat vectors that differ only in magnitude: they point in the same
# direction, so cosine distance cannot separate them while Euclidean distance can.
samp = [0.9, 0.9, 0.9, 0.9, 0.9]
cont = [0.1 ,0.1 ,0.1 ,0.1 ,0.1]
print(distance.euclidean(samp, cont))
print(distance.cosine(samp, cont))

1.788854381999832
0


In [20]:
# Frequency-like vectors: a single spike in the sample against a flat control.
samp = [0.1, 0.1, 0.9, 0.1, 0.1]
cont = [0.1 ,0.1 ,0.1 ,0.1 ,0.1]
print(distance.euclidean(samp, cont))
print(distance.cosine(samp, cont))

0.8
0.3694073749055342


In [None]:
# Flat vectors of different magnitude again, Euclidean distance only.
samp = [0.9, 0.9, 0.9, 0.9, 0.9]
cont = [0.1 ,0.1 ,0.1 ,0.1 ,0.1 ]
print(distance.euclidean(samp, cont))

1.788854381999832


## アルビノ点変異(1%)のwindow countingで試行錯誤する

In [22]:
from __future__ import annotations

import sys, os
from pathlib import Path

# Make the DAJIN2 checkout importable and run everything relative to it.
sys.path.append("/mnt/d/Research/DAJIN2")
os.chdir("/mnt/d/Research/DAJIN2")

import hashlib
from collections import defaultdict
from pathlib import Path
from importlib import reload

from src.DAJIN2.core import preprocess, classification, clustering, consensus, report
# NOTE: this rebinds the name `clustering` imported on the previous line.
from src.DAJIN2.core.clustering import clustering

# Point-mutation subset of the Tyr albino data.
# `percent` selects the input file: "50", "10" or "01".
percent = "01"
# Positional unpacking: sample FASTQ, control FASTQ, allele FASTA, run name,
# genome ID, debug flag, thread count.
SAMPLE, CONTROL, ALLELE, NAME, GENOME, DEBUG, THREADS = (
    f"misc/data/tyr_albino_{percent}%.fq.gz",
    "misc/data/tyr_control.fq.gz",
    "misc/data/tyr_control.fasta",
    f"test-tyr-albino-{percent}%",
    "mm10",
    True,
    14,
)
print(f"processing {NAME}...")

##########################################################
# Check inputs
##########################################################
preprocess.check_inputs.check_files(SAMPLE, CONTROL, ALLELE)
# Intermediate files for this run live under DAJINResults/.tempdir/<NAME>.
TEMPDIR = Path("DAJINResults", ".tempdir", NAME)
IS_CACHE_CONTROL = preprocess.check_inputs.exists_cached_control(CONTROL, TEMPDIR)
IS_CACHE_GENOME = preprocess.check_inputs.exists_cached_genome(GENOME, TEMPDIR, IS_CACHE_CONTROL)
UCSC_URL, GOLDENPATH_URL = None, None
# The URLs are only looked up when a genome is given and no cached genome exists.
if GENOME and not IS_CACHE_GENOME:
    UCSC_URL, GOLDENPATH_URL = preprocess.check_inputs.check_and_fetch_genome(GENOME)

##########################################################
# Format inputs
##########################################################
SAMPLE_NAME = preprocess.format_inputs.extract_basename(SAMPLE)
CONTROL_NAME = preprocess.format_inputs.extract_basename(CONTROL)
# Later cells use FASTA_ALLELES as a dict of allele name -> sequence
# (it is iterated with .items() and indexed with "control").
FASTA_ALLELES = preprocess.format_inputs.dictionize_allele(ALLELE)

preprocess.format_inputs.make_directories(TEMPDIR, SAMPLE_NAME, CONTROL_NAME)

if GENOME:
    # NOTE(review): if IS_CACHE_GENOME is true the URLs passed here are still None —
    # presumably the helpers handle that case; confirm.
    GENOME_COODINATES = preprocess.format_inputs.fetch_coodinate(GENOME, UCSC_URL, FASTA_ALLELES["control"])
    CHROME_SIZE = preprocess.format_inputs.fetch_chrom_size(GENOME_COODINATES["chr"], GENOME, GOLDENPATH_URL)
    preprocess.format_inputs.cache_coodinates_and_chromsize(TEMPDIR, GENOME, GENOME_COODINATES, CHROME_SIZE)

processing test-tyr-albino-01%...


In [25]:
################################################################################
# Export fasta files as single-FASTA format
################################################################################
# TODO: use yield, not export
# Write one "<identifier>.fasta" file per allele under TEMPDIR/fasta.
for identifier, sequence in FASTA_ALLELES.items():
    contents = "\n".join([">" + identifier, sequence]) + "\n"
    output_fasta = Path(TEMPDIR, "fasta", f"{identifier}.fasta")
    output_fasta.write_text(contents)
###############################################################################
# Mapping with mappy
###############################################################################
# Every allele FASTA is passed to output_sam four times: control and sample
# with the default preset, then both again with preset="splice".
for path_fasta in Path(TEMPDIR, "fasta").glob("*.fasta"):
    name_fasta = path_fasta.stem
    preprocess.mappy_align.output_sam(TEMPDIR, path_fasta, name_fasta, CONTROL, CONTROL_NAME, threads=THREADS)
    preprocess.mappy_align.output_sam(TEMPDIR, path_fasta, name_fasta, SAMPLE, SAMPLE_NAME, threads=THREADS)
    preprocess.mappy_align.output_sam(
        TEMPDIR, path_fasta, name_fasta, CONTROL, CONTROL_NAME, preset="splice", threads=THREADS
    )
    preprocess.mappy_align.output_sam(
        TEMPDIR, path_fasta, name_fasta, SAMPLE, SAMPLE_NAME, preset="splice", threads=THREADS
    )
########################################################################
# MIDSV conversion
########################################################################
# Only files under TEMPDIR/sam matching "<name>_splice_*" are converted.
for path_sam in Path(TEMPDIR, "sam").glob(f"{CONTROL_NAME}_splice_*"):
    preprocess.calc_midsv.output_midsv(TEMPDIR, path_sam)
for path_sam in Path(TEMPDIR, "sam").glob(f"{SAMPLE_NAME}_splice_*"):
    preprocess.calc_midsv.output_midsv(TEMPDIR, path_sam)


In [26]:
###############################################################################
# Correct CSSPLITS
###############################################################################
# NOTE(review): the module name is spelled "revititive" in the package —
# presumably "repetitive" deletions; confirm before renaming anything.
preprocess.correct_revititive_deletions.execute(TEMPDIR, FASTA_ALLELES, CONTROL_NAME, SAMPLE_NAME)

In [27]:
## correct_sequence_error.py
from __future__ import annotations

import random
import re
from collections import Counter, defaultdict
from copy import deepcopy
from pathlib import Path
import midsv
from scipy import stats
from scipy.spatial import distance


def set_indexes(sequence: str):
    """Return the (left, right) bounds that trim ``sequence`` to a multiple of 5.

    The ``len(sequence) % 5`` surplus positions are dropped from the two ends.
    When the surplus is odd the left end gives up the extra position, so that
    ``right_idx - left_idx`` is always divisible by 5.

    Args:
        sequence: Reference sequence; only its length is used.

    Returns:
        Tuple ``(left_idx, right_idx)`` suitable for
        ``range(left_idx, right_idx, 5)``.
    """
    total = len(sequence)
    surplus = total % 5
    # The right end loses floor(surplus / 2) positions, the left end the rest.
    trim_right = surplus // 2
    trim_left = surplus - trim_right
    return trim_left, total - trim_right


def count_indels_5mer(cssplits: list[list[str]], left_idx: int, right_idx: int) -> list[dict]:
    """Count insertions, deletions and substitutions per position in 5-mer windows.

    Args:
        cssplits: One list of CSSPLIT tokens per read; reads are transposed so
            that each column holds the tokens observed at one position.
        left_idx: Index of the first position of the first window.
        right_idx: Exclusive end index; windows start every 5 positions.

    Returns:
        One dict per window with keys ``"ins"``, ``"del"`` and ``"sub"``, each
        mapping to a list of 5 counts. Every count starts at 1 (pseudocount),
        so an untouched position reports 1, not 0.
    """
    prefix_to_mutation = {"+": "ins", "-": "del", "*": "sub"}
    # Tokens containing a lowercase base are never counted
    # (presumably inversion calls in MIDSV — confirm).
    lowercase_base = re.compile(r"a|c|g|t|n")
    columns = [list(column) for column in zip(*cssplits)]
    windows = []
    for start in range(left_idx, right_idx, 5):
        tally = {"ins": [1] * 5, "del": [1] * 5, "sub": [1] * 5}
        for offset, column in enumerate(columns[start : start + 5]):
            for token, occurrences in Counter(column).items():
                # Matches ("=...") and "N" have no mutation prefix and fall out here.
                mutation = prefix_to_mutation.get(token[:1])
                if mutation is None or lowercase_base.search(token):
                    continue
                tally[mutation][offset] += occurrences
        windows.append(tally)
    return windows


def extract_sequence_errors(count_5mer_sample, count_5mer_control, coverage_sample, coverage_control):
    """Flag, per 5-mer window, the mutation types that look like sequencing errors.

    For every window and each of ``"ins"``, ``"del"`` and ``"sub"``, the counts
    are divided by the coverage and the sample profile is compared with the
    control profile. A mutation type is flagged when both hold:

    * cosine similarity (``1 - distance.cosine``) is above 0.9, and
    * the Welch t-test (``equal_var=False``) p-value is above 0.05.

    Args:
        count_5mer_sample: Per-window count dicts for the sample.
        count_5mer_control: Per-window count dicts for the control, indexed in
            parallel with ``count_5mer_sample``.
        coverage_sample: Divisor applied to the sample counts.
        coverage_control: Divisor applied to the control counts.

    Returns:
        A list with one set per sample window holding the flagged mutation names.
    """
    sequence_errors = []
    for window_no, window_sample in enumerate(count_5mer_sample):
        window_control = count_5mer_control[window_no]
        flagged = set()
        for mutation in ("ins", "del", "sub"):
            freq_sample = [n / coverage_sample for n in window_sample[mutation]]
            freq_control = [n / coverage_control for n in window_control[mutation]]
            # distance.cosine is already 1 - cos, so this value is a similarity.
            similarity = 1 - distance.cosine(freq_sample, freq_control)
            _, pvalue = stats.ttest_ind(freq_sample, freq_control, equal_var=False)
            if similarity > 0.9 and pvalue > 0.05:
                flagged.add(mutation)
        sequence_errors.append(flagged)
    return sequence_errors


def replace_errors_to_atmark(cssplits_sample, sequence_errors, left_idx, right_idx):
    """Mask tokens of flagged mutation types with ``"@"``, window by window.

    Args:
        cssplits_sample: One list of CSSPLIT tokens per read.
        sequence_errors: One collection per 5-mer window naming the mutation
            types (``"ins"``, ``"del"``, ``"sub"``) to mask in that window.
        left_idx: Index of the first position of the first window.
        right_idx: Exclusive end index; windows start every 5 positions.

    Returns:
        A new list of reads in which, inside each window, tokens starting with
        ``"+"`` (ins), ``"-"`` (del) or ``"*"`` (sub) are replaced by ``"@"``
        when that type is flagged. The input reads are not modified.
    """
    prefix_of = {"ins": "+", "del": "-", "sub": "*"}
    window_starts = range(left_idx, right_idx, 5)
    masked_reads = []
    for read in cssplits_sample:
        masked = deepcopy(read)
        for window_no, start in enumerate(window_starts):
            stop = start + 5
            tokens = read[start:stop]
            flagged = sequence_errors[window_no]
            # startswith() with an empty tuple is always False, so windows
            # without flagged types are copied through unchanged.
            prefixes = tuple(prefix_of[name] for name in ("ins", "del", "sub") if name in flagged)
            masked[start:stop] = ["@" if token.startswith(prefixes) else token for token in tokens]
        masked_reads.append(masked)
    return masked_reads


def replace_atmark(cssplits: list[list[str]], sequence: str) -> list[list[str]]:
    """Fill every ``"@"`` placeholder with a token borrowed from the other reads.

    The random generator is re-seeded with 1 on each call, so the output is
    reproducible. Interior positions (1 .. len(sequence) - 2) are handled one
    by one; a masked read receives:

    * ``"N"`` when every read is masked at that position,
    * otherwise a random token from unmasked reads that share the same
      left/right neighbouring tokens,
    * otherwise a random token from all unmasked reads at that position.

    Any ``"@"`` left at the first or last position becomes a match
    (``"="`` + the reference base).

    Args:
        cssplits: One list of CSSPLIT tokens per read; not modified.
        sequence: Reference sequence for these reads.

    Returns:
        A deep copy of ``cssplits`` with the placeholders replaced.
    """
    random.seed(1)
    filled = deepcopy(cssplits)
    for pos in range(1, len(sequence) - 1):
        masked_flank = {}  # read index -> "left,right" neighbour key
        donors_by_flank = defaultdict(list)
        donors_all = []
        for read_no, read in enumerate(cssplits):
            flank = ",".join([read[pos - 1], read[pos + 1]])
            token = read[pos]
            if token == "@":
                masked_flank[read_no] = flank
                continue
            donors_by_flank[flank].append(token)
            donors_all.append(token)
        # Masked reads are visited in read order, which fixes the order of
        # the random draws.
        for read_no, flank in masked_flank.items():
            if not donors_all:
                filled[read_no][pos] = "N"
                continue
            same_context = donors_by_flank[flank]
            filled[read_no][pos] = random.choice(same_context if same_context else donors_all)
    first_match = "=" + sequence[0] if sequence else None
    last_match = "=" + sequence[-1] if sequence else None
    for read in filled:
        if read[0] == "@":
            read[0] = "=" + sequence[0] if first_match is None else first_match
        if read[-1] == "@":
            read[-1] = "=" + sequence[-1] if last_match is None else last_match
    return filled


###############################################################################
# main
###############################################################################


def execute(TEMPDIR: Path, FASTA_ALLELES: dict[str, str], CONTROL_NAME: str, SAMPLE_NAME: str) -> None:
    """Mask and re-fill sequencing errors in the MIDSV files of every allele.

    For each allele, the sample and control ``*_splice_<allele>.jsonl`` files
    under ``TEMPDIR/midsv`` are read, windows whose mutation profile matches
    the control are treated as sequencing errors, the affected tokens are
    masked and re-filled, and both files are overwritten in place.

    Args:
        TEMPDIR: Working directory containing the ``midsv`` sub-directory.
        FASTA_ALLELES: Mapping of allele name to reference sequence.
        CONTROL_NAME: Basename used for the control files.
        SAMPLE_NAME: Basename used for the sample files.
    """
    for allele, sequence in FASTA_ALLELES.items():
        path_sample = Path(TEMPDIR, "midsv", f"{SAMPLE_NAME}_splice_{allele}.jsonl")
        path_control = Path(TEMPDIR, "midsv", f"{CONTROL_NAME}_splice_{allele}.jsonl")
        midsv_sample = midsv.read_jsonl(path_sample)
        midsv_control = midsv.read_jsonl(path_control)
        # Coverage is simply the number of records read from each file.
        coverage_sample = len(midsv_sample)
        coverage_control = len(midsv_control)
        cssplits_sample = [record["CSSPLIT"].split(",") for record in midsv_sample]
        cssplits_control = [record["CSSPLIT"].split(",") for record in midsv_control]

        left_idx, right_idx = set_indexes(sequence)
        count_5mer_sample = count_indels_5mer(cssplits_sample, left_idx, right_idx)
        count_5mer_control = count_indels_5mer(cssplits_control, left_idx, right_idx)
        sequence_errors = extract_sequence_errors(
            count_5mer_sample, count_5mer_control, coverage_sample, coverage_control
        )

        # The same error windows are masked in both the sample and the control.
        masked_sample = replace_errors_to_atmark(cssplits_sample, sequence_errors, left_idx, right_idx)
        masked_control = replace_errors_to_atmark(cssplits_control, sequence_errors, left_idx, right_idx)
        filled_sample = replace_atmark(masked_sample, sequence)
        filled_control = replace_atmark(masked_control, sequence)

        # Store the corrected tokens back into the records as CSSPLIT strings.
        for record, tokens in zip(midsv_sample, filled_sample):
            record["CSSPLIT"] = ",".join(tokens)
        for record, tokens in zip(midsv_control, filled_control):
            record["CSSPLIT"] = ",".join(tokens)

        midsv.write_jsonl(midsv_control, path_control)
        midsv.write_jsonl(midsv_sample, path_sample)


In [28]:
# Inspect a single allele by hand: the "control" reference.
allele = "control"
sequence = FASTA_ALLELES[allele]

In [29]:
# Run the read / count / compare steps by hand for the selected allele so the
# intermediate values can be inspected; nothing is written back to disk here.
midsv_sample = midsv.read_jsonl((Path(TEMPDIR, "midsv", f"{SAMPLE_NAME}_splice_{allele}.jsonl")))
midsv_control = midsv.read_jsonl((Path(TEMPDIR, "midsv", f"{CONTROL_NAME}_splice_{allele}.jsonl")))
# Coverage is the number of records read from each file.
coverage_sample = len(midsv_sample)
coverage_control = len(midsv_control)
cssplits_sample = [cs["CSSPLIT"].split(",") for cs in midsv_sample]
cssplits_control = [cs["CSSPLIT"].split(",") for cs in midsv_control]
left_idx, right_idx = set_indexes(sequence)
count_5mer_sample = count_indels_5mer(cssplits_sample, left_idx, right_idx)
count_5mer_control = count_indels_5mer(cssplits_control, left_idx, right_idx)
sequence_errors = extract_sequence_errors(
    count_5mer_sample, count_5mer_control, coverage_sample, coverage_control
)


In [31]:
from collections import defaultdict
from collections import Counter

# Tally the CSSPLIT tokens seen at index 828 across all sample reads
# (presumably the albino point-mutation position — confirm).
count = defaultdict(int)
for cs in cssplits_sample:
    count[cs[828]] += 1
print(count)

# Same tally for the control reads, for comparison.
count = defaultdict(int)
for cs in cssplits_control:
    count[cs[828]] += 1
print(count)

defaultdict(<class 'int'>, {'=G': 9116, 'N': 3, '*GT': 80, '=c': 1, '-G': 1, '+A|=G': 1})
defaultdict(<class 'int'>, {'=G': 9208, 'N': 3, '=c': 1, '-G': 1, '+A|=G': 1})


In [32]:
# Index of the position under inspection (presumably the point mutation — confirm).
i_pm = 828
# i_pm // 5 picks the 5-mer window holding that index.
# NOTE(review): this matches the windows built by count_indels_5mer only when
# left_idx is 0 — confirm for this sequence length.
print(count_5mer_sample[i_pm//5])
print(count_5mer_control[i_pm//5])

{'ins': [3, 1, 1, 2, 2], 'del': [1, 1, 2, 2, 1], 'sub': [1, 1, 1, 81, 1]}
{'ins': [3, 1, 1, 2, 2], 'del': [1, 1, 2, 2, 1], 'sub': [1, 1, 1, 1, 1]}


In [38]:
# Substitution counts printed above for the window holding index 828:
# one spike in the sample, flat control.
samp = [1, 1, 1, 81, 1]
cont = [1,1,1,1,1]
print(distance.euclidean(samp, cont))
# 1 - cosine distance is the cosine similarity.
print(1 - distance.cosine(samp, cont))

80.0
0.4691552259617494


In [37]:
# Same sample, but the control now has a small bump (5 instead of 1) at the
# spiked position, to see how strongly the cosine similarity reacts.
samp = [1, 1, 1, 81, 1]
cont = [1,1,1,5,1]
print(distance.euclidean(samp, cont))
# 1 - cosine distance is the cosine similarity.
print(1 - distance.cosine(samp, cont))

76.0
0.9373611373027213


### Cosine類似度だけだとControlがほんの少し変化しただけで大きく類似度が変化してしまう

# 結果


+ [x] Albino点変異の1%

無事に1%のアレルを捕捉できていることがわかった

# 次回の課題

+ [x] ~~点変異、欠失、ノックインのデザインで動くかを確認する~~
+ [x] ~~Clusteringのときの警告メッセージは無害なので消去する~~
+ [ ] right_loxpがいまいちな理由を考察する
    + [x] ~~Ayabe-task1のpreprocessにおいて、329と1280の補正がされていない理由を検討する~~
+ [x] ~~distanceの閾値を下げた状態で、1%点変異が検出できるか~~
+ [ ] Insertionのなかにある変異を同定する手法を考案する
+ [ ] `preprocess.correct_sequence_error.replace_atmark`のコードがわかりにくい
    + テストを用意してリファクタリングする
+ [ ] cis変異の両端が欠失している場合に、Nで置き換えるとtransとなってしまうのをどうするか（`replace_n`）
+ [ ] 短いリードの扱いをどうするべきか