# 今回の取り組み

下記の2つについて、`correct_sequence_error`などで補正するときに、**変異候補の塩基配列のみを対象とする**ことで対応できる可能性がある

+ [ ] cis変異の両端が欠失している場合に、Nで置き換えるとtransとなってしまうのをどうするか（`replace_n`）
+ [ ] 短いリードの扱いをどうするべきか

- 変異候補の塩基配列のみを対象とする
    - 両端が欠失しているようなリードについて、変異候補部位を含まないリードは`uncategorized`といったカテゴリにできる
    - よって**変異候補部位を含むか含まないか**を考えることで、短いリードや両端が欠失しているリードの分類が可能になる？

## いつものセットアップ

In [1]:
# Notebook bootstrap: make the repository root importable and the working directory.
# (Translated original comment: "incantation to put the root directory on the path".)
import sys, os
from pathlib import Path
# Only relocate when the current directory is not already named "DAJIN2".
if Path(os.getcwd()).stem != "DAJIN2":
    # "__file__" is a plain string literal here, so abspath() resolves it against the
    # current working directory; parent_path ends up two levels above that directory.
    parent_path = str(Path(os.path.dirname(os.path.abspath("__file__"))).parent.parent)
    sys.path.append(parent_path)
    os.chdir(parent_path)
print(os.getcwd())

/mnt/c/Users/akihi/Documents/GitHub/DAJIN2


In [2]:
%%bash
# pipの更新
pip install -q -U pip
pip install -q -U -r requirements.txt

# 実験

- `preprocess.correct_sequence_error`と`preprocess.correct_knockin`でシークエンスエラーではない変異箇所を抽出する
- もしくは、コードを独立させたほうが良いかもしれません
    - correctionが終わったあとに、sampleとcontrolと比べて変異のある塩基位置を抽出する
    - correctionと独立させることで、correctionの方法が変わっても塩基位置を抽出するコードを変える必要がなくなる

In [3]:
from __future__ import annotations

import sys, os
from pathlib import Path

import hashlib
from collections import defaultdict
from pathlib import Path
from importlib import reload

from src.DAJIN2.core import preprocess, classification, clustering, consensus, report
from src.DAJIN2.core.clustering import clustering

reload(preprocess)
reload(classification)
reload(clustering)
reload(consensus)
reload(report)


##### # * Point mutation
SAMPLE, CONTROL, ALLELE, NAME, GENOME, DEBUG, THREADS = (
    "examples/pm-tyr/barcode31.fq.gz",
    "examples/pm-tyr/barcode32.fq.gz",
    "examples/pm-tyr/design_tyr.fa",
    "test-pm-tyr",
    "mm10",
    True,
    14,
)

print(f"processing {NAME}...")

##########################################################
# Check inputs
##########################################################
preprocess.check_inputs.check_files(SAMPLE, CONTROL, ALLELE)
TEMPDIR = Path("DAJINResults", ".tempdir", NAME)
IS_CACHE_CONTROL = preprocess.check_inputs.exists_cached_control(CONTROL, TEMPDIR)
IS_CACHE_GENOME = preprocess.check_inputs.exists_cached_genome(GENOME, TEMPDIR, IS_CACHE_CONTROL)
UCSC_URL, GOLDENPATH_URL = None, None
if GENOME and not IS_CACHE_GENOME:
    UCSC_URL, GOLDENPATH_URL = preprocess.check_inputs.check_and_fetch_genome(GENOME)

##########################################################
# Format inputs
##########################################################
SAMPLE_NAME = preprocess.format_inputs.extract_basename(SAMPLE)
CONTROL_NAME = preprocess.format_inputs.extract_basename(CONTROL)
FASTA_ALLELES = preprocess.format_inputs.dictionize_allele(ALLELE)
# Leave one core free, but never go below one worker: os.cpu_count() can return
# None, and on a single-core machine "cpu_count() - 1" would otherwise yield 0.
THREADS = max(1, min(THREADS, (os.cpu_count() or 1) - 1))

preprocess.format_inputs.make_directories(TEMPDIR, SAMPLE_NAME, CONTROL_NAME)

if GENOME:
    GENOME_COODINATES = preprocess.format_inputs.fetch_coodinate(GENOME, UCSC_URL, FASTA_ALLELES["control"])
    CHROME_SIZE = preprocess.format_inputs.fetch_chrom_size(GENOME_COODINATES["chr"], GENOME, GOLDENPATH_URL)
    preprocess.format_inputs.cache_coodinates_and_chromsize(TEMPDIR, GENOME, GENOME_COODINATES, CHROME_SIZE)


processing test-pm-tyr...


In [6]:
import midsv
from collections import Counter

from __future__ import annotations

import random
import re
from collections import Counter, defaultdict
from copy import deepcopy
from pathlib import Path
import midsv
import numpy as np
from scipy import stats
from scipy.spatial import distance


def set_indexes(sequence: str):
    """Return ``(left_idx, right_idx)`` bounds that trim ``sequence`` to a multiple of 5.

    The ``len(sequence) % 5`` surplus positions are dropped from the two ends:
    the left end loses the larger half and the right end the smaller half, so
    ``right_idx - left_idx`` is always divisible by 5.
    """
    surplus = len(sequence) % 5
    trim_left = (surplus + 1) // 2
    trim_right = surplus // 2
    return trim_left, len(sequence) - trim_right


def count_indels_5mer(cssplits: list[list[str]], left_idx: int, right_idx: int) -> list[dict]:
    """Count insertion/deletion/substitution tags per position in 5-position windows.

    Args:
        cssplits: One list of tags per read.
        left_idx: Start position of the first window.
        right_idx: Exclusive end position; windows start every 5 positions.

    Returns:
        One dict per window with keys ``"ins"``, ``"del"`` and ``"sub"``. Each
        value is a list of 5 counts that start at 1 and are incremented by the
        number of reads carrying a tag that begins with ``+``, ``-`` or ``*``
        respectively. Tags containing a lowercase ``a``/``c``/``g``/``t``/``n``
        are not counted.
    """
    prefix_to_mutation = {"+": "ins", "-": "del", "*": "sub"}
    columns = [list(column) for column in zip(*cssplits)]
    counts_per_window = []
    for start in range(left_idx, right_idx, 5):
        window_count = {mutation: [1] * 5 for mutation in ("ins", "del", "sub")}
        for offset, column in enumerate(columns[start : start + 5]):
            for tag, occurrences in Counter(column).items():
                mutation = prefix_to_mutation.get(tag[:1])
                # Matches ("=..."), "N" and anything else without a mutation prefix
                # map to None; lowercase bases are skipped as well.
                if mutation is None or re.search(r"a|c|g|t|n", tag):
                    continue
                window_count[mutation][offset] += occurrences
        counts_per_window.append(window_count)
    return counts_per_window


def extract_sequence_errors(count_5mer_sample, count_5mer_control):
    """Flag, per 5-mer window, the mutation types that look like sequencing errors.

    For every window and every mutation type ("ins", "del", "sub") the
    Jensen-Shannon distance between the sample and control count vectors is
    computed. Each distance is then scored as ``(d - mean)**2 / variance``
    over all non-nan distances and compared with the upper bound of the 95%
    interval of a chi-square distribution with 1 degree of freedom.

    Args:
        count_5mer_sample: Per-window count dicts of the sample.
        count_5mer_control: Per-window count dicts of the control.

    Returns:
        A list with one set per sample window. A mutation type is in the set
        when its score is below the cutoff, i.e. sample and control do not
        differ unusually and the signal is treated as a sequencing error.
    """
    mutations = ("ins", "del", "sub")
    sequence_errors = [set() for _ in range(len(count_5mer_sample))]
    dists = defaultdict(list)
    # Calculate Jensen-Shannon distance
    for samp, cont in zip(count_5mer_sample, count_5mer_control):
        for mutation in mutations:
            dists[mutation].append(distance.jensenshannon(samp[mutation], cont[mutation]))
    # Discriminate sequencing errors from real mutations using Hotelling's T-squared distribution
    dists_all = np.array([dists[mutation] for mutation in mutations], dtype=float).flatten()
    dists_valid = dists_all[~np.isnan(dists_all)]
    if dists_valid.size == 0:
        # Nothing to score (no windows, or every distance is nan).
        return sequence_errors
    avg = np.average(dists_valid)
    var = np.var(dists_valid)
    threshold = 0.05
    # The cutoff does not depend on the mutation type, so compute it once.
    thres = stats.chi2.interval(1 - threshold, 1)[1]
    for mutation in mutations:
        for i, xi in enumerate(dists[mutation]):
            # 'nan' means the two distributions are too different, so it could be a real mutation
            if np.isnan(xi):
                continue
            # With zero variance every distance equals the mean, so no window is
            # an outlier; a plain division would give 0/0 = nan and skip them all.
            score = (xi - avg) ** 2 / var if var > 0 else 0.0
            if score < thres:
                sequence_errors[i].add(mutation)
    return sequence_errors


def replace_errors_to_atmark(cssplits_sample, sequence_errors, left_idx, right_idx):
    """Mask tags of mutation types flagged as sequencing errors with ``"@"``.

    Windows of 5 positions starting at ``left_idx`` (up to ``right_idx``,
    exclusive) are paired in order with the entries of ``sequence_errors``.
    In each window, tags starting with ``+``, ``-`` or ``*`` are replaced by
    ``"@"`` when "ins", "del" or "sub" respectively is in that window's entry.

    Returns:
        New read lists; the reads in ``cssplits_sample`` are left untouched.
    """
    symbol_of = {"ins": "+", "del": "-", "sub": "*"}
    window_starts = range(left_idx, right_idx, 5)
    masked_reads = []
    for read in cssplits_sample:
        masked = deepcopy(read)
        for window_no, start in enumerate(window_starts):
            flagged = sequence_errors[window_no]
            # str.startswith accepts a tuple of prefixes; an empty tuple never matches.
            prefixes = tuple(symbol for mutation, symbol in symbol_of.items() if mutation in flagged)
            masked[start : start + 5] = [
                "@" if cs.startswith(prefixes) else cs for cs in read[start : start + 5]
            ]
        masked_reads.append(masked)
    return masked_reads


def replace_atmark(cssplits: list[list[str]], sequence: str) -> list[list[str]]:
    """Replace every ``"@"`` placeholder with a concrete tag.

    For each interior position (index 1 to ``len(sequence) - 2``) an ``"@"`` is
    replaced by:

    * ``"N"`` when every read has ``"@"`` at that position;
    * otherwise a random tag drawn from the reads that are not ``"@"`` there
      and share the same left/right neighbouring tags;
    * otherwise, when no read shares those neighbours, a random tag drawn from
      all reads that are not ``"@"`` at that position.

    An ``"@"`` at the first or last position of a read becomes ``"="`` plus the
    first or last character of ``sequence``.

    Args:
        cssplits: One list of tags per read.
        sequence: Reference sequence; its length determines the positions visited.

    Returns:
        A deep copy of ``cssplits`` with the replacements applied; the input
        lists are not modified.

    Note:
        The global ``random`` generator is re-seeded with 1 on every call, so
        the outcome is reproducible but the caller's random state is reset.
    """
    random.seed(1)
    cssplits_replaced = deepcopy(cssplits)
    sequence_length = len(sequence)
    for i in range(1, sequence_length - 1):
        # read index -> "left,right" neighbour key, for reads that have "@" at position i
        cssplits_atmark = defaultdict(str)
        # neighbour key -> tags seen at position i in reads that are not "@"
        cssplits_sampling_key = defaultdict(list)
        # every non-"@" tag at position i, regardless of neighbours (fallback pool)
        cssplits_sampling_all = []
        flag_all_atmark = True
        for idx, cssplit in enumerate(cssplits):
            # Neighbours are read from the original input, not from the partly replaced copy.
            key = ",".join([cssplit[i - 1], cssplit[i + 1]])
            if cssplit[i] == "@":
                cssplits_atmark[idx] = key
            else:
                cssplits_sampling_key[key].append(cssplit[i])
                cssplits_sampling_all.append(cssplit[i])
                flag_all_atmark = False
        for idx, key in cssplits_atmark.items():
            if flag_all_atmark:
                # No read provides a tag to sample from at this position.
                cssplits_replaced[idx][i] = "N"
            elif cssplits_sampling_key[key]:
                cssplits_replaced[idx][i] = random.choice(cssplits_sampling_key[key])
            else:
                cssplits_replaced[idx][i] = random.choice(cssplits_sampling_all)
    # The loop above skips both ends; fill them with a match to the reference base.
    for cs in cssplits_replaced:
        if cs[0] == "@":
            cs[0] = "=" + sequence[0]
        if cs[-1] == "@":
            cs[-1] = "=" + sequence[-1]
    return cssplits_replaced

def transpose(cssplits):
    """Swap rows and columns of a list of lists.

    Rows are zipped together, so the result is truncated to the shortest row.
    """
    return list(map(list, zip(*cssplits)))


def call_count(transpose_cssplits: list[list[str]]) -> list[dict[str, int]]:
    """Count how often each tag occurs in every row of ``transpose_cssplits``.

    Args:
        transpose_cssplits: List of rows, each a list of tags.

    Returns:
        One plain ``dict`` per row mapping each tag to its number of
        occurrences, in order of first appearance.
    """
    return [dict(Counter(column)) for column in transpose_cssplits]


In [12]:
allele="control"

In [13]:
# Control: read the MIDSV JSONL under TEMPDIR/midsv and split each record's CSSPLIT field on commas
midsv_control = midsv.read_jsonl((Path(TEMPDIR, "midsv", f"{CONTROL_NAME}_splice_{allele}.jsonl")))
cssplits_control = [cs["CSSPLIT"].split(",") for cs in midsv_control]
# Sample: same procedure for the sample file
midsv_sample = midsv.read_jsonl((Path(TEMPDIR, "midsv", f"{SAMPLE_NAME}_splice_{allele}.jsonl")))
cssplits_sample = [cs["CSSPLIT"].split(",") for cs in midsv_sample]
# Swap rows and columns so that each row collects one column of the per-read lists
transpose_control = transpose(cssplits_control)
transpose_sample = transpose(cssplits_sample)
# Make count matrix: one {tag: count} dict per transposed row
count_control = call_count(transpose_control)
count_sample = call_count(transpose_sample)

In [14]:
count_control[0]

{'=T': 800, 'N': 200}

In [15]:
count_sample[0]

{'=T': 910, 'N': 89}

In [28]:
# Extract sequence errors
sequence = FASTA_ALLELES[allele]
left_idx, right_idx = set_indexes(sequence)
count_5mer_sample = count_indels_5mer(cssplits_sample, left_idx, right_idx)
count_5mer_control = count_indels_5mer(cssplits_control, left_idx, right_idx)
sequence_errors = extract_sequence_errors(count_5mer_sample, count_5mer_control)

In [29]:
for i,s in enumerate(sequence_errors):
    if len(s) != 3:
        print(i, s)

0 {'sub', 'del'}
32 {'ins', 'del'}
37 {'sub', 'del'}
43 {'sub', 'del'}
49 {'sub', 'del'}
65 {'sub', 'del'}
67 {'sub', 'del'}
73 {'sub', 'ins'}
95 {'ins', 'del'}
96 {'sub', 'del'}
108 {'sub', 'del'}
120 {'sub', 'del'}
125 {'sub', 'del'}
130 {'sub', 'del'}
165 {'ins'}
166 {'ins', 'del'}
174 {'sub', 'ins'}
188 {'sub', 'del'}
205 {'sub', 'ins'}
215 {'sub', 'ins'}
219 {'sub', 'del'}
251 {'sub', 'del'}
259 {'sub', 'del'}
300 {'sub', 'del'}
318 {'ins', 'del'}
319 {'sub'}
349 {'ins', 'del'}
358 {'sub', 'del'}
393 {'ins', 'del'}
395 {'sub', 'del'}
407 {'sub', 'del'}
419 {'sub', 'del'}
424 {'ins', 'del'}
447 {'sub', 'ins'}
463 {'ins', 'del'}
514 {'sub', 'del'}
537 {'sub', 'ins'}
538 {'sub', 'del'}
542 {'sub', 'del'}
548 {'ins', 'del'}
566 {'sub', 'del'}


妙に`{'sub', 'del'}`が多い気がします…？

### 脱線：`remove_minor_indels`の実装
- 5merの中で、indelの数が少ないものを除外する

In [24]:
sequence_errors = [set() for _ in range(len(count_5mer_sample))]
dists = defaultdict(list)
# Calculate Jensen-Shannon distance
for samp, cont in zip(count_5mer_sample, count_5mer_control):
    for mutation in ["ins", "del", "sub"]:
        s = samp[mutation]
        c = cont[mutation]
        dists[mutation].append(distance.jensenshannon(s, c))


In [34]:
sum(count_5mer_sample[0]["ins"])

10

In [35]:
i = 828
count_5mer_sample[i//5]["sub"]

[7, 9, 18, 816, 33]

In [39]:
int(len(cssplits_sample) * 0.01)

9

In [67]:
def remove_minority_5mer(cssplits, count_5mer: list[dict]) -> list[dict]:
    """Reset rarely observed mutation counts of each 5-mer window to all ones.

    A mutation type of a window is reset to ``[1] * 5`` only when all of its
    five counts are below 1% of the number of reads. The previous condition,
    ``all(True for c in ... if ...)``, was always true and reset every window.

    Args:
        cssplits: Reads; only their number (the coverage) is used.
        count_5mer: Per-window dicts with "ins", "del" and "sub" count lists.

    Returns:
        New per-window ``defaultdict(list)`` objects; ``count_5mer`` itself is
        not modified.
    """
    min_count = len(cssplits) * 0.01
    count_5mer_filtered = []
    for count in count_5mer:
        dict_mutation = defaultdict(list)
        for mutation in ["ins", "del", "sub"]:
            if all(c < min_count for c in count[mutation]):
                dict_mutation[mutation] = [1] * 5
            else:
                # Copy so that the result does not alias the caller's list.
                dict_mutation[mutation] = list(count[mutation])
        count_5mer_filtered.append(dict_mutation)
    return count_5mer_filtered

In [68]:
from copy import deepcopy
cssplits = deepcopy(cssplits_sample)
print(len(cssplits)*0.01)
count_5mer = deepcopy(count_5mer_sample)

9.99


In [69]:
x = remove_minority_5mer(cssplits, count_5mer)

In [70]:
x[0]

defaultdict(list,
            {'ins': [1, 1, 1, 1, 1],
             'del': [1, 1, 1, 1, 1],
             'sub': [1, 1, 1, 1, 1]})

In [84]:
def remove_minor_indels(cssplits: list[list[str]], count_5mer: list[dict]) -> list[dict]:
    """Reset rarely observed mutation counts of each 5-mer window to all ones.

    A mutation type of a window is reset to ``[1] * 5`` when all of its five
    counts are below 1% of the number of reads; otherwise the counts are kept.

    Args:
        cssplits: Reads; only their number (the coverage) is used.
        count_5mer: Per-window dicts with "ins", "del" and "sub" count lists.

    Returns:
        New per-window ``defaultdict(list)`` objects; ``count_5mer`` itself is
        not modified.
    """
    min_count = len(cssplits) * 0.01
    count_5mer_filtered = []
    for count in count_5mer:
        dict_mutation = defaultdict(list)
        for mutation in ["ins", "del", "sub"]:
            if all(c < min_count for c in count[mutation]):
                dict_mutation[mutation] = [1] * 5
            else:
                # Copy so that the result does not alias the caller's list.
                dict_mutation[mutation] = list(count[mutation])
        count_5mer_filtered.append(dict_mutation)
    return count_5mer_filtered


def count_5mer_indels(cssplits: list[list[str]], left_idx: int, right_idx: int) -> list[dict]:
    """Count indel/substitution tags per 5-position window and drop minor signals.

    Args:
        cssplits: One list of tags per read.
        left_idx: Start position of the first window.
        right_idx: Exclusive end position; windows start every 5 positions.

    Returns:
        One dict per window with "ins", "del" and "sub" lists of 5 counts.
        Counts start at 1 and are incremented by the number of reads whose tag
        begins with ``+``, ``-`` or ``*``; tags containing a lowercase
        ``a``/``c``/``g``/``t``/``n`` are ignored. Windows are then passed
        through ``remove_minor_indels``.
    """
    prefix_to_mutation = {"+": "ins", "-": "del", "*": "sub"}
    lowercase_base = re.compile(r"a|c|g|t|n")
    transposed = [list(t) for t in zip(*cssplits)]
    count_5mer = []
    for i in range(left_idx, right_idx, 5):
        count = {"ins": [1] * 5, "del": [1] * 5, "sub": [1] * 5}
        for j, cs in enumerate(transposed[i : i + 5]):
            for key, cnt in Counter(cs).items():
                mutation = prefix_to_mutation.get(key[:1])
                # Matches ("=..."), "N" and other non-mutation tags map to None.
                if mutation is None or lowercase_base.search(key):
                    continue
                count[mutation][j] += cnt
        count_5mer.append(count)
    # Filter once after all windows are counted; filtering inside the loop
    # re-processed every earlier window on each iteration (quadratic work).
    return remove_minor_indels(cssplits, count_5mer)



In [85]:
# Extract sequence errors
sequence = FASTA_ALLELES[allele]
left_idx, right_idx = set_indexes(sequence)
count_5mer_sample = count_5mer_indels(cssplits_sample, left_idx, right_idx)
count_5mer_control = count_5mer_indels(cssplits_control, left_idx, right_idx)
sequence_errors = extract_sequence_errors(count_5mer_sample, count_5mer_control)

In [87]:
for i,s in enumerate(sequence_errors):
    if len(s) != 3:
        print(i, s)

37 {'sub', 'del'}
43 {'sub', 'del'}
49 {'sub', 'del'}
65 {'sub', 'del'}
67 {'sub', 'del'}
73 {'sub', 'ins'}
95 {'ins', 'del'}
96 {'sub', 'del'}
108 {'sub', 'del'}
125 {'sub', 'del'}
165 {'ins'}
166 {'ins', 'del'}
174 {'sub', 'ins'}
188 {'sub', 'del'}
205 {'sub', 'ins'}
215 {'sub', 'ins'}
219 {'sub', 'del'}
251 {'sub', 'del'}
259 {'sub', 'del'}
300 {'sub', 'del'}
318 {'ins', 'del'}
319 {'sub', 'ins'}
349 {'ins', 'del'}
358 {'sub', 'del'}
393 {'ins', 'del'}
395 {'sub', 'del'}
407 {'sub', 'del'}
419 {'sub', 'del'}
424 {'ins', 'del'}
447 {'sub', 'ins'}
463 {'ins', 'del'}
514 {'sub', 'del'}
537 {'sub', 'ins'}
538 {'sub', 'del'}
542 {'sub', 'del'}
548 {'ins', 'del'}
566 {'sub', 'del'}


In [92]:
print(count_5mer_sample[165])
print(count_5mer_control[165])

defaultdict(<class 'list'>, {'ins': [1, 1, 1, 1, 1], 'del': [10, 16, 17, 131, 11], 'sub': [7, 9, 18, 816, 33]})
defaultdict(<class 'list'>, {'ins': [1, 1, 1, 1, 1], 'del': [4, 11, 15, 30, 22], 'sub': [9, 14, 55, 44, 28]})


### 脱線：`remove_minor_indels`が動くかテスト

## 変異部位の抽出

- preprocess.correct後のサンプルについてコンセンサス配列を取り、（`N`を無視して）1%以上の変異がある塩基配列部位のみを抽出する

In [7]:
import midsv
allele="control"
print(f"{SAMPLE_NAME}_splice_{allele}.jsonl")

barcode31_splice_control.jsonl


In [8]:
midsv_sample = midsv.read_jsonl((Path(TEMPDIR, "midsv", f"{SAMPLE_NAME}_splice_{allele}.jsonl")))
midsv_control = midsv.read_jsonl((Path(TEMPDIR, "midsv", f"{CONTROL_NAME}_splice_{allele}.jsonl")))
cssplits_sample = [cs["CSSPLIT"].split(",") for cs in midsv_sample]
cssplits_control = [cs["CSSPLIT"].split(",") for cs in midsv_control]

In [9]:
def calc_percent_indels(cssplits):
    """Return, per position, the percentage of reads with an ins/del/sub tag.

    Reads whose tag is exactly ``"N"`` are excluded from the coverage of that
    position. A tag counts as "ins", "del" or "sub" when it starts with ``+``,
    ``-`` or ``*``. Positions with zero coverage report 0 for every type.
    """
    prefix_to_mutation = {"+": "ins", "-": "del", "*": "sub"}
    percent_by_position = []
    for column in zip(*cssplits):
        called = [cs for cs in column if cs != "N"]
        if not called:
            percent_by_position.append({"ins": 0, "del": 0, "sub": 0})
            continue
        tally = {"ins": 0, "del": 0, "sub": 0}
        for cs in called:
            mutation = prefix_to_mutation.get(cs[:1])
            if mutation is not None:
                tally[mutation] += 1
        coverage = len(called)
        percent_by_position.append(
            {mutation: (count / coverage * 100) for mutation, count in tally.items()}
        )
    return percent_by_position

In [10]:
from copy import deepcopy
cssplits = cssplits_sample
x = calc_percent_indels(cssplits)
for i, xx in enumerate(x):
    if sum(xx.values()) > 1:
        print(i, xx)

188 {'ins': 1.6227180527383367, 'del': 0.0, 'sub': 0.0}
245 {'ins': 1.7258883248730965, 'del': 0.0, 'sub': 0.0}
246 {'ins': 1.015228426395939, 'del': 0.0, 'sub': 0.0}
325 {'ins': 1.3224821973550356, 'del': 0.0, 'sub': 0.0}
335 {'ins': 2.034587995930824, 'del': 0.0, 'sub': 0.0}
336 {'ins': 1.119023397761953, 'del': 0.0, 'sub': 0.0}
337 {'ins': 1.017293997965412, 'del': 0.0, 'sub': 0.0}
365 {'ins': 0.0, 'del': 2.136317395727365, 'sub': 0.0}
475 {'ins': 0.0, 'del': 0.0, 'sub': 1.2269938650306749}
542 {'ins': 1.3360739979445015, 'del': 0.0, 'sub': 0.0}
825 {'ins': 0.0, 'del': 0.9316770186335404, 'sub': 0.6211180124223602}
826 {'ins': 0.0, 'del': 1.5527950310559007, 'sub': 0.8281573498964804}
827 {'ins': 0.0, 'del': 1.6563146997929608, 'sub': 1.7598343685300208}
828 {'ins': 0.0, 'del': 13.457556935817806, 'sub': 84.36853002070393}
829 {'ins': 0.0, 'del': 1.0351966873706004, 'sub': 3.3126293995859215}
830 {'ins': 0.0, 'del': 0.0, 'sub': 5.383022774327122}
872 {'ins': 0.0, 'del': 1.2409513960

In [11]:
cssplits = cssplits_control
x = calc_percent_indels(cssplits)
for i, xx in enumerate(x):
    if sum(xx.values()) > 1:
        print(i, xx)

188 {'ins': 1.0090817356205852, 'del': 0.0, 'sub': 0.0}
189 {'ins': 1.6145307769929365, 'del': 0.0, 'sub': 0.0}
215 {'ins': 2.330293819655522, 'del': 0.0, 'sub': 0.0}
335 {'ins': 1.4285714285714286, 'del': 0.0, 'sub': 0.0}
336 {'ins': 1.3265306122448979, 'del': 0.0, 'sub': 0.0}
365 {'ins': 0.0, 'del': 2.1450459652706844, 'sub': 0.0}
367 {'ins': 0.0, 'del': 1.4300306435137897, 'sub': 0.0}
478 {'ins': 0.0, 'del': 0.0, 'sub': 1.5384615384615385}
479 {'ins': 0.0, 'del': 0.0, 'sub': 1.4344262295081966}
540 {'ins': 1.1293634496919918, 'del': 0.0, 'sub': 0.0}
542 {'ins': 2.5667351129363447, 'del': 0.0, 'sub': 0.0}
625 {'ins': 1.2371134020618557, 'del': 0.0, 'sub': 0.0}
627 {'ins': 1.134020618556701, 'del': 0.0, 'sub': 0.0}
825 {'ins': 0.0, 'del': 0.3112033195020747, 'sub': 0.8298755186721992}
826 {'ins': 0.0, 'del': 1.0373443983402488, 'sub': 1.3485477178423237}
827 {'ins': 0.0, 'del': 1.4522821576763485, 'sub': 5.601659751037345}
828 {'ins': 0.0, 'del': 3.008298755186722, 'sub': 4.4605809128

In [12]:
percent_sample = calc_percent_indels(cssplits_sample)
percent_control = calc_percent_indels(cssplits_control)
mutation_loci = set()
for i, (samp, cont) in enumerate(zip(percent_sample, percent_control)):
    for mutation_type in ["ins", "del", "sub"]:
        if i in mutation_loci:
            break
        if abs(samp[mutation_type] - cont[mutation_type]) > 1:
            mutation_loci.add(i)

In [16]:
i=189
print(percent_sample[i])
print(percent_control[i])

{'ins': 0.4056795131845842, 'del': 0.0, 'sub': 0.0}
{'ins': 1.6145307769929365, 'del': 0.0, 'sub': 0.0}


In [None]:
from __future__ import annotations

def calc_percent_indels(cssplits):
    """Return, per position, the percentage of reads with an ins/del/sub tag.

    Reads whose tag is exactly ``"N"`` are excluded from the coverage of that
    position. A tag counts as "ins", "del" or "sub" when it starts with ``+``,
    ``-`` or ``*``. Positions with zero coverage report 0.0 for every type.
    """
    percent_indels = []
    for column in zip(*cssplits):
        observed = [cs for cs in column if cs != "N"]
        coverage = len(observed)
        if coverage == 0:
            per_indels = {"ins": 0.0, "del": 0.0, "sub": 0.0}
        else:
            per_indels = {
                "ins": sum(cs.startswith("+") for cs in observed) / coverage * 100,
                "del": sum(cs.startswith("-") for cs in observed) / coverage * 100,
                "sub": sum(cs.startswith("*") for cs in observed) / coverage * 100,
            }
        percent_indels.append(per_indels)
    return percent_indels


def extract_mutation_loci(cssplits_sample, cssplits_control, threshold_percent: float = 1.0) -> set[int]:
    """Return the positions where sample and control mutation rates differ.

    Args:
        cssplits_sample: One list of tags per sample read.
        cssplits_control: One list of tags per control read.
        threshold_percent: A position is reported when, for at least one of
            "ins", "del" or "sub", the absolute difference between the sample
            and control percentages is strictly greater than this value.

    Returns:
        The set of 0-based positions that exceed the threshold. Positions are
        paired with ``zip``, so only positions present in both inputs are
        compared.
    """
    percent_sample = calc_percent_indels(cssplits_sample)
    percent_control = calc_percent_indels(cssplits_control)
    return {
        i
        for i, (samp, cont) in enumerate(zip(percent_sample, percent_control))
        if any(
            abs(samp[mutation_type] - cont[mutation_type]) > threshold_percent
            for mutation_type in ("ins", "del", "sub")
        )
    }

In [None]:
midsv_sample = midsv.read_jsonl((Path(TEMPDIR, "midsv", f"{SAMPLE_NAME}_splice_{allele}.jsonl")))
midsv_control = midsv.read_jsonl((Path(TEMPDIR, "midsv", f"{CONTROL_NAME}_splice_{allele}.jsonl")))
cssplits_sample = [cs["CSSPLIT"].split(",") for cs in midsv_sample]
cssplits_control = [cs["CSSPLIT"].split(",") for cs in midsv_control]

i=189
print(percent_sample[i])
print(percent_control[i])

{'ins': 0.4056795131845842, 'del': 0.0, 'sub': 0.0}
{'ins': 1.6145307769929365, 'del': 0.0, 'sub': 0.0}


In [None]:
import numpy as np

In [79]:
def edit_sequence(sequence: str):
    """Return a new string equal to ``sequence`` with its first element replaced by "X".

    The input is expanded into a temporary list of its elements, so the
    caller's object is never modified.
    """
    elements = [*sequence]
    elements[0] = "X"
    return "".join(elements)


def edit_list_sequence(list_sequence: list):
    """Return a shallow copy of ``list_sequence`` whose first item is "X".

    Only the copy is assigned to, so the caller's list keeps its first item.
    """
    edited = list_sequence.copy()
    edited[0] = "X"
    return edited

In [98]:
sequence = ["A" * 5]
list_sequence = list(sequence)

x = edit_sequence(sequence)
print(x[:5])
print(sequence[:5])

XAAAA
['AAA', 'AAA', 'AAA']


In [95]:
x = edit_list_sequence(list_sequence)
print(x[:5])
print(list_sequence[:5])

['AAA', 'AAA', 'AAA']
['X', 'AAA', 'AAA']
['AAA', 'AAA', 'AAA']


In [123]:
from copy import deepcopy

In [124]:
%%timeit
# 2D list
list_sequence = [["."] * 100 for _ in range(100)]
# Deep copy
list_sequence_edit = deepcopy(list_sequence)
# Edit
list_sequence_edit[0][0] = "@"
# print(list_sequence)
# print(list_sequence_edit)

1.8 ms ± 3.83 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [125]:
%%timeit
# 1D list
sequence = ["." * 100 for _ in range(100)]
# Shallow copy
sequence_edit = sequence.copy()
# Split to 2D list
sequence_edit = [list(s) for s in sequence_edit]
# Edit
sequence_edit[0][0] = "@"
# Revert to 1D list
sequence_edit = ["".join(s) for s in sequence_edit]
# print(sequence)
# print(sequence_edit)

95.1 µs ± 454 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


### deepcopyは遅い
- 二次元配列を操作する際には、リストのコピーを作成することで、元のリストを変更しないようにしたい
- 一方でdeepcopyは遅く、copyはshallow copyなので、二次元配列の場合はコピー先の内側のリストを変更するともとのリストまで変更されてしまう
- そのため、もとのデータが文字列であれば1次元の文字列のリストとして保持しておき、関数の中でその都度リストに変換して操作するとよい。こうするとオリジナルの文字列は変更されず、かつdeepcopyを使うよりも遥かに高速になる（上の計測では約1.8 ms → 約95 µs）

# まとめ


- `preprocess.extract_mutation_loci`を`quickstart.py`に組み込んだ

# 次に取り組むこと

### Lists

+ [ ] mutation_lociを利用したアレルの分類・クラスタリング手法を考える
+ [ ] cis変異の両端が欠失している場合に、Nで置き換えるとtransとなってしまうのをどうするか（`replace_n`）
+ [ ] 短いリードの扱いをどうするべきか
+ [ ] Insertionのなかにある変異を同定する手法を考案する
+ [ ] Ayabe-task1のright_loxpがいまいちな理由を考察する
+ [ ] `preprocess.correct_sequence_error.replace_atmark`のコードがわかりにくい
    + テストを用意してリファクタリングする

### Focus
+ [ ] cis変異の両端が欠失している場合に、Nで置き換えるとtransとなってしまうのをどうするか（`replace_n`）
+ [ ] 短いリードの扱いをどうするべきか

両者については、`correct_sequence_error`などで補正するときに、**変異候補の塩基配列のみを対象とする**ことで対応できる可能性がある

- 変異候補の塩基配列のみを対象とする
    - 両端が欠失しているようなリードについて、変異候補部位を含まないリードは`uncategorized`といったカテゴリにできる
    - よって**変異候補部位を含むか含まないか**を考えることで、短いリードや両端が欠失しているリードの分類が可能になる？