# 今回の取り組み

- Nanoporeに最後まで読まれなかった（途中で切断された）リードがクラスタに分かれてしまうので、これを除去したい

- これはControlにも存在するはずだが、現状ではクラスラリングの際にControlにおいて切断リードの大半が含まれるであろうSVを無視しているため、差分が効いていない

- (1) SV判定をコンセンサスコールのあとにずらし、(2) クラスタリングの際にSampleとControlに同様に見られる切断リードを除く

## いつものセットアップ

In [1]:
# ルートディレクトリをPathに含めるおまじない
import sys, os
from pathlib import Path
if Path(os.getcwd()).stem != "DAJIN2":
    parent_path = str(Path(os.path.dirname(os.path.abspath("__file__"))).parent.parent)
    sys.path.append(parent_path)
    os.chdir(parent_path)
print(os.getcwd())

/mnt/d/Research/DAJIN2


In [2]:
%%bash
# pipの更新
pip install -q -U pip
pip install -q -U -r requirements.txt

# 実験

## clusteringにmutation_lociを加える

In [3]:
from __future__ import annotations

import sys, os
from pathlib import Path

import hashlib
from collections import defaultdict
from pathlib import Path
from importlib import reload

from src.DAJIN2.core import preprocess, classification, clustering, consensus, report
from src.DAJIN2.core.clustering import clustering

reload(preprocess)
reload(classification)
reload(clustering)
reload(consensus)
reload(report)


#### #* 2-cut deletion
SAMPLE, CONTROL, ALLELE, NAME, GENOME, DEBUG, THREADS = (
    "examples/del-stx2/barcode25.fq.gz",
    "examples/del-stx2/barcode30.fq.gz",
    "examples/del-stx2/design_stx2.fa",
    "test-stx2-deletion",
    "mm10",
    True,
    14,
)

print(f"processing {NAME}...")

##########################################################
# Check inputs
##########################################################
preprocess.check_inputs.check_files(SAMPLE, CONTROL, ALLELE)
TEMPDIR = Path("DAJINResults", ".tempdir", NAME)
IS_CACHE_CONTROL = preprocess.check_inputs.exists_cached_control(CONTROL, TEMPDIR)
IS_CACHE_GENOME = preprocess.check_inputs.exists_cached_genome(GENOME, TEMPDIR, IS_CACHE_CONTROL)
UCSC_URL, GOLDENPATH_URL = None, None
if GENOME and not IS_CACHE_GENOME:
    UCSC_URL, GOLDENPATH_URL = preprocess.check_inputs.check_and_fetch_genome(GENOME)

##########################################################
# Format inputs
##########################################################
SAMPLE_NAME = preprocess.format_inputs.extract_basename(SAMPLE)
CONTROL_NAME = preprocess.format_inputs.extract_basename(CONTROL)
FASTA_ALLELES = preprocess.format_inputs.dictionize_allele(ALLELE)
THREADS = min(THREADS, os.cpu_count()-1)

preprocess.format_inputs.make_directories(TEMPDIR, SAMPLE_NAME, CONTROL_NAME)

if GENOME:
    GENOME_COODINATES = preprocess.format_inputs.fetch_coodinate(GENOME, UCSC_URL, FASTA_ALLELES["control"])
    CHROME_SIZE = preprocess.format_inputs.fetch_chrom_size(GENOME_COODINATES["chr"], GENOME, GOLDENPATH_URL)
    preprocess.format_inputs.cache_coodinates_and_chromsize(TEMPDIR, GENOME, GENOME_COODINATES, CHROME_SIZE)


processing test-stx2-deletion...


In [4]:

flag1 = Path(TEMPDIR, "midsv", f"{CONTROL_NAME}_splice_control.jsonl").exists()
flag2 = Path(TEMPDIR, "midsv", f"{SAMPLE_NAME}_splice_control.jsonl").exists()
flag = flag1 and flag2

if not flag:
    print("preprocessing...")
    ################################################################################
    # Export fasta files as single-FASTA format
    ################################################################################
    # TODO: use yeild, not export
    for identifier, sequence in FASTA_ALLELES.items():
        contents = "\n".join([">" + identifier, sequence]) + "\n"
        output_fasta = Path(TEMPDIR, "fasta", f"{identifier}.fasta")
        output_fasta.write_text(contents)
    ###############################################################################
    # Mapping with mappy
    ###############################################################################
    for path_fasta in Path(TEMPDIR, "fasta").glob("*.fasta"):
        name_fasta = path_fasta.stem
        preprocess.mappy_align.output_sam(TEMPDIR, path_fasta, name_fasta, CONTROL, CONTROL_NAME, threads=THREADS)
        preprocess.mappy_align.output_sam(TEMPDIR, path_fasta, name_fasta, SAMPLE, SAMPLE_NAME, threads=THREADS)
        preprocess.mappy_align.output_sam(
            TEMPDIR, path_fasta, name_fasta, CONTROL, CONTROL_NAME, preset="splice", threads=THREADS
        )
        preprocess.mappy_align.output_sam(
            TEMPDIR, path_fasta, name_fasta, SAMPLE, SAMPLE_NAME, preset="splice", threads=THREADS
        )
    ########################################################################
    # MIDSV conversion
    ########################################################################
    for path_sam in Path(TEMPDIR, "sam").glob(f"{CONTROL_NAME}_splice_*"):
        preprocess.calc_midsv.output_midsv(TEMPDIR, path_sam)
    for path_sam in Path(TEMPDIR, "sam").glob(f"{SAMPLE_NAME}_splice_*"):
        preprocess.calc_midsv.output_midsv(TEMPDIR, path_sam)
    ###############################################################################
    # CSSPLITS Error Correction
    ###############################################################################
    preprocess.correct_sequence_error.execute(TEMPDIR, FASTA_ALLELES, CONTROL_NAME, SAMPLE_NAME)
    preprocess.correct_knockin.execute(TEMPDIR, FASTA_ALLELES, CONTROL_NAME, SAMPLE_NAME)
    ###############################################################################
    # Convert any `N` as deletions other than consecutive `N` from both ends
    ###############################################################################
    preprocess.replace_N_to_D.execute(TEMPDIR, FASTA_ALLELES, CONTROL_NAME)
    preprocess.replace_N_to_D.execute(TEMPDIR, FASTA_ALLELES, SAMPLE_NAME)
    ###############################################################################
    # Cashe inputs (control)
    ###############################################################################
    if not IS_CACHE_CONTROL:
        control_hash = Path(CONTROL).read_bytes()
        control_hash = hashlib.sha256(control_hash).hexdigest()
        PATH_CACHE_HASH = Path(TEMPDIR, "cache", "control_hash.txt")
        PATH_CACHE_HASH.write_text(str(control_hash))


In [5]:
MUTATION_LOCI = preprocess.extract_mutation_loci(TEMPDIR, FASTA_ALLELES, SAMPLE_NAME, CONTROL_NAME)

In [6]:
MUTATION_LOCI.keys()

dict_keys(['target', 'control', 'inversion'])

In [7]:
########################################################################
# Classify alleles
########################################################################
print("Classify...")

classif_sample = classification.classify_alleles(TEMPDIR, SAMPLE_NAME)


Classify...


In [8]:
classif_sample[0]

{'QNAME': '0013cef2-2bed-49c3-9253-0672c5f77a77',
 'RNAME': 'target',
 'CSSPLIT': 'N,N,N,=C,=C,=A,=G,=G,=G,=T,=G,=T,=C,=T,=C,=A,=T,=A,=G,=T,=G,=T,=T,=T,=G,=A,=A,=G,=G,=C,=T,=C,=C,=T,=A,=A,=A,=T,=T,=G,=C,=C,=C,=A,=G,=T,=G,=T,=T,=C,=A,=G,=C,=T,=G,=G,=G,=G,=A,=A,=A,=G,=A,=C,=C,=A,=T,=C,=A,=G,=C,=T,=A,=G,=G,=C,=A,=G,=G,=A,=T,=C,=C,=A,=A,=A,=G,=G,=A,=T,=A,=A,=T,=G,=A,=G,=T,=G,=T,=G,=C,=C,=C,=C,=A,=T,=G,=G,=G,=A,=C,=T,=C,=T,=G,=G,=C,=T,=T,=A,=G,=C,=C,=G,=C,=A,=G,=C,=T,=C,=T,=A,=C,=C,=C,=T,=A,=A,=G,=C,=C,=C,=A,=C,=A,=G,=T,=T,=G,=A,=G,=G,=G,=G,=T,=A,=G,=T,=G,=G,=A,=T,=C,=T,=T,=G,=T,=G,=T,=T,=T,=G,=A,=G,=G,=A,=C,=A,=T,=T,=A,=A,=C,=A,=A,=C,=A,=G,=G,=C,=T,=G,=A,=T,=T,=G,=G,=G,=A,=A,=A,=G,=T,=A,=G,=T,=G,=G,=T,=T,=G,=C,=T,=T,=G,=A,=T,=G,=G,=A,=G,=T,=T,=G,=G,=G,=C,=T,=A,=G,=C,=G,=A,=T,=G,=G,=A,=G,=G,=T,=G,=A,=G,=T,=G,=A,=G,=T,=C,=T,=G,=G,=A,=G,=G,=C,=C,=A,=G,=T,=T,=G,=T,=G,=T,=G,=C,=C,=G,=C,=A,=T,=A,=C,=T,=A,=A,=C,=A,=G,=A,=G,=G,=A,=G,=T,=A,=A,=G,=C,=A,=C,=C,=A,=G,=C,=T,=A,=G,=A,=T,=T,=T,=T,=C,=A,=T

In [9]:
from __future__ import annotations
from itertools import groupby
from collections import defaultdict
from pathlib import Path
import midsv

from src.DAJIN2.core.preprocess.correct_knockin import extract_knockin_loci
from src.DAJIN2.core.clustering.preprocess import replace_both_ends_n, compress_insertion
from src.DAJIN2.core.clustering.make_score import make_score
from src.DAJIN2.core.clustering.return_labels import return_labels


def extract_cssplits_in_mutation(cssplits_sample: list[list], mutation_loci: set) -> list[list]:
    cssplits_mutation = []
    for cssplits in cssplits_sample:
        cs_mutation = []
        for i, cs in enumerate(cssplits):
            if i in mutation_loci:
                cs_mutation.append(cs)
        cssplits_mutation.append(cs_mutation)
    return cssplits_mutation


def annotate_score(cssplits: list[list], mutation_score: list[dict]):
    scores = []
    for cssplit in cssplits:
        score = [0]
        for i in range(1, len(cssplit) - 1):
            if not mutation_score[i]:
                score.append(0)
                continue
            kmer = ",".join([cssplit[i - 1], cssplit[i], cssplit[i + 1]])
            score.append(mutation_score[i].get(kmer, 0))
        scores.append(score + [0])
    return scores


In [10]:
allele = "target"
mutation_loci = MUTATION_LOCI[allele]
knockin_alleles = extract_knockin_loci(TEMPDIR)

In [11]:
# control
midsv_control = midsv.read_jsonl((Path(TEMPDIR, "midsv", f"{CONTROL_NAME}_splice_{allele}.jsonl")))
cssplits_control = [cs["CSSPLIT"].split(",") for cs in midsv_control]
# Sample
cssplits_sample = [cs["CSSPLIT"].split(",") for cs in classif_sample if cs["ALLELE"] == allele]

In [31]:
cssplits_control = compress_insertion(cssplits_control)
cssplits_sample = compress_insertion(cssplits_sample)
mutation_score = make_score(cssplits_control, cssplits_sample, knockin_alleles[allele])
scores_control = annotate_score(cssplits_control, mutation_score)
scores_sample = annotate_score(cssplits_sample, mutation_score)
labels = return_labels(scores_sample, scores_control)


In [32]:
from collections import Counter
Counter(labels)

Counter({2: 1145, 3: 1191, 1: 1512, 4: 473})

In [44]:
count = 0
for cs, label in zip(cssplits_sample, labels):
    if label == 4:
        print(cs[:10])
        print(cs[-10:])
        print("=====================================")
        count += 1
    if count == 10:
        break

['=G', '=C', '=T', '=C', '=C', '=A', '=G', '=G', '=G', '=T']
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
['N', 'N', 'N', 'N', '=C', '=A', '=G', '=G', '=G', '=T']
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
['=G', '=C', '=T', '=C', '=C', '=A', '=G', '=G', '=G', '=T']
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
['=G', '=C', '=T', '=C', '=C', '=A', '=G', '=G', '=G', '=T']
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
['=G', '=C', '=T', '=C', '=C', '=A', '=G', '=G', '=G', '=T']
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
['=G', '=C', '=T', '=C', '=C', '=A', '=G', '=G', '=G', '=T']
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
['=G', '=C', '=T', '=C', '=C', '=A', '=G', '=G', '=G', '=T']
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
['=G', '=C', '=T', '=C', '=C', '=A', '=G', '=G', '=G', '=T']
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
['=G', '=C', '=T', '=C', '=C', '=A', '=G', '=G', '=G', '=T']
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', '

### `return_labels`にてControlで切断リードのクラスタがあるか確認する

In [45]:
from __future__ import annotations
import numpy as np
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.exceptions import ConvergenceWarning
from collections import Counter

from src.DAJIN2.core.clustering.merge_clusters import merge_clusters

import warnings

warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

###############################################################################
# Dimension reduction
###############################################################################


def reduce_dimension(scores_sample: list[list], scores_control_subset: list[list]) -> np.array:
    scores = scores_sample + scores_control_subset
    n_components = min(20, len(scores[0]))
    pca = PCA(n_components=n_components).fit(scores)
    return pca.transform(scores)


def optimize_labels(X: np.array, scores_sample: list[list], scores_control_subset: list[list]) -> list[int]:
    scores = scores_sample + scores_control_subset
    n_components = min(20, len(scores))
    labels_results = [1] * len(scores_sample)
    for i in range(1, n_components):
        np.random.seed(seed=1)
        labels = GaussianMixture(n_components=i, random_state=1).fit_predict(X)
        labels = labels.tolist()
        labels_sample = labels[: len(scores_sample)]
        labels_control = labels[len(scores_sample) :]
        labels_merged = merge_clusters(labels_control, labels_sample)
        # Reads < 1% in the control are considered clustering errors and are not counted
        count_control = Counter(labels_control)
        num_labels_control = sum(1 for reads in count_control.values() if reads / sum(count_control.values()) * 100 > 1)
        if num_labels_control > 1:
            return labels_results
        labels_results = labels_merged
    return labels_results


###############################################################################
# main
###############################################################################


def return_labels(scores_sample: list[list], scores_control: list[list]) -> list[int]:
    np.random.seed(seed=1)
    X_control = reduce_dimension([], scores_control)
    labels = GaussianMixture(n_components=2, random_state=1).fit_predict(X_control)
    label_most = Counter(labels).most_common()[0][0]
    scores_control_subset = [s for l, s in zip(labels, scores_control) if l == label_most][:1000]
    X = reduce_dimension(scores_sample, scores_control_subset)
    labels = optimize_labels(X, scores_sample, scores_control_subset)
    return labels


In [49]:
X_control = reduce_dimension([], scores_control)
labels = GaussianMixture(n_components=4, random_state=1).fit_predict(X_control)
Counter(labels)

Counter({2: 696, 0: 1457, 1: 76, 3: 222})

In [65]:
count = 0
for cs, score, label in zip(cssplits_control, scores_control, labels):
    if label == 3:
        print(cs[-10:])
        print(score[-10:])
        print("=====================================")
        count += 1
    if count == 5:
        break

['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### 末端から連続するNのスコアを反映させる

In [66]:
from __future__ import annotations
from collections import defaultdict
from collections import Counter


def call_count(cssplits: list[list[str]]) -> list[dict[str, int]]:
    """Count cssplits within 3-mer range.
    Args:
        cssplits (list[list[str]])
    Returns:
        list[dict[str, int]]: Both ends are counted as "N" to keep sequence length.
    """
    count_kmer = defaultdict(Counter)
    for cssplit in cssplits:
        for i in range(1, len(cssplit) - 1):
            kmer = ",".join([cssplit[i - 1], cssplit[i], cssplit[i + 1]])
            count_kmer[i] += Counter([kmer])
    coverage = len(cssplits)
    count_score = [{"N": coverage}]
    count_score += [dict(count_kmer[i]) for i in range(1, len(cssplit) - 1)]
    count_score += [{"N": coverage}]
    return count_score


def call_percent(counts: list[dict[str:int]]) -> list[dict[str:float]]:
    cssplit_percent = []
    coverage = sum(counts[0].values())
    for count in counts:
        percent = {k: v / coverage * 100 for k, v in count.items()}
        cssplit_percent.append(percent)
    return cssplit_percent


def subtract_percentage(percent_control, percent_sample, knockin_loci) -> list[dict]:
    sample_subtracted = []
    for i, (cont, samp) in enumerate(zip(percent_control, percent_sample)):
        if i in knockin_loci:
            sample_subtracted.append(samp)
            continue
        samp = Counter(samp)
        samp.subtract(Counter(cont))
        sample_subtracted.append(dict(samp))
    return sample_subtracted


def discard_common_error(sample_subtracted, threshold=0.5):
    sample_discarded = []
    for samp in sample_subtracted:
        remained = {k: v for k, v in samp.items() if v > threshold}
        sample_discarded.append(remained)
    return sample_discarded


def discard_match(sample_subtracted):
    sample_discarded = []
    for samp in sample_subtracted:
        remained = {k: v for k, v in samp.items() if not k.split(",")[1].startswith("=")}
        sample_discarded.append(remained)
    return sample_discarded


###############################################################################
# main
###############################################################################


def make_score(cssplits_control, cssplits_sample, knockin_loci):
    counts_control = call_count(cssplits_control)
    counts_sample = call_count(cssplits_sample)
    percent_control = call_percent(counts_control)
    percent_sample = call_percent(counts_sample)
    percent_subtraction = subtract_percentage(percent_control, percent_sample, knockin_loci)
    percent_discarded = discard_common_error(percent_subtraction, 0.5)
    mutation_score = discard_match(percent_discarded)
    return mutation_score


In [86]:
knockin_loci = knockin_alleles[allele]
counts_control = call_count(cssplits_control)
counts_sample = call_count(cssplits_sample)
percent_control = call_percent(counts_control)
percent_sample = call_percent(counts_sample)
percent_subtraction = subtract_percentage(percent_control, percent_sample, knockin_loci)
percent_discarded = discard_common_error(percent_subtraction, 0.5)
mutation_score = discard_match(percent_discarded)

In [None]:
mutation_score[-10:]

[{}, {}, {}, {'=T,N,N': 0.50074730158928}, {}, {}, {}, {}, {}, {}]

In [74]:
print(percent_control[-2])
print(percent_sample[-2])
print(percent_subtraction[-2])
print(percent_discarded[-2])

{'=T,=G,=T': 68.9514483884129, 'N,N,N': 25.336597307221542, '=T,N,N': 1.346389228886169, '=T,=G,N': 4.324765401876785, '=t,=g,=t': 0.04079967360261118}
{'=T,=G,=T': 70.28465632955334, '=T,=G,N': 4.628558204119416, 'N,N,N': 23.999074288359175, '=T,N,N': 1.0645683869474658, '=t,=g,=t': 0.023142791020597086}
{'=T,=G,=T': 1.3332079411404436, '=T,=G,N': 0.30379280224263105, 'N,N,N': -1.3375230188623668, '=T,N,N': -0.28182084193870316, '=t,=g,=t': -0.017656882582014092}
{'=T,=G,=T': 1.3332079411404436}


- `discard_common_error`をすることでNの情報が消えている
- `discard_common_error`を無くすとどうなる？

In [95]:
def discard_match(sample_subtracted):
    sample_discarded = []
    for samp in sample_subtracted:
        if samp.keys() == {"N"}:
            sample_discarded.append(samp)
            continue
        remained = {k: v for k, v in samp.items() if not k.split(",")[1].startswith("=")}
        sample_discarded.append(remained)
    return sample_discarded

percent_subtraction = subtract_percentage(percent_control, percent_sample, knockin_loci)
mutation_score = discard_match(percent_subtraction)

In [97]:
print(mutation_score[-2:])
print(len(mutation_score))

[{'N,N,N': -1.3375230188623668, '=T,N,N': -0.28182084193870316}, {'N': 0.0}]
3582


In [98]:
allele = "target"
mutation_loci = MUTATION_LOCI[allele]
knockin_alleles = extract_knockin_loci(TEMPDIR)
# control
midsv_control = midsv.read_jsonl((Path(TEMPDIR, "midsv", f"{CONTROL_NAME}_splice_{allele}.jsonl")))
cssplits_control = [cs["CSSPLIT"].split(",") for cs in midsv_control]
# Sample
cssplits_sample = [cs["CSSPLIT"].split(",") for cs in classif_sample if cs["ALLELE"] == allele]
cssplits_control = compress_insertion(cssplits_control)
cssplits_sample = compress_insertion(cssplits_sample)

scores_control = annotate_score(cssplits_control, mutation_score)
scores_sample = annotate_score(cssplits_sample, mutation_score)

In [99]:
print(scores_control[-2:])
print(scores_sample[-2:])


[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [107]:
X_control = reduce_dimension([], scores_control)
labels = GaussianMixture(n_components=5, random_state=1).fit_predict(X_control)
Counter(labels)

Counter({2: 691, 0: 1462, 3: 37, 4: 222, 1: 39})

In [109]:
count = 0
for cs, score, label in zip(cssplits_control, scores_control, labels):
    if label == 4:
        print(cs[-10:])
        print(score[-10:])
        print("=====================================")
        count += 1
    if count == 5:
        break

['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[-1.3874343992519549, -1.4945748520103024, -1.9469687334378243, -1.8897113345194612, -1.3889640329301791, -1.4961044856885337, -1.5483858540610527, -1.3424990494082074, -1.3375230188623668, 0]
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[-1.3874343992519549, -1.4945748520103024, -1.9469687334378243, -1.8897113345194612, -1.3889640329301791, -1.4961044856885337, -1.5483858540610527, -1.3424990494082074, -1.3375230188623668, 0]
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[-1.3874343992519549, -1.4945748520103024, -1.9469687334378243, -1.8897113345194612, -1.3889640329301791, -1.4961044856885337, -1.5483858540610527, -1.3424990494082074, -1.3375230188623668, 0]
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[-1.3874343992519549, -1.4945748520103024, -1.9469687334378243, -1.8897113345194612, -1.3889640329301791, -1.4961044856885337, -1.5483858540610527, -1.3424990494082074, -1.3375230188623668, 0]
['N', 'N', 'N', 'N', 'N'

- `N`のところだけを抽出しようとしても難しい
- 逆に、いまの段階でもsampleにおいて切断リードのクラスタを検出できている
- つまり、**sampleとcontrolを入れ替えれば、controlにおける切断リードのクラスタを検出できるかもしれない**

In [18]:
from __future__ import annotations
from collections import defaultdict
from collections import Counter


def call_count(cssplits: list[list[str]]) -> list[dict[str, int]]:
    """Count cssplits within 3-mer range.
    Args:
        cssplits (list[list[str]])
    Returns:
        list[dict[str, int]]: Both ends are counted as "N" to keep sequence length.
    """
    count_kmer = defaultdict(Counter)
    for cssplit in cssplits:
        for i in range(1, len(cssplit) - 1):
            kmer = ",".join([cssplit[i - 1], cssplit[i], cssplit[i + 1]])
            count_kmer[i] += Counter([kmer])
    coverage = len(cssplits)
    count_score = [{"N,N,N": coverage}]
    count_score += [dict(count_kmer[i]) for i in range(1, len(cssplit) - 1)]
    count_score += [{"N,N,N": coverage}]
    return count_score


def call_percent(counts: list[dict[str:int]]) -> list[dict[str:float]]:
    cssplit_percent = []
    coverage = sum(counts[0].values())
    for count in counts:
        percent = {k: v / coverage * 100 for k, v in count.items()}
        cssplit_percent.append(percent)
    return cssplit_percent


def subtract_percentage(percent_control, percent_sample, knockin_loci) -> list[dict]:
    sample_subtracted = []
    for i, (cont, samp) in enumerate(zip(percent_control, percent_sample)):
        if i in knockin_loci:
            sample_subtracted.append(samp)
            continue
        samp = Counter(samp)
        samp.subtract(Counter(cont))
        sample_subtracted.append(dict(samp))
    return sample_subtracted


def discard_common_error(sample_subtracted, threshold=0.5):
    sample_discarded = []
    for samp in sample_subtracted:
        remained = {k: v for k, v in samp.items() if v > threshold}
        sample_discarded.append(remained)
    return sample_discarded


def discard_match(sample_subtracted):
    sample_discarded = []
    for samp in sample_subtracted:
        remained = {k: v for k, v in samp.items() if not k.split(",")[1].startswith("=")}
        sample_discarded.append(remained)
    return sample_discarded


###############################################################################
# main
###############################################################################


def make_score(cssplits_control, cssplits_sample, knockin_loci):
    counts_control = call_count(cssplits_control)
    counts_sample = call_count(cssplits_sample)
    percent_control = call_percent(counts_control)
    percent_sample = call_percent(counts_sample)
    percent_subtraction = subtract_percentage(percent_control, percent_sample, knockin_loci)
    percent_discarded = discard_common_error(percent_subtraction, 0.5)
    mutation_score = discard_match(percent_discarded)
    return mutation_score


In [132]:
allele = "target"
mutation_loci = MUTATION_LOCI[allele]
knockin_alleles = extract_knockin_loci(TEMPDIR)
# control
midsv_control = midsv.read_jsonl((Path(TEMPDIR, "midsv", f"{CONTROL_NAME}_splice_{allele}.jsonl")))
cssplits_control = [cs["CSSPLIT"].split(",") for cs in midsv_control]
# Sample
cssplits_sample = [cs["CSSPLIT"].split(",") for cs in classif_sample if cs["ALLELE"] == allele]

In [19]:
def extract_cssplits_in_mutation(cssplits_sample: list[list], mutation_loci: set) -> list[list]:
    cssplits_mutation = []
    for cssplits in cssplits_sample:
        cs_mutation = []
        for i, cs in enumerate(cssplits):
            if i in mutation_loci:
                cs_mutation.append(cs)
        cssplits_mutation.append(cs_mutation)
    return cssplits_mutation

def annotate_score(cssplits: list[list], mutation_score: list[dict]):
    scores = []
    for cssplit in cssplits:
        score = [0]
        for i in range(1, len(cssplit) - 1):
            if not mutation_score[i]:
                score.append(0)
                continue
            kmer = ",".join([cssplit[i - 1], cssplit[i], cssplit[i + 1]])
            score.append(mutation_score[i].get(kmer, 0))
        scores.append(score + [0])
    return scores

In [135]:
cssplits_control = extract_cssplits_in_mutation(cssplits_control, mutation_loci)
cssplits_sample = extract_cssplits_in_mutation(cssplits_sample, mutation_loci)
cssplits_control = compress_insertion(cssplits_control)
cssplits_sample = compress_insertion(cssplits_sample)
mutation_score = make_score(cssplits_control, cssplits_sample, knockin_alleles[allele])

In [136]:
scores_control = annotate_score(cssplits_control, mutation_score)
scores_sample = annotate_score(cssplits_sample, mutation_score)

In [137]:
%%time
# 確認
labels = return_labels(scores_sample, scores_control)
Counter(labels)

CPU times: user 27.9 s, sys: 38.3 s, total: 1min 6s
Wall time: 3.96 s


Counter({2: 1145, 3: 1190, 1: 1513, 4: 473})

In [138]:
%%time
# sampleとcontrolを逆にして検証
labels_control = return_labels(scores_control, scores_sample)
Counter(labels_control)

CPU times: user 24.9 s, sys: 38 s, total: 1min 2s
Wall time: 3.84 s


Counter({0: 2375, 2: 76})

In [115]:
count = 0
for cs, score, label in zip(cssplits_control, scores_control, labels_control):
    if label == 2:
        print(cs[-10:])
        print(score[-10:])
        print("=====================================")
        count += 1
    if count == 5:
        break

['=A', '=T', '=T', '=A', '=C', '=T', '=G', '=T', '=G', '=T']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['=A', '=T', '=T', '=A', '=C', '=T', '=G', '=T', '=G', '=T']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['=A', '=T', '=T', '=A', '=C', '=T', '=G', '=T', 'N', 'N']
[0, 0, 0, 0, 0, 0, 0, 0, -0.28182084193870316, 0]
['=A', '=T', '=T', '=A', '=C', '=T', '=G', '=T', '=G', '=T']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['=A', '=T', '=T', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[0, 0, 0, 0.50074730158928, -1.3889640329301791, -1.4961044856885337, -1.5483858540610527, -1.3424990494082074, -1.3375230188623668, 0]


- ダメそうですね…

- シンプルに、両端から`N`が続く場合にはたとえば`1`などの数値を当ててみるとどうでしょう？

In [20]:
def annotate_score(cssplits: list[list], mutation_score: list[dict]):
    scores = []
    for cssplit in cssplits:
        score = [0]
        for i in range(1, len(cssplit) - 1):
            if not mutation_score[i]:
                score.append(0)
                continue
            kmer = ",".join([cssplit[i - 1], cssplit[i], cssplit[i + 1]])
            score.append(mutation_score[i].get(kmer, 0))
        scores.append(score + [0])
    return scores

def update_score(scores: list[list], cssplits: list[list]):
    # 両端からNが続くようであれば、その位置のスコアは`1`とする
    for i, cssplit in enumerate(cssplits):
        flag_n_start = True
        flag_n_end = True
        for j, (start, end) in enumerate(zip(cssplit, cssplit[::-1])):
            if j == (len(cssplit) + 1) // 2:
                break
            if flag_n_start == False and flag_n_end == False:
                break
            if flag_n_start:
                if start == "N":
                    scores[i][j] = 1
                else:
                    flag_n_start = False
            if flag_n_end:
                if end == "N":
                    j_reverse = len(cssplit) - j - 1
                    scores[i][j_reverse] = 1
                else:
                    flag_n_end = False
    return scores

In [192]:
scores_control = annotate_score(cssplits_control, mutation_score)
print(scores_control[4][-10:])
scores_control = update_score(scores_control, cssplits_control)
print(scores_control[4][-10:])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]


In [221]:
count = 0
for i, cssplits in enumerate(cssplits_control):
    if all(cs == "N" for cs in cssplits[-100:]):
        count += 1
        if count <= 5:
            print(i)
print("----------")
print(count)

4
13
17
21
22
----------
403


In [13]:
allele = "target"
mutation_loci = MUTATION_LOCI[allele]
knockin_alleles = extract_knockin_loci(TEMPDIR)
# control
midsv_control = midsv.read_jsonl((Path(TEMPDIR, "midsv", f"{CONTROL_NAME}_splice_{allele}.jsonl")))
cssplits_control = [cs["CSSPLIT"].split(",") for cs in midsv_control]
# Sample
cssplits_sample = [cs["CSSPLIT"].split(",") for cs in classif_sample if cs["ALLELE"] == allele]

In [14]:
cssplits_control = extract_cssplits_in_mutation(cssplits_control, mutation_loci)
cssplits_sample = extract_cssplits_in_mutation(cssplits_sample, mutation_loci)
cssplits_control = compress_insertion(cssplits_control)
cssplits_sample = compress_insertion(cssplits_sample)
mutation_score = make_score(cssplits_control, cssplits_sample, knockin_alleles[allele])
scores_control = annotate_score(cssplits_control, mutation_score)
scores_sample = annotate_score(cssplits_sample, mutation_score)
scores_control = update_score(scores_control, cssplits_control)
scores_sample = update_score(scores_sample, cssplits_sample)

In [21]:
# まずはサンプルが4つに分かれているか確認
from collections import Counter
labels = return_labels(scores_sample, scores_control)
Counter(labels)

Counter({2: 1145, 1: 1191, 3: 1512, 4: 473})

In [24]:
from __future__ import annotations
import numpy as np
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.exceptions import ConvergenceWarning
from collections import Counter

from src.DAJIN2.core.clustering.merge_clusters import merge_clusters

import warnings

warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

###############################################################################
# Dimension reduction
###############################################################################


def reduce_dimension(scores_sample: list[list], scores_control_subset: list[list]) -> np.array:
    scores = scores_sample + scores_control_subset
    n_components = min(20, len(scores[0]))
    pca = PCA(n_components=n_components).fit(scores)
    return pca.transform(scores)


def optimize_labels(X: np.array, scores_sample: list[list], scores_control_subset: list[list]) -> list[int]:
    scores = scores_sample + scores_control_subset
    n_components = min(20, len(scores))
    labels_results = [1] * len(scores_sample)
    for i in range(1, n_components):
        np.random.seed(seed=1)
        labels = GaussianMixture(n_components=i, random_state=1).fit_predict(X)
        labels = labels.tolist()
        labels_sample = labels[: len(scores_sample)]
        labels_control = labels[len(scores_sample) :]
        labels_merged = merge_clusters(labels_control, labels_sample)
        # Reads < 1% in the control are considered clustering errors and are not counted
        count_control = Counter(labels_control)
        num_labels_control = sum(1 for reads in count_control.values() if reads / sum(count_control.values()) * 100 > 1)
        if num_labels_control > 1:
            return labels_results
        labels_results = labels_merged
    return labels_results


###############################################################################
# main
###############################################################################


def return_labels(scores_sample: list[list], scores_control: list[list]) -> list[int]:
    np.random.seed(seed=1)
    X_control = reduce_dimension([], scores_control)
    labels = GaussianMixture(n_components=2, random_state=1).fit_predict(X_control)
    label_most = Counter(labels).most_common()[0][0]
    scores_control_subset = [s for l, s in zip(labels, scores_control) if l == label_most][:1000]
    X = reduce_dimension(scores_sample, scores_control_subset)
    labels = optimize_labels(X, scores_sample, scores_control_subset)
    return labels


In [25]:
# sampleとcontrolを逆にして検証
X_control = reduce_dimension(scores_control, [])
labels_control = optimize_labels(X_control, scores_control, scores_sample)
Counter(labels_control)
# 悪くないですね

Counter({0: 1450,
         18: 153,
         4: 330,
         5: 35,
         10: 106,
         14: 71,
         2: 207,
         16: 49,
         1: 18,
         15: 16,
         12: 16})

In [26]:
# またはControlをSampleのラベル数に合わせてクラスタリングする
X_control = reduce_dimension([], scores_control)
np.random.seed(seed=1)
labels_control = GaussianMixture(n_components=len(Counter(labels)), random_state=1).fit_predict(X_control)
Counter(labels_control)

Counter({0: 2147, 1: 60, 2: 223, 3: 21})

In [27]:
count = 0
for cs, score, label in zip(cssplits_sample, scores_sample, labels):
    if label == 4:
        print(cs[-10:])
        print(score[-10:])
        print("=====================================")
        count += 1
    if count == 5:
        break

['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [28]:
count = 0
for cs, score, label in zip(cssplits_control, scores_control, labels_control):
    if label == 2:
        print(cs[-10:])
        print(score[-10:])
        print("=====================================")
        count += 1
    if count == 5:
        break

['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


### labels=2とlabels_control=3が似ているということを示したい

In [31]:
label_score = [[label,score] for label, score in zip(labels, scores_sample)]
label_score.sort(key=lambda x: x[0])

In [34]:
for label, score in groupby(label_score, key=lambda x: x[0]):
    score = list(score)
    coverage = len(score)
    print(label, coverage)
    

1 1191
2 1145
3 1512
4 473


### 小括：ちょっと休憩

- 連続するNは一括して`1`のスコアを割り当てた (`update_score`: 名前を変更する必要あり)
- ControlをSampleのラベル数に合わせてクラスタリングすることで、Nが連続するクラスタの抽出ができた

次回は
- サンプルとコントロールの各ラベルごとのスコアのまとめ値に対して類似度を測定する
- あまりにも似ているクラスタの扱いをどうするか
    - サンプルの中でほかにもっとも類似するクラスタに割り当てる？
    - `control`とラベルを付ける？

# まとめ


- `MUTATION_LOCI`を`clustering`に組み込みました
    - 速度の向上が期待できる（はず）です

# 次に取り組むこと

### Lists

+ [ ] 短いリードの扱いをどうするべきか
+ [ ] `SV`の判定をconsensus callのあとにする
+ [ ] cis変異の両端が欠失している場合に、Nで置き換えるとtransとなってしまうのをどうするか（`clustering.replace_both_ends_n`）
+ [ ] Insertionのなかにある変異を同定する手法を考案する
+ [ ] Ayabe-taks1のright_loxpがいまいちな理由を考察する
+ [ ] `preprocess.correct_sequence_error.replace_atmark`のコードがわかりにくい
    + テストを用意してリファクタリングする