In [1]:
import pandas as pd
import numpy as np

from glob import glob


In [2]:
# species: sub-directory of ../data/alignment_files/ that holds the reference labels.
# isolate_prefix: substring used to select this species' prediction CSVs by file name.
species, isolate_prefix = "k_pneumoniae", "kpne-"

In [3]:
# Every CSV in the predictions directory whose name contains the isolate prefix.
# The pattern also matches metrics CSVs written by earlier runs; the metric
# cells skip those by file name.
# glob returns matches in arbitrary, file-system-dependent order, so sort them
# to process and print the isolates in the same order on every run.
prediction_files = sorted(glob("../results/predictions/*" + isolate_prefix + "*.csv"))

In [6]:
# Metrics using all input sequences, with "plasmid" as the positive class.
# Ambiguous contigs count as positive on both sides: a predicted "Ambiguous" is
# treated like "Plasmid" and a reference "ambiguous" like "plasmid".
# For every prediction file one single-row metrics CSV is written next to it.

for prediction_file in prediction_files:

    # metrics files written by earlier runs match the same glob pattern
    if "metrics" in prediction_file:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Strip the directory part regardless of the path separator glob returned
    # ("\\" on Windows, "/" elsewhere), then drop the last four "_"-separated
    # fields of the file name to obtain the isolate name.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]
    isolate = "_".join(file_name.split("_")[0:-4])

    minimap2_df = pd.read_csv(
        "../data/alignment_files/"
        + species
        + "/"
        + isolate
        + "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for contig_name, row in prediction_df.iterrows():
        # not all sequences in the graph file are also in the assembly fasta;
        # contigs without a reference label are skipped
        if contig_name not in minimap2_df.index:
            continue

        reference_label = minimap2_df.loc[contig_name, "label"]

        # optional filters, currently disabled:
        # if minimap2_df.loc[contig_name,"short_read_contig_length"] <= 1000:
        #    continue
        # if reference_label == "ambiguous":
        #    continue

        predicted_positive = row["predicted_label"] in ("Plasmid", "Ambiguous")
        reference_positive = reference_label in ("plasmid", "ambiguous")

        if predicted_positive and reference_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif reference_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Every ratio falls back to NaN when its denominator is zero, so one
    # degenerate isolate does not abort the whole loop.
    try:
        recall = tp / (tp + fn)  # TP / P = TP / (TP + FN)
    except ZeroDivisionError:
        recall = np.nan
    try:
        specificity = tn / (tn + fp)  # TN / N = TN / (TN + FP)
    except ZeroDivisionError:
        specificity = np.nan
    try:
        precision = tp / (tp + fp)  # TP / (TP + FP)
    except ZeroDivisionError:
        precision = np.nan
    try:
        accuracy = (tp + tn) / (
            tp + tn + fp + fn
        )  # T / all = (TP + TN) / (TP + TN + FP + FN)
    except ZeroDivisionError:
        accuracy = np.nan
    try:
        f1_score = 2 * (
            (recall * precision) / (recall + precision)
        )  # harmonic mean of precision and recall
    except ZeroDivisionError:
        f1_score = np.nan

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # prediction_file[:-4] removes the ".csv" extension before the suffix is added
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_plasmid.csv", index=False
    )


140 16 80 32
isolate: kpne-SAMN21366037
recall: 0.813953488372093
specificity: 0.8333333333333334
precision: 0.8974358974358975
accuracy: 0.8208955223880597
f1_score: 0.8536585365853658
16 10 57 7
isolate: kpne-SAMN21366038
recall: 0.6956521739130435
specificity: 0.8507462686567164
precision: 0.6153846153846154
accuracy: 0.8111111111111111
f1_score: 0.6530612244897959
45 2 66 4
isolate: kpne-SAMN21366043
recall: 0.9183673469387755
specificity: 0.9705882352941176
precision: 0.9574468085106383
accuracy: 0.9487179487179487
f1_score: 0.9375000000000001
34 5 32 2
isolate: kpne-SAMN21366050
recall: 0.9444444444444444
specificity: 0.8648648648648649
precision: 0.8717948717948718
accuracy: 0.9041095890410958
f1_score: 0.9066666666666667
35 32 99 6
isolate: kpne-SAMN21366063
recall: 0.8536585365853658
specificity: 0.7557251908396947
precision: 0.5223880597014925
accuracy: 0.7790697674418605
f1_score: 0.6481481481481481
36 7 63 9
isolate: kpne-SAMN21366069
recall: 0.8
specificity: 0.9
precision:

In [7]:
# Metrics using all input sequences, with "chromosome" as the positive class.
# Ambiguous contigs count as positive on both sides: a predicted "Ambiguous" is
# treated like "Chromosome" and a reference "ambiguous" like "chromosome".
# For every prediction file one single-row metrics CSV is written next to it.

for prediction_file in prediction_files:

    # metrics files written by earlier runs match the same glob pattern
    if "metrics" in prediction_file:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Strip the directory part regardless of the path separator glob returned
    # ("\\" on Windows, "/" elsewhere), then drop the last four "_"-separated
    # fields of the file name to obtain the isolate name.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]
    isolate = "_".join(file_name.split("_")[0:-4])

    minimap2_df = pd.read_csv(
        "../data/alignment_files/"
        + species
        + "/"
        + isolate
        + "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for contig_name, row in prediction_df.iterrows():
        # not all sequences in the graph file are also in the assembly fasta;
        # contigs without a reference label are skipped
        if contig_name not in minimap2_df.index:
            continue

        reference_label = minimap2_df.loc[contig_name, "label"]

        # optional filters, currently disabled:
        # if minimap2_df.loc[contig_name,"short_read_contig_length"] <= 1000:
        #    continue
        # if reference_label == "ambiguous":
        #    continue

        predicted_positive = row["predicted_label"] in ("Chromosome", "Ambiguous")
        reference_positive = reference_label in ("chromosome", "ambiguous")

        if predicted_positive and reference_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif reference_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Every ratio falls back to NaN when its denominator is zero, so one
    # degenerate isolate does not abort the whole loop.
    try:
        recall = tp / (tp + fn)  # TP / P = TP / (TP + FN)
    except ZeroDivisionError:
        recall = np.nan
    try:
        specificity = tn / (tn + fp)  # TN / N = TN / (TN + FP)
    except ZeroDivisionError:
        specificity = np.nan
    try:
        precision = tp / (tp + fp)  # TP / (TP + FP)
    except ZeroDivisionError:
        precision = np.nan
    try:
        accuracy = (tp + tn) / (
            tp + tn + fp + fn
        )  # T / all = (TP + TN) / (TP + TN + FP + FN)
    except ZeroDivisionError:
        accuracy = np.nan
    try:
        f1_score = 2 * (
            (recall * precision) / (recall + precision)
        )  # harmonic mean of precision and recall
    except ZeroDivisionError:
        f1_score = np.nan

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # prediction_file[:-4] removes the ".csv" extension before the suffix is added
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_chromosome.csv",
        index=False,
    )


121 64 73 10
isolate: kpne-SAMN21366037
recall: 0.9236641221374046
specificity: 0.5328467153284672
precision: 0.654054054054054
accuracy: 0.7238805970149254
f1_score: 0.7658227848101266
62 10 9 9
isolate: kpne-SAMN21366038
recall: 0.8732394366197183
specificity: 0.47368421052631576
precision: 0.8611111111111112
accuracy: 0.7888888888888889
f1_score: 0.8671328671328671
76 10 31 0
isolate: kpne-SAMN21366043
recall: 1.0
specificity: 0.7560975609756098
precision: 0.8837209302325582
accuracy: 0.9145299145299145
f1_score: 0.9382716049382717
35 3 31 4
isolate: kpne-SAMN21366050
recall: 0.8974358974358975
specificity: 0.9117647058823529
precision: 0.9210526315789473
accuracy: 0.9041095890410958
f1_score: 0.9090909090909091
116 17 22 17
isolate: kpne-SAMN21366063
recall: 0.8721804511278195
specificity: 0.5641025641025641
precision: 0.8721804511278195
accuracy: 0.8023255813953488
f1_score: 0.8721804511278195
75 10 23 7
isolate: kpne-SAMN21366069
recall: 0.9146341463414634
specificity: 0.69696969