In [7]:
import pandas as pd
import numpy as np

from glob import glob


In [8]:
# Species tag: it is inserted into the glob pattern that selects the prediction
# CSVs and into the directory path of the alignment CSVs read by later cells.
species = "k_pneumoniae"

In [9]:
# All prediction CSVs whose file name contains the species tag.
# glob() yields matches in a file-system-dependent order, so sort them to make
# the processing order (and the printed output of later cells) reproducible.
prediction_files = sorted(
    glob("../results/mlplasmids_predictions/*" + species + "*.csv")
)

In [12]:
# metrics using all input sequences
#
# Positive class: plasmid. Reference labels "plasmid" and "ambiguous" both count
# as positives; every other label counts as a negative. One single-row metrics
# CSV is written next to each prediction file.

for prediction_file in prediction_files:

    # Metric CSVs written by this cell sit next to the prediction files and match
    # the same glob pattern, so skip them.
    if "metrics" in prediction_file:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # The isolate name is the file name up to its first underscore. Normalise the
    # path separators first: splitting on "\\" alone only isolates the file name
    # for Windows-style paths and returns part of the directory name otherwise.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]
    isolate = file_name.split("_")[0]

    minimap2_df = pd.read_csv(
        "../data/alignment_files/"
        + species
        + "/"
        + isolate
        + "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for index, row in prediction_df.iterrows():
        # Read outside the try block so that a missing "Prediction" column raises
        # instead of being silently treated as "sequence not found".
        predicted_positive = row["Prediction"] == "Plasmid"

        try:  # not all sequences in the graph file are also in the assembly fasta, leading to key error
            label = minimap2_df.loc[index, "label"]
        except KeyError:
            continue  # no reference label for this sequence: leave it out of the counts

        # Optional filters, currently disabled:
        # if minimap2_df.loc[index,"short_read_contig_length"] <= 1000:
        #    continue
        # if label == "ambiguous":
        #    continue

        actual_positive = label == "plasmid" or label == "ambiguous"

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    positives = tp + fn
    negatives = tn + fp
    predicted_positives = tp + fp
    total = positives + negatives

    # Every ratio falls back to NaN when its denominator is zero, so one
    # degenerate isolate cannot abort the whole loop.
    recall = tp / positives if positives else np.nan  # TP / P = TP / (TP + FN)
    specificity = tn / negatives if negatives else np.nan  # TN / N = TN / (TN + FP)
    precision = (
        tp / predicted_positives if predicted_positives else np.nan
    )  # TP / (TP + FP)
    accuracy = (
        (tp + tn) / total if total else np.nan
    )  # T / all = (TP + TN) / (TP + TN + FP + FN)
    try:
        f1_score = 2 * (
            (recall * precision) / (recall + precision)
        )  # harmonic mean of precision and recall
    except ZeroDivisionError:  # recall and precision are both exactly 0
        f1_score = np.nan

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # prediction_file[:-4] drops the ".csv" suffix before the metrics tag is added.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_plasmid.csv", index=False
    )


170 29 69 2
isolate: kpne-SAMN21366037
recall: 0.9883720930232558
specificity: 0.7040816326530612
precision: 0.8542713567839196
accuracy: 0.8851851851851852
f1_score: 0.9164420485175203
22 39 32 0
isolate: kpne-SAMN21366038
recall: 1.0
specificity: 0.4507042253521127
precision: 0.36065573770491804
accuracy: 0.5806451612903226
f1_score: 0.5301204819277109
47 40 28 2
isolate: kpne-SAMN21366043
recall: 0.9591836734693877
specificity: 0.4117647058823529
precision: 0.5402298850574713
accuracy: 0.6410256410256411
f1_score: 0.6911764705882353
37 7 42 1
isolate: kpne-SAMN21366050
recall: 0.9736842105263158
specificity: 0.8571428571428571
precision: 0.8409090909090909
accuracy: 0.9080459770114943
f1_score: 0.9024390243902439
37 72 56 3
isolate: kpne-SAMN21366063
recall: 0.925
specificity: 0.4375
precision: 0.3394495412844037
accuracy: 0.5535714285714286
f1_score: 0.4966442953020134
43 26 46 2
isolate: kpne-SAMN21366069
recall: 0.9555555555555556
specificity: 0.6388888888888888
precision: 0.6231

In [13]:
# metrics using all input sequences
#
# Positive class: chromosome. Reference labels "chromosome" and "ambiguous" both
# count as positives; every other label counts as a negative. One single-row
# metrics CSV is written next to each prediction file.

for prediction_file in prediction_files:

    # Metric CSVs written by this notebook sit next to the prediction files and
    # match the same glob pattern, so skip them.
    if "metrics" in prediction_file:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # The isolate name is the file name up to its first underscore. Normalise the
    # path separators first: splitting on "\\" alone only isolates the file name
    # for Windows-style paths and returns part of the directory name otherwise.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]
    isolate = file_name.split("_")[0]

    minimap2_df = pd.read_csv(
        "../data/alignment_files/"
        + species
        + "/"
        + isolate
        + "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for index, row in prediction_df.iterrows():
        # Read outside the try block so that a missing "Prediction" column raises
        # instead of being silently treated as "sequence not found".
        predicted_positive = row["Prediction"] == "Chromosome"

        try:  # not all sequences in the graph file are also in the assembly fasta, leading to key error
            label = minimap2_df.loc[index, "label"]
        except KeyError:
            continue  # no reference label for this sequence: leave it out of the counts

        # Optional filters, currently disabled:
        # if minimap2_df.loc[index,"short_read_contig_length"] <= 1000:
        #    continue
        # if label == "ambiguous":
        #    continue

        actual_positive = label == "chromosome" or label == "ambiguous"

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    positives = tp + fn
    negatives = tn + fp
    predicted_positives = tp + fp
    total = positives + negatives

    # Every ratio falls back to NaN when its denominator is zero, so one
    # degenerate isolate cannot abort the whole loop.
    recall = tp / positives if positives else np.nan  # TP / P = TP / (TP + FN)
    specificity = tn / negatives if negatives else np.nan  # TN / N = TN / (TN + FP)
    precision = (
        tp / predicted_positives if predicted_positives else np.nan
    )  # TP / (TP + FP)
    accuracy = (
        (tp + tn) / total if total else np.nan
    )  # T / all = (TP + TN) / (TP + TN + FP + FN)
    try:
        f1_score = 2 * (
            (recall * precision) / (recall + precision)
        )  # harmonic mean of precision and recall
    except ZeroDivisionError:  # recall and precision are both exactly 0
        f1_score = np.nan

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # prediction_file[:-4] drops the ".csv" suffix before the metrics tag is added.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_chromosome.csv",
        index=False,
    )


69 2 135 64
isolate: kpne-SAMN21366037
recall: 0.518796992481203
specificity: 0.9854014598540146
precision: 0.971830985915493
accuracy: 0.7555555555555555
f1_score: 0.6764705882352942
32 0 18 43
isolate: kpne-SAMN21366038
recall: 0.4266666666666667
specificity: 1.0
precision: 1.0
accuracy: 0.5376344086021505
f1_score: 0.5981308411214953
28 2 39 48
isolate: kpne-SAMN21366043
recall: 0.3684210526315789
specificity: 0.9512195121951219
precision: 0.9333333333333333
accuracy: 0.5726495726495726
f1_score: 0.5283018867924527
42 1 35 9
isolate: kpne-SAMN21366050
recall: 0.8235294117647058
specificity: 0.9722222222222222
precision: 0.9767441860465116
accuracy: 0.8850574712643678
f1_score: 0.8936170212765957
57 2 36 73
isolate: kpne-SAMN21366063
recall: 0.43846153846153846
specificity: 0.9473684210526315
precision: 0.9661016949152542
accuracy: 0.5535714285714286
f1_score: 0.6031746031746031
47 1 32 37
isolate: kpne-SAMN21366069
recall: 0.5595238095238095
specificity: 0.9696969696969697
precision