In [2]:
import os
from glob import glob

import numpy as np
import pandas as pd


In [4]:
# Species tag: selects the prediction files (glob pattern) and the
# per-species alignment directory read by the metric cells.
species = "e_faecium"

In [5]:
# All prediction CSVs for the selected species. glob returns matches in a
# file-system-dependent order, so sort them to keep the order in which the
# isolates are processed (and printed) reproducible across machines.
prediction_files = sorted(glob("../results/predictions/*" + species + "*.csv"))

In [13]:
# Metrics using all input sequences, with plasmid as the positive class.
# "Ambiguous" predictions and "ambiguous" reference labels are both counted
# as plasmid. One single-row metrics CSV is written per prediction file.

# Labels treated as the positive class on the prediction / reference side.
positive_predicted_labels = ("Plasmid", "Ambiguous")
positive_reference_labels = ("plasmid", "ambiguous")

for prediction_file in prediction_files:

    # os.path.basename uses the platform's own separator rules, so the file
    # name is extracted correctly on POSIX as well as on Windows (the former
    # split("\\") only worked for Windows-style paths).
    file_name = os.path.basename(prediction_file)

    # Metric files written by earlier runs match the same glob pattern; skip them.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Isolate name = file name without its last four "_"-separated fields.
    isolate = "_".join(file_name.split("_")[0:-4])

    minimap2_df = pd.read_csv(
        "../data/alignment_files/"
        + species
        + "/"
        + isolate
        + "_minimap_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for contig_name, predicted_label in prediction_df["predicted_label"].items():
        # Not all sequences in the graph file are also in the assembly fasta.
        # Such contigs have no reference label and are left out of the counts.
        # This explicit check replaces a blanket `except KeyError`, so a missing
        # "predicted_label" / "label" column now raises instead of silently
        # producing all-zero counts.
        if contig_name not in minimap2_df.index:
            continue

        # Optional filters, currently disabled:
        # if minimap2_df.loc[contig_name,"short_read_contig_length"] <= 1000:
        #    continue
        # if minimap2_df.loc[contig_name,"label"] == "ambiguous":
        #    continue

        reference_label = minimap2_df.loc[contig_name, "label"]
        predicted_positive = predicted_label in positive_predicted_labels
        reference_positive = reference_label in positive_reference_labels

        if predicted_positive and reference_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif reference_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Every ratio falls back to NaN when its denominator is zero (e.g. an
    # isolate without any positive contig), so one degenerate isolate cannot
    # abort the whole loop.
    total = tp + tn + fp + fn
    recall = tp / (tp + fn) if (tp + fn) else np.nan  # TP / P = TP / (TP + FN)
    specificity = tn / (tn + fp) if (tn + fp) else np.nan  # TN / N = TN / (TN + FP)
    precision = tp / (tp + fp) if (tp + fp) else np.nan  # TP / (TP + FP)
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all
    # Harmonic mean of precision and recall. NaN inputs propagate to NaN; the
    # guard only catches recall == precision == 0.
    f1_score = (
        2 * ((recall * precision) / (recall + precision))
        if (recall + precision)
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # prediction_file[:-4] drops the ".csv" extension before the suffix is added.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_plasmid.csv", index=False
    )


0 34 67 0
isolate: E2079
recall: nan
specificity: 0.6633663366336634
precision: 0.0
accuracy: 0.6633663366336634
f1_score: nan
0 0 45 0
isolate: E2364
recall: nan
specificity: 1.0
precision: nan
accuracy: 1.0
f1_score: nan
48 34 115 11
isolate: E4239
recall: 0.8135593220338984
specificity: 0.7718120805369127
precision: 0.5853658536585366
accuracy: 0.7836538461538461
f1_score: 0.6808510638297872
121 15 100 9
isolate: E4457
recall: 0.9307692307692308
specificity: 0.8695652173913043
precision: 0.8897058823529411
accuracy: 0.9020408163265307
f1_score: 0.9097744360902256
69 31 128 7
isolate: E7591
recall: 0.9078947368421053
specificity: 0.8050314465408805
precision: 0.69
accuracy: 0.8382978723404255
f1_score: 0.7840909090909091
105 21 92 8
isolate: E8172
recall: 0.9292035398230089
specificity: 0.8141592920353983
precision: 0.8333333333333334
accuracy: 0.8716814159292036
f1_score: 0.8786610878661087
0 19 87 0
isolate: E9101
recall: nan
specificity: 0.8207547169811321
precision: 0.0
accuracy:

In [12]:
# Metrics using all input sequences, with chromosome as the positive class.
# "Ambiguous" predictions and "ambiguous" reference labels are both counted
# as chromosome. One single-row metrics CSV is written per prediction file.

# Labels treated as the positive class on the prediction / reference side.
positive_predicted_labels = ("Chromosome", "Ambiguous")
positive_reference_labels = ("chromosome", "ambiguous")

for prediction_file in prediction_files:

    # os.path.basename uses the platform's own separator rules, so the file
    # name is extracted correctly on POSIX as well as on Windows (the former
    # split("\\") only worked for Windows-style paths).
    file_name = os.path.basename(prediction_file)

    # Metric files written by earlier runs match the same glob pattern; skip them.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Isolate name = file name without its last four "_"-separated fields.
    isolate = "_".join(file_name.split("_")[0:-4])

    minimap2_df = pd.read_csv(
        "../data/alignment_files/"
        + species
        + "/"
        + isolate
        + "_minimap_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for contig_name, predicted_label in prediction_df["predicted_label"].items():
        # Not all sequences in the graph file are also in the assembly fasta.
        # Such contigs have no reference label and are left out of the counts.
        # This explicit check replaces a blanket `except KeyError`, so a missing
        # "predicted_label" / "label" column now raises instead of silently
        # producing all-zero counts.
        if contig_name not in minimap2_df.index:
            continue

        # Optional filters, currently disabled:
        # if minimap2_df.loc[contig_name,"short_read_contig_length"] <= 1000:
        #    continue
        # if minimap2_df.loc[contig_name,"label"] == "ambiguous":
        #    continue

        reference_label = minimap2_df.loc[contig_name, "label"]
        predicted_positive = predicted_label in positive_predicted_labels
        reference_positive = reference_label in positive_reference_labels

        if predicted_positive and reference_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif reference_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Every ratio falls back to NaN when its denominator is zero (e.g. an
    # isolate without any negative contig), so one degenerate isolate cannot
    # abort the whole loop.
    total = tp + tn + fp + fn
    recall = tp / (tp + fn) if (tp + fn) else np.nan  # TP / P = TP / (TP + FN)
    specificity = tn / (tn + fp) if (tn + fp) else np.nan  # TN / N = TN / (TN + FP)
    precision = tp / (tp + fp) if (tp + fp) else np.nan  # TP / (TP + FP)
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all
    # Harmonic mean of precision and recall. NaN inputs propagate to NaN; the
    # guard only catches recall == precision == 0.
    f1_score = (
        2 * ((recall * precision) / (recall + precision))
        if (recall + precision)
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # prediction_file[:-4] drops the ".csv" extension before the suffix is added.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_chromosome.csv",
        index=False,
    )


89 0 0 12
isolate: E2079
recall: 0.8811881188118812
specificity: nan
precision: 1.0
accuracy: 0.8811881188118812
f1_score: 0.9368421052631579
44 1 0 0
isolate: E2364
recall: 1.0
specificity: 0.0
precision: 0.9777777777777777
accuracy: 0.9777777777777777
f1_score: 0.9887640449438202
159 17 24 8
isolate: E4239
recall: 0.9520958083832335
specificity: 0.5853658536585366
precision: 0.9034090909090909
accuracy: 0.8798076923076923
f1_score: 0.9271137026239067
126 25 86 8
isolate: E4457
recall: 0.9402985074626866
specificity: 0.7747747747747747
precision: 0.8344370860927153
accuracy: 0.8653061224489796
f1_score: 0.8842105263157896
173 16 34 12
isolate: E7591
recall: 0.9351351351351351
specificity: 0.68
precision: 0.9153439153439153
accuracy: 0.8808510638297873
f1_score: 0.9251336898395722
128 12 75 11
isolate: E8172
recall: 0.920863309352518
specificity: 0.8620689655172413
precision: 0.9142857142857143
accuracy: 0.8982300884955752
f1_score: 0.917562724014337
103 0 0 3
isolate: E9101
recall: 0.