In [4]:
import pandas as pd
import numpy as np

from glob import glob


In [5]:
# Species key used below to build the prediction-file glob pattern and the
# path of the "<species>_test_set" alignment directory.
species = "e_coli"

In [23]:
# Collect every mlplasmids prediction CSV whose name contains the species key.
# glob returns matches in arbitrary order, so the list is sorted to keep the
# processing and printing order reproducible across runs and platforms.
prediction_files = sorted(
    glob("../results/mlplasmids_predictions/*" + species + "*.csv")
)

In [20]:
# Per-isolate metrics using all input sequences, with "Plasmid" as the positive
# class. Reference contigs labelled "ambiguous" are counted as positives here.

for prediction_file in prediction_files:

    # Metric CSVs written at the end of this loop land next to the prediction
    # files and match the same glob pattern, so skip them.
    if "metrics" in prediction_file:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # glob yields "\\" separators on Windows and "/" elsewhere; normalise before
    # extracting the file name so the isolate id is parsed correctly on both.
    # The isolate id is the part of the file name before the first underscore.
    file_name = prediction_file.replace("\\", "/").rsplit("/", 1)[-1]
    isolate = file_name.split("_")[0]

    minimap2_df = pd.read_csv(
        "../data/alignment_files/"
        + species
        + "_test_set/"
        + isolate
        + "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for index, row in prediction_df.iterrows():
        # Not all sequences in the graph file are also in the assembly fasta,
        # leading to a KeyError on lookup; those sequences are skipped. Only the
        # lookup is guarded so that unrelated KeyErrors (e.g. a missing
        # "Prediction" column) are not silently swallowed.
        try:
            label = minimap2_df.loc[index, "label"]
        except KeyError:
            continue

        # Disabled filters kept from the original analysis:
        # if minimap2_df.loc[index, "short_read_contig_length"] <= 1000:
        #    continue
        # if label == "ambiguous":
        #    continue

        predicted_positive = row["Prediction"] == "Plasmid"
        actual_positive = label == "plasmid" or label == "ambiguous"

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Every ratio falls back to NaN when its denominator is zero, so a single
    # isolate without positives/negatives (or without any matched sequence)
    # does not abort the whole loop.
    try:
        recall = tp / (tp + fn)  # TP / P = TP / (TP + FN)
    except ZeroDivisionError:
        recall = np.nan
    try:
        specificity = tn / (tn + fp)  # TN / N = TN / (TN + FP)
    except ZeroDivisionError:
        specificity = np.nan
    try:
        precision = tp / (tp + fp)  # TP / (TP + FP)
    except ZeroDivisionError:
        precision = np.nan
    try:
        accuracy = (tp + tn) / (
            tp + tn + fp + fn
        )  # T / all = (TP + TN) / (TP + TN + FP + FN)
    except ZeroDivisionError:
        accuracy = np.nan
    try:
        # Harmonic mean of precision and recall; a NaN input propagates to NaN.
        f1_score = 2 * ((recall * precision) / (recall + precision))
    except ZeroDivisionError:
        f1_score = np.nan

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # One single-row metrics CSV per isolate, written next to its prediction file.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_plasmid.csv", index=False
    )


7 2 117 14
isolate: ecol-SAMN15147962
recall: 0.3333333333333333
specificity: 0.9831932773109243
precision: 0.7777777777777778
accuracy: 0.8857142857142857
f1_score: 0.4666666666666666
10 4 108 8
isolate: ecol-SAMN15147963
recall: 0.5555555555555556
specificity: 0.9642857142857143
precision: 0.7142857142857143
accuracy: 0.9076923076923077
f1_score: 0.6250000000000001
32 12 260 92
isolate: ecol-SAMN15147970
recall: 0.25806451612903225
specificity: 0.9558823529411765
precision: 0.7272727272727273
accuracy: 0.7373737373737373
f1_score: 0.38095238095238093
10 9 141 29
isolate: ecol-SAMN15147972
recall: 0.2564102564102564
specificity: 0.94
precision: 0.5263157894736842
accuracy: 0.798941798941799
f1_score: 0.3448275862068965
3 7 152 14
isolate: ecol-SAMN15147977
recall: 0.17647058823529413
specificity: 0.9559748427672956
precision: 0.3
accuracy: 0.8806818181818182
f1_score: 0.22222222222222224
16 2 106 24
isolate: ecol-SAMN15147983
recall: 0.4
specificity: 0.9814814814814815
precision: 0.88

In [24]:
# Per-isolate metrics using all input sequences, with "Chromosome" as the
# positive class. Reference contigs labelled "ambiguous" are counted as
# positives here.

for prediction_file in prediction_files:

    # Metric CSVs written at the end of this loop land next to the prediction
    # files and match the same glob pattern, so skip them.
    if "metrics" in prediction_file:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # glob yields "\\" separators on Windows and "/" elsewhere; normalise before
    # extracting the file name so the isolate id is parsed correctly on both.
    # The isolate id is the part of the file name before the first underscore.
    file_name = prediction_file.replace("\\", "/").rsplit("/", 1)[-1]
    isolate = file_name.split("_")[0]

    minimap2_df = pd.read_csv(
        "../data/alignment_files/"
        + species
        + "_test_set/"
        + isolate
        + "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for index, row in prediction_df.iterrows():
        # Not all sequences in the graph file are also in the assembly fasta,
        # leading to a KeyError on lookup; those sequences are skipped. Only the
        # lookup is guarded so that unrelated KeyErrors (e.g. a missing
        # "Prediction" column) are not silently swallowed.
        try:
            label = minimap2_df.loc[index, "label"]
        except KeyError:
            continue

        # Disabled filters kept from the original analysis:
        # if minimap2_df.loc[index, "short_read_contig_length"] <= 1000:
        #    continue
        # if label == "ambiguous":
        #    continue

        predicted_positive = row["Prediction"] == "Chromosome"
        actual_positive = label == "chromosome" or label == "ambiguous"

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Every ratio falls back to NaN when its denominator is zero, so a single
    # isolate without positives/negatives (or without any matched sequence)
    # does not abort the whole loop.
    try:
        recall = tp / (tp + fn)  # TP / P = TP / (TP + FN)
    except ZeroDivisionError:
        recall = np.nan
    try:
        specificity = tn / (tn + fp)  # TN / N = TN / (TN + FP)
    except ZeroDivisionError:
        specificity = np.nan
    try:
        precision = tp / (tp + fp)  # TP / (TP + FP)
    except ZeroDivisionError:
        precision = np.nan
    try:
        accuracy = (tp + tn) / (
            tp + tn + fp + fn
        )  # T / all = (TP + TN) / (TP + TN + FP + FN)
    except ZeroDivisionError:
        accuracy = np.nan
    try:
        # Harmonic mean of precision and recall; a NaN input propagates to NaN.
        f1_score = 2 * ((recall * precision) / (recall + precision))
    except ZeroDivisionError:
        f1_score = np.nan

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # One single-row metrics CSV per isolate, written next to its prediction file.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_chromosome.csv",
        index=False,
    )


126 5 6 3
isolate: ecol-SAMN15147962
recall: 0.9767441860465116
specificity: 0.5454545454545454
precision: 0.9618320610687023
accuracy: 0.9428571428571428
f1_score: 0.9692307692307692
112 4 9 5
isolate: ecol-SAMN15147963
recall: 0.9572649572649573
specificity: 0.6923076923076923
precision: 0.9655172413793104
accuracy: 0.9307692307692308
f1_score: 0.9613733905579399
289 63 30 14
isolate: ecol-SAMN15147970
recall: 0.9537953795379538
specificity: 0.3225806451612903
precision: 0.8210227272727273
accuracy: 0.8055555555555556
f1_score: 0.8824427480916032
151 19 9 10
isolate: ecol-SAMN15147972
recall: 0.937888198757764
specificity: 0.32142857142857145
precision: 0.888235294117647
accuracy: 0.8465608465608465
f1_score: 0.9123867069486403
159 7 2 8
isolate: ecol-SAMN15147977
recall: 0.9520958083832335
specificity: 0.2222222222222222
precision: 0.9578313253012049
accuracy: 0.9147727272727273
f1_score: 0.954954954954955
112 18 13 5
isolate: ecol-SAMN15147983
recall: 0.9572649572649573
specificity