In [1]:
import pandas as pd
import numpy as np

from glob import glob


__C. freundii__

In [2]:
# Species key; selects the results and alignment-file sub-directories read by the cells below.
species = "c_freundii"

In [3]:
# All CSV files in this species' results folder. glob's ordering depends on the
# file system, so sort for a reproducible processing/print order. The folder may
# also contain *_metrics_* files written by the metrics cell on earlier runs.
prediction_files = sorted(glob("../results/generalized/" + species + "/*.csv"))

In [4]:
# Metrics for the "ambiguous" class using all input sequences.
# For each prediction CSV of `species`, predicted labels are compared with the
# alignment-derived labels and a one-row metrics CSV is written next to the
# prediction file.

for prediction_file in prediction_files:

    # glob yields OS-specific separators (mixed "/" and "\\" on Windows), so
    # normalise them rather than splitting on "\\" only, which breaks on POSIX.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]

    # Skip the *_metrics_* files this cell itself writes into the same folder.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Isolate name = file name minus its last three "_"-separated tokens.
    isolate = "_".join(file_name.split("_")[0:-3])

    minimap2_df = pd.read_csv(
        "../data/alignment_files/"
        + species
        + "/"
        + isolate
        + "_minimap_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for contig_name, row in prediction_df.iterrows():
        # Not all sequences in the graph file are also in the assembly fasta;
        # contigs without a reference label are skipped. Only this case is
        # tolerated: a missing column still raises KeyError instead of being
        # silently ignored. No length or label filtering is applied.
        if contig_name not in minimap2_df.index:
            continue

        predicted_ambiguous = row["predicted_label"] == "Ambiguous"
        actually_ambiguous = minimap2_df.loc[contig_name, "label"] == "ambiguous"

        if predicted_ambiguous and actually_ambiguous:
            tp += 1
        elif predicted_ambiguous:
            fp += 1
        elif actually_ambiguous:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Every ratio falls back to NaN when its denominator is zero, so a single
    # isolate without positives/negatives cannot abort the whole loop.
    positives = tp + fn
    negatives = tn + fp
    predicted_positives = tp + fp
    total = tp + tn + fp + fn

    recall = tp / positives if positives else np.nan  # TP / (TP + FN)
    specificity = tn / negatives if negatives else np.nan  # TN / (TN + FP)
    precision = (
        tp / predicted_positives if predicted_positives else np.nan
    )  # TP / (TP + FP)
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all

    # Harmonic mean of precision and recall. A NaN input propagates to a NaN
    # score; a zero sum (both metrics 0.0) would otherwise divide by zero.
    precision_recall_sum = recall + precision
    f1_score = (
        2 * ((recall * precision) / precision_recall_sum)
        if precision_recall_sum
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # Written next to the prediction file: "<name>.csv" -> "<name>_metrics_all_ambiguous.csv".
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous.csv", index=False
    )


0 0 57 0
isolate: c_freundii_001
recall: nan
specificity: 1.0
precision: nan
accuracy: 1.0
f1_score: nan
13 3 141 12
isolate: c_freundii_002
recall: 0.52
specificity: 0.9791666666666666
precision: 0.8125
accuracy: 0.9112426035502958
f1_score: 0.6341463414634146
3 5 131 11
isolate: c_freundii_003
recall: 0.21428571428571427
specificity: 0.9632352941176471
precision: 0.375
accuracy: 0.8933333333333333
f1_score: 0.2727272727272727
0 3 90 4
isolate: c_freundii_004
recall: 0.0
specificity: 0.967741935483871
precision: 0.0
accuracy: 0.9278350515463918
f1_score: nan
0 5 97 0
isolate: c_freundii_005
recall: nan
specificity: 0.9509803921568627
precision: 0.0
accuracy: 0.9509803921568627
f1_score: nan
10 19 146 2
isolate: c_freundii_006
recall: 0.8333333333333334
specificity: 0.8848484848484849
precision: 0.3448275862068966
accuracy: 0.8813559322033898
f1_score: 0.4878048780487806
7 24 152 0
isolate: c_freundii_007
recall: 1.0
specificity: 0.8636363636363636
precision: 0.22580645161290322
accura

__K. oxytoca__

In [5]:
# Species key; selects the results and alignment-file sub-directories read by the cells below.
species = "k_oxytoca"

In [6]:
# All CSV files in this species' results folder. glob's ordering depends on the
# file system, so sort for a reproducible processing/print order. The folder may
# also contain *_metrics_* files written by the metrics cell on earlier runs.
prediction_files = sorted(glob("../results/generalized/" + species + "/*.csv"))

In [7]:
# Metrics for the "ambiguous" class using all input sequences.
# For each prediction CSV of `species`, predicted labels are compared with the
# alignment-derived labels and a one-row metrics CSV is written next to the
# prediction file.

for prediction_file in prediction_files:

    # glob yields OS-specific separators (mixed "/" and "\\" on Windows), so
    # normalise them rather than splitting on "\\" only, which breaks on POSIX.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]

    # Skip the *_metrics_* files this cell itself writes into the same folder.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Isolate name = file name minus its last three "_"-separated tokens.
    isolate = "_".join(file_name.split("_")[0:-3])

    minimap2_df = pd.read_csv(
        "../data/alignment_files/"
        + species
        + "/"
        + isolate
        + "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for contig_name, row in prediction_df.iterrows():
        # Not all sequences in the graph file are also in the assembly fasta;
        # contigs without a reference label are skipped. Only this case is
        # tolerated: a missing column still raises KeyError instead of being
        # silently ignored. No length or label filtering is applied.
        if contig_name not in minimap2_df.index:
            continue

        predicted_ambiguous = row["predicted_label"] == "Ambiguous"
        actually_ambiguous = minimap2_df.loc[contig_name, "label"] == "ambiguous"

        if predicted_ambiguous and actually_ambiguous:
            tp += 1
        elif predicted_ambiguous:
            fp += 1
        elif actually_ambiguous:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Every ratio falls back to NaN when its denominator is zero, so a single
    # isolate without positives/negatives cannot abort the whole loop.
    positives = tp + fn
    negatives = tn + fp
    predicted_positives = tp + fp
    total = tp + tn + fp + fn

    recall = tp / positives if positives else np.nan  # TP / (TP + FN)
    specificity = tn / negatives if negatives else np.nan  # TN / (TN + FP)
    precision = (
        tp / predicted_positives if predicted_positives else np.nan
    )  # TP / (TP + FP)
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all

    # Harmonic mean of precision and recall. A NaN input propagates to a NaN
    # score; a zero sum (both metrics 0.0) would otherwise divide by zero.
    precision_recall_sum = recall + precision
    f1_score = (
        2 * ((recall * precision) / precision_recall_sum)
        if precision_recall_sum
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # Written next to the prediction file: "<name>.csv" -> "<name>_metrics_all_ambiguous.csv".
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous.csv", index=False
    )


0 17 135 0
isolate: koxy-SAMN15148173
recall: nan
specificity: 0.8881578947368421
precision: 0.0
accuracy: 0.8881578947368421
f1_score: nan
0 15 121 0
isolate: koxy-SAMN15148174
recall: nan
specificity: 0.8897058823529411
precision: 0.0
accuracy: 0.8897058823529411
f1_score: nan
1 5 150 0
isolate: koxy-SAMN15148287
recall: 1.0
specificity: 0.967741935483871
precision: 0.16666666666666666
accuracy: 0.967948717948718
f1_score: 0.2857142857142857
6 20 162 1
isolate: koxy-SAMN15148459
recall: 0.8571428571428571
specificity: 0.8901098901098901
precision: 0.23076923076923078
accuracy: 0.8888888888888888
f1_score: 0.36363636363636365
18 25 198 4
isolate: koxy-SAMN15148463
recall: 0.8181818181818182
specificity: 0.8878923766816144
precision: 0.4186046511627907
accuracy: 0.8816326530612245
f1_score: 0.5538461538461539
11 13 303 7
isolate: koxy-SAMN15148522
recall: 0.6111111111111112
specificity: 0.9588607594936709
precision: 0.4583333333333333
accuracy: 0.9401197604790419
f1_score: 0.5238095238

__B. megaterium__

In [8]:
# Species key; selects the results and alignment-file sub-directories read by the cells below.
species = "b_megaterium"

In [9]:
# All CSV files in this species' results folder. glob's ordering depends on the
# file system, so sort for a reproducible processing/print order. The folder may
# also contain *_metrics_* files written by the metrics cell on earlier runs.
prediction_files = sorted(glob("../results/generalized/" + species + "/*.csv"))

In [10]:
# Metrics for the "ambiguous" class using all input sequences.
# For each prediction CSV of `species`, predicted labels are compared with the
# alignment-derived labels and a one-row metrics CSV is written next to the
# prediction file.

for prediction_file in prediction_files:

    # glob yields OS-specific separators (mixed "/" and "\\" on Windows), so
    # normalise them rather than splitting on "\\" only, which breaks on POSIX.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]

    # Skip the *_metrics_* files this cell itself writes into the same folder.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Isolate name = file name minus its last three "_"-separated tokens.
    isolate = "_".join(file_name.split("_")[0:-3])

    minimap2_df = pd.read_csv(
        "../data/alignment_files/"
        + species
        + "/"
        + isolate
        + "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for contig_name, row in prediction_df.iterrows():
        # Not all sequences in the graph file are also in the assembly fasta;
        # contigs without a reference label are skipped. Only this case is
        # tolerated: a missing column still raises KeyError instead of being
        # silently ignored. No length or label filtering is applied.
        if contig_name not in minimap2_df.index:
            continue

        predicted_ambiguous = row["predicted_label"] == "Ambiguous"
        actually_ambiguous = minimap2_df.loc[contig_name, "label"] == "ambiguous"

        if predicted_ambiguous and actually_ambiguous:
            tp += 1
        elif predicted_ambiguous:
            fp += 1
        elif actually_ambiguous:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Every ratio falls back to NaN when its denominator is zero, so a single
    # isolate without positives/negatives cannot abort the whole loop.
    positives = tp + fn
    negatives = tn + fp
    predicted_positives = tp + fp
    total = tp + tn + fp + fn

    recall = tp / positives if positives else np.nan  # TP / (TP + FN)
    specificity = tn / negatives if negatives else np.nan  # TN / (TN + FP)
    precision = (
        tp / predicted_positives if predicted_positives else np.nan
    )  # TP / (TP + FP)
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all

    # Harmonic mean of precision and recall. A NaN input propagates to a NaN
    # score; a zero sum (both metrics 0.0) would otherwise divide by zero.
    precision_recall_sum = recall + precision
    f1_score = (
        2 * ((recall * precision) / precision_recall_sum)
        if precision_recall_sum
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # Written next to the prediction file: "<name>.csv" -> "<name>_metrics_all_ambiguous.csv".
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous.csv", index=False
    )


17 12 114 17
isolate: pmeg-SAMN16083678
recall: 0.5
specificity: 0.9047619047619048
precision: 0.5862068965517241
accuracy: 0.81875
f1_score: 0.5396825396825397
5 4 134 5
isolate: pmeg-SAMN16083679
recall: 0.5
specificity: 0.9710144927536232
precision: 0.5555555555555556
accuracy: 0.9391891891891891
f1_score: 0.5263157894736842


__S. pseudintermedius__

In [11]:
# Species key; selects the results and alignment-file sub-directories read by the cells below.
species = "s_pseudintermedius"

In [12]:
# All CSV files in this species' results folder. glob's ordering depends on the
# file system, so sort for a reproducible processing/print order. The folder may
# also contain *_metrics_* files written by the metrics cell on earlier runs.
prediction_files = sorted(glob("../results/generalized/" + species + "/*.csv"))

In [13]:
# Metrics for the "ambiguous" class using all input sequences.
# For each prediction CSV of `species`, predicted labels are compared with the
# alignment-derived labels and a one-row metrics CSV is written next to the
# prediction file.

for prediction_file in prediction_files:

    # glob yields OS-specific separators (mixed "/" and "\\" on Windows), so
    # normalise them rather than splitting on "\\" only, which breaks on POSIX.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]

    # Skip the *_metrics_* files this cell itself writes into the same folder.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Isolate name = file name minus its last three "_"-separated tokens.
    isolate = "_".join(file_name.split("_")[0:-3])

    minimap2_df = pd.read_csv(
        "../data/alignment_files/"
        + species
        + "/"
        + isolate
        + "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for contig_name, row in prediction_df.iterrows():
        # Not all sequences in the graph file are also in the assembly fasta;
        # contigs without a reference label are skipped. Only this case is
        # tolerated: a missing column still raises KeyError instead of being
        # silently ignored. No length or label filtering is applied.
        if contig_name not in minimap2_df.index:
            continue

        predicted_ambiguous = row["predicted_label"] == "Ambiguous"
        actually_ambiguous = minimap2_df.loc[contig_name, "label"] == "ambiguous"

        if predicted_ambiguous and actually_ambiguous:
            tp += 1
        elif predicted_ambiguous:
            fp += 1
        elif actually_ambiguous:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Every ratio falls back to NaN when its denominator is zero, so a single
    # isolate without positives/negatives cannot abort the whole loop.
    positives = tp + fn
    negatives = tn + fp
    predicted_positives = tp + fp
    total = tp + tn + fp + fn

    recall = tp / positives if positives else np.nan  # TP / (TP + FN)
    specificity = tn / negatives if negatives else np.nan  # TN / (TN + FP)
    precision = (
        tp / predicted_positives if predicted_positives else np.nan
    )  # TP / (TP + FP)
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all

    # Harmonic mean of precision and recall. A NaN input propagates to a NaN
    # score; a zero sum (both metrics 0.0) would otherwise divide by zero.
    precision_recall_sum = recall + precision
    f1_score = (
        2 * ((recall * precision) / precision_recall_sum)
        if precision_recall_sum
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # Written next to the prediction file: "<name>.csv" -> "<name>_metrics_all_ambiguous.csv".
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous.csv", index=False
    )


0 7 51 0
isolate: spse-SAMN10880482
recall: nan
specificity: 0.8793103448275862
precision: 0.0
accuracy: 0.8793103448275862
f1_score: nan
0 5 43 0
isolate: spse-SAMN10880484
recall: nan
specificity: 0.8958333333333334
precision: 0.0
accuracy: 0.8958333333333334
f1_score: nan
0 9 47 4
isolate: spse-SAMN10880486
recall: 0.0
specificity: 0.8392857142857143
precision: 0.0
accuracy: 0.7833333333333333
f1_score: nan


__V. parahaemolyticus__

In [14]:
# Species key; selects the results and alignment-file sub-directories read by the cells below.
species = "v_parahaemolyticus"

In [15]:
# All CSV files in this species' results folder. glob's ordering depends on the
# file system, so sort for a reproducible processing/print order. The folder may
# also contain *_metrics_* files written by the metrics cell on earlier runs.
prediction_files = sorted(glob("../results/generalized/" + species + "/*.csv"))

In [16]:
# Metrics for the "ambiguous" class using all input sequences.
# For each prediction CSV of `species`, predicted labels are compared with the
# alignment-derived labels and a one-row metrics CSV is written next to the
# prediction file.

for prediction_file in prediction_files:

    # glob yields OS-specific separators (mixed "/" and "\\" on Windows), so
    # normalise them rather than splitting on "\\" only, which breaks on POSIX.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]

    # Skip the *_metrics_* files this cell itself writes into the same folder.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Isolate name = file name minus its last three "_"-separated tokens.
    isolate = "_".join(file_name.split("_")[0:-3])

    minimap2_df = pd.read_csv(
        "../data/alignment_files/"
        + species
        + "/"
        + isolate
        + "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for contig_name, row in prediction_df.iterrows():
        # Not all sequences in the graph file are also in the assembly fasta;
        # contigs without a reference label are skipped. Only this case is
        # tolerated: a missing column still raises KeyError instead of being
        # silently ignored. No length or label filtering is applied.
        if contig_name not in minimap2_df.index:
            continue

        predicted_ambiguous = row["predicted_label"] == "Ambiguous"
        actually_ambiguous = minimap2_df.loc[contig_name, "label"] == "ambiguous"

        if predicted_ambiguous and actually_ambiguous:
            tp += 1
        elif predicted_ambiguous:
            fp += 1
        elif actually_ambiguous:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Every ratio falls back to NaN when its denominator is zero, so a single
    # isolate without positives/negatives cannot abort the whole loop.
    positives = tp + fn
    negatives = tn + fp
    predicted_positives = tp + fp
    total = tp + tn + fp + fn

    recall = tp / positives if positives else np.nan  # TP / (TP + FN)
    specificity = tn / negatives if negatives else np.nan  # TN / (TN + FP)
    precision = (
        tp / predicted_positives if predicted_positives else np.nan
    )  # TP / (TP + FP)
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all

    # Harmonic mean of precision and recall. A NaN input propagates to a NaN
    # score; a zero sum (both metrics 0.0) would otherwise divide by zero.
    precision_recall_sum = recall + precision
    f1_score = (
        2 * ((recall * precision) / precision_recall_sum)
        if precision_recall_sum
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # Written next to the prediction file: "<name>.csv" -> "<name>_metrics_all_ambiguous.csv".
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous.csv", index=False
    )


9 10 49 0
isolate: vpar-SAMEA5540085
recall: 1.0
specificity: 0.8305084745762712
precision: 0.47368421052631576
accuracy: 0.8529411764705882
f1_score: 0.6428571428571429
0 18 111 0
isolate: vpar-SAMEA5540086
recall: nan
specificity: 0.8604651162790697
precision: 0.0
accuracy: 0.8604651162790697
f1_score: nan
1 11 86 2
isolate: vpar-SAMEA5540087
recall: 0.3333333333333333
specificity: 0.8865979381443299
precision: 0.08333333333333333
accuracy: 0.87
f1_score: 0.13333333333333333
