In [18]:
import os
from glob import glob

import numpy as np
import pandas as pd


__E. coli plasgraph model on K. pneumoniae__

In [19]:
# Settings for this section: `isolate_prefix` is the file-name fragment used in
# the prediction-file glob pattern, `species` is the sub-directory of
# ../data/alignment_files/ that holds the alignment labels.
isolate_prefix = "kpne-"
species = "k_pneumoniae"

In [20]:
# Sorted so that isolates are processed (and printed) in the same order on
# every filesystem; glob itself gives no ordering guarantee.
prediction_files = sorted(
    glob("../results/cross_species/ecoli_model_on_kpneumoniae/*" + isolate_prefix + "*.csv")
)

In [21]:
# metrics using all input sequences
#
# Positive class: plasmid. An "Ambiguous" prediction and an "ambiguous"
# alignment label are both counted as plasmid.
positive_predictions = ("Plasmid", "Ambiguous")
positive_truth_labels = ("plasmid", "ambiguous")

for prediction_file in prediction_files:

    # Platform-aware file name; splitting on "\\" alone only works on Windows.
    file_name = os.path.basename(prediction_file)

    # Metrics tables written by this notebook match the same glob pattern; skip them.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Isolate name = file name minus its last four "_"-separated fields.
    isolate = "_".join(file_name.split("_")[:-4])

    minimap2_df = pd.read_csv(
        f"../data/alignment_files/{species}/{isolate}"
        "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = fp = tn = fn = 0

    for contig_name, row in prediction_df.iterrows():
        # Filters that are currently disabled: skipping contigs whose
        # "short_read_contig_length" is <= 1000 and contigs labelled "ambiguous".
        try:
            true_label = minimap2_df.loc[contig_name, "label"]
        except KeyError:
            # not all sequences in the graph file are also in the assembly
            # fasta; such contigs have no label and are left out of the counts
            continue

        predicted_positive = row["predicted_label"] in positive_predictions
        actual_positive = true_label in positive_truth_labels

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    actual_positives = tp + fn
    actual_negatives = tn + fp
    predicted_positives = tp + fp
    total = tp + tn + fp + fn

    # Every ratio falls back to NaN when its denominator is empty, so one
    # degenerate isolate cannot abort the whole loop.
    recall = tp / actual_positives if actual_positives else np.nan  # TP / (TP + FN)
    specificity = tn / actual_negatives if actual_negatives else np.nan  # TN / (TN + FP)
    precision = tp / predicted_positives if predicted_positives else np.nan  # TP / (TP + FP)
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all

    # harmonic mean of precision and recall; NaN inputs propagate to NaN
    recall_plus_precision = recall + precision
    f1_score = (
        2 * ((recall * precision) / recall_plus_precision)
        if recall_plus_precision
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # One single-row metrics table per isolate, written next to its prediction file.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_plasmid.csv", index=False
    )


54 16 58 4
isolate: kpne-SAMN21366037
recall: 0.9310344827586207
specificity: 0.7837837837837838
precision: 0.7714285714285715
accuracy: 0.8484848484848485
f1_score: 0.8437500000000001
11 8 24 1
isolate: kpne-SAMN21366038
recall: 0.9166666666666666
specificity: 0.75
precision: 0.5789473684210527
accuracy: 0.7954545454545454
f1_score: 0.7096774193548387
18 1 26 2
isolate: kpne-SAMN21366043
recall: 0.9
specificity: 0.9629629629629629
precision: 0.9473684210526315
accuracy: 0.9361702127659575
f1_score: 0.9230769230769231
30 3 30 1
isolate: kpne-SAMN21366050
recall: 0.967741935483871
specificity: 0.9090909090909091
precision: 0.9090909090909091
accuracy: 0.9375
f1_score: 0.9374999999999999
13 9 53 10
isolate: kpne-SAMN21366063
recall: 0.5652173913043478
specificity: 0.8548387096774194
precision: 0.5909090909090909
accuracy: 0.7764705882352941
f1_score: 0.5777777777777778
14 5 40 8
isolate: kpne-SAMN21366069
recall: 0.6363636363636364
specificity: 0.8888888888888888
precision: 0.73684210526

In [22]:
# metrics using all input sequences
#
# Positive class: chromosome. An "Ambiguous" prediction and an "ambiguous"
# alignment label are both counted as chromosome.
positive_predictions = ("Chromosome", "Ambiguous")
positive_truth_labels = ("chromosome", "ambiguous")

for prediction_file in prediction_files:

    # Platform-aware file name; splitting on "\\" alone only works on Windows.
    file_name = os.path.basename(prediction_file)

    # Metrics tables written by this notebook match the same glob pattern; skip them.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Isolate name = file name minus its last four "_"-separated fields.
    isolate = "_".join(file_name.split("_")[:-4])

    minimap2_df = pd.read_csv(
        f"../data/alignment_files/{species}/{isolate}"
        "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = fp = tn = fn = 0

    for contig_name, row in prediction_df.iterrows():
        # Filters that are currently disabled: skipping contigs whose
        # "short_read_contig_length" is <= 1000 and contigs labelled "ambiguous".
        try:
            true_label = minimap2_df.loc[contig_name, "label"]
        except KeyError:
            # not all sequences in the graph file are also in the assembly
            # fasta; such contigs have no label and are left out of the counts
            continue

        predicted_positive = row["predicted_label"] in positive_predictions
        actual_positive = true_label in positive_truth_labels

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    actual_positives = tp + fn
    actual_negatives = tn + fp
    predicted_positives = tp + fp
    total = tp + tn + fp + fn

    # Every ratio falls back to NaN when its denominator is empty, so one
    # degenerate isolate cannot abort the whole loop.
    recall = tp / actual_positives if actual_positives else np.nan  # TP / (TP + FN)
    specificity = tn / actual_negatives if actual_negatives else np.nan  # TN / (TN + FP)
    precision = tp / predicted_positives if predicted_positives else np.nan  # TP / (TP + FP)
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all

    # harmonic mean of precision and recall; NaN inputs propagate to NaN
    recall_plus_precision = recall + precision
    f1_score = (
        2 * ((recall * precision) / recall_plus_precision)
        if recall_plus_precision
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # One single-row metrics table per isolate, written next to its prediction file.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_chromosome.csv",
        index=False,
    )


66 8 46 12
isolate: kpne-SAMN21366037
recall: 0.8461538461538461
specificity: 0.8518518518518519
precision: 0.8918918918918919
accuracy: 0.8484848484848485
f1_score: 0.868421052631579
27 1 10 6
isolate: kpne-SAMN21366038
recall: 0.8181818181818182
specificity: 0.9090909090909091
precision: 0.9642857142857143
accuracy: 0.8409090909090909
f1_score: 0.8852459016393442
27 2 18 0
isolate: kpne-SAMN21366043
recall: 1.0
specificity: 0.9
precision: 0.9310344827586207
accuracy: 0.9574468085106383
f1_score: 0.9642857142857143
32 1 28 3
isolate: kpne-SAMN21366050
recall: 0.9142857142857143
specificity: 0.9655172413793104
precision: 0.9696969696969697
accuracy: 0.9375
f1_score: 0.9411764705882354
55 12 11 7
isolate: kpne-SAMN21366063
recall: 0.8870967741935484
specificity: 0.4782608695652174
precision: 0.8208955223880597
accuracy: 0.7764705882352941
f1_score: 0.8527131782945736
47 7 11 2
isolate: kpne-SAMN21366069
recall: 0.9591836734693877
specificity: 0.6111111111111112
precision: 0.870370370370

__E. faecium plasgraph model on K. pneumoniae__

In [23]:
# Settings for this section: `isolate_prefix` is the file-name fragment used in
# the prediction-file glob pattern, `species` is the sub-directory of
# ../data/alignment_files/ that holds the alignment labels.
isolate_prefix = "kpne-"
species = "k_pneumoniae"

In [24]:
# Sorted so that isolates are processed (and printed) in the same order on
# every filesystem; glob itself gives no ordering guarantee.
prediction_files = sorted(
    glob("../results/cross_species/efaecium_model_on_kpneumoniae/*" + isolate_prefix + "*.csv")
)

In [25]:
# metrics using all input sequences
#
# Positive class: plasmid. An "Ambiguous" prediction and an "ambiguous"
# alignment label are both counted as plasmid.
positive_predictions = ("Plasmid", "Ambiguous")
positive_truth_labels = ("plasmid", "ambiguous")

for prediction_file in prediction_files:

    # Platform-aware file name; splitting on "\\" alone only works on Windows.
    file_name = os.path.basename(prediction_file)

    # Metrics tables written by this notebook match the same glob pattern; skip them.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Isolate name = file name minus its last four "_"-separated fields.
    isolate = "_".join(file_name.split("_")[:-4])

    minimap2_df = pd.read_csv(
        f"../data/alignment_files/{species}/{isolate}"
        "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = fp = tn = fn = 0

    for contig_name, row in prediction_df.iterrows():
        # Filters that are currently disabled: skipping contigs whose
        # "short_read_contig_length" is <= 1000 and contigs labelled "ambiguous".
        try:
            true_label = minimap2_df.loc[contig_name, "label"]
        except KeyError:
            # not all sequences in the graph file are also in the assembly
            # fasta; such contigs have no label and are left out of the counts
            continue

        predicted_positive = row["predicted_label"] in positive_predictions
        actual_positive = true_label in positive_truth_labels

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    actual_positives = tp + fn
    actual_negatives = tn + fp
    predicted_positives = tp + fp
    total = tp + tn + fp + fn

    # Every ratio falls back to NaN when its denominator is empty, so one
    # degenerate isolate cannot abort the whole loop.
    recall = tp / actual_positives if actual_positives else np.nan  # TP / (TP + FN)
    specificity = tn / actual_negatives if actual_negatives else np.nan  # TN / (TN + FP)
    precision = tp / predicted_positives if predicted_positives else np.nan  # TP / (TP + FP)
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all

    # harmonic mean of precision and recall; NaN inputs propagate to NaN
    recall_plus_precision = recall + precision
    f1_score = (
        2 * ((recall * precision) / recall_plus_precision)
        if recall_plus_precision
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # One single-row metrics table per isolate, written next to its prediction file.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_plasmid.csv", index=False
    )


16 2 72 42
isolate: kpne-SAMN21366037
recall: 0.27586206896551724
specificity: 0.972972972972973
precision: 0.8888888888888888
accuracy: 0.6666666666666666
f1_score: 0.42105263157894735
0 2 30 12
isolate: kpne-SAMN21366038
recall: 0.0
specificity: 0.9375
precision: 0.0
accuracy: 0.6818181818181818
f1_score: nan
11 0 27 9
isolate: kpne-SAMN21366043
recall: 0.55
specificity: 1.0
precision: 1.0
accuracy: 0.8085106382978723
f1_score: 0.7096774193548387
27 2 31 4
isolate: kpne-SAMN21366050
recall: 0.8709677419354839
specificity: 0.9393939393939394
precision: 0.9310344827586207
accuracy: 0.90625
f1_score: 0.9
1 6 56 22
isolate: kpne-SAMN21366063
recall: 0.043478260869565216
specificity: 0.9032258064516129
precision: 0.14285714285714285
accuracy: 0.6705882352941176
f1_score: 0.06666666666666667
0 0 45 22
isolate: kpne-SAMN21366069
recall: 0.0
specificity: 1.0
precision: nan
accuracy: 0.6716417910447762
f1_score: nan
16 7 45 20
isolate: kpne-SAMN21366070
recall: 0.4444444444444444
specificity:

In [26]:
# metrics using all input sequences
#
# Positive class: chromosome. An "Ambiguous" prediction and an "ambiguous"
# alignment label are both counted as chromosome.
positive_predictions = ("Chromosome", "Ambiguous")
positive_truth_labels = ("chromosome", "ambiguous")

for prediction_file in prediction_files:

    # Platform-aware file name; splitting on "\\" alone only works on Windows.
    file_name = os.path.basename(prediction_file)

    # Metrics tables written by this notebook match the same glob pattern; skip them.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Isolate name = file name minus its last four "_"-separated fields.
    isolate = "_".join(file_name.split("_")[:-4])

    minimap2_df = pd.read_csv(
        f"../data/alignment_files/{species}/{isolate}"
        "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = fp = tn = fn = 0

    for contig_name, row in prediction_df.iterrows():
        # Filters that are currently disabled: skipping contigs whose
        # "short_read_contig_length" is <= 1000 and contigs labelled "ambiguous".
        try:
            true_label = minimap2_df.loc[contig_name, "label"]
        except KeyError:
            # not all sequences in the graph file are also in the assembly
            # fasta; such contigs have no label and are left out of the counts
            continue

        predicted_positive = row["predicted_label"] in positive_predictions
        actual_positive = true_label in positive_truth_labels

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    actual_positives = tp + fn
    actual_negatives = tn + fp
    predicted_positives = tp + fp
    total = tp + tn + fp + fn

    # Every ratio falls back to NaN when its denominator is empty, so one
    # degenerate isolate cannot abort the whole loop.
    recall = tp / actual_positives if actual_positives else np.nan  # TP / (TP + FN)
    specificity = tn / actual_negatives if actual_negatives else np.nan  # TN / (TN + FP)
    precision = tp / predicted_positives if predicted_positives else np.nan  # TP / (TP + FP)
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all

    # harmonic mean of precision and recall; NaN inputs propagate to NaN
    recall_plus_precision = recall + precision
    f1_score = (
        2 * ((recall * precision) / recall_plus_precision)
        if recall_plus_precision
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # One single-row metrics table per isolate, written next to its prediction file.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_chromosome.csv",
        index=False,
    )


77 45 9 1
isolate: kpne-SAMN21366037
recall: 0.9871794871794872
specificity: 0.16666666666666666
precision: 0.6311475409836066
accuracy: 0.6515151515151515
f1_score: 0.77
32 11 0 1
isolate: kpne-SAMN21366038
recall: 0.9696969696969697
specificity: 0.0
precision: 0.7441860465116279
accuracy: 0.7272727272727273
f1_score: 0.8421052631578948
27 18 2 0
isolate: kpne-SAMN21366043
recall: 1.0
specificity: 0.1
precision: 0.6
accuracy: 0.6170212765957447
f1_score: 0.7499999999999999
32 6 23 3
isolate: kpne-SAMN21366050
recall: 0.9142857142857143
specificity: 0.7931034482758621
precision: 0.8421052631578947
accuracy: 0.859375
f1_score: 0.8767123287671234
62 22 1 0
isolate: kpne-SAMN21366063
recall: 1.0
specificity: 0.043478260869565216
precision: 0.7380952380952381
accuracy: 0.7411764705882353
f1_score: 0.8493150684931507
49 18 0 0
isolate: kpne-SAMN21366069
recall: 1.0
specificity: 0.0
precision: 0.7313432835820896
accuracy: 0.7313432835820896
f1_score: 0.8448275862068965
46 27 8 7
isolate: kpn

__E. faecium plasgraph model on E. coli__

In [27]:
# Settings for this section: `isolate_prefix` is the file-name fragment used in
# the prediction-file glob pattern, `species` is the sub-directory of
# ../data/alignment_files/ that holds the alignment labels.
isolate_prefix = "ecol-"
species = "e_coli_test_set"

In [28]:
# Sorted so that isolates are processed (and printed) in the same order on
# every filesystem; glob itself gives no ordering guarantee.
prediction_files = sorted(
    glob("../results/cross_species/efaecium_model_on_ecoli/*" + isolate_prefix + "*.csv")
)

In [29]:
# metrics using all input sequences
#
# Positive class: plasmid. An "Ambiguous" prediction and an "ambiguous"
# alignment label are both counted as plasmid.
positive_predictions = ("Plasmid", "Ambiguous")
positive_truth_labels = ("plasmid", "ambiguous")

for prediction_file in prediction_files:

    # Platform-aware file name; splitting on "\\" alone only works on Windows.
    file_name = os.path.basename(prediction_file)

    # Metrics tables written by this notebook match the same glob pattern; skip them.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Isolate name = file name minus its last four "_"-separated fields.
    isolate = "_".join(file_name.split("_")[:-4])

    minimap2_df = pd.read_csv(
        f"../data/alignment_files/{species}/{isolate}"
        "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = fp = tn = fn = 0

    for contig_name, row in prediction_df.iterrows():
        # Filters that are currently disabled: skipping contigs whose
        # "short_read_contig_length" is <= 1000 and contigs labelled "ambiguous".
        try:
            true_label = minimap2_df.loc[contig_name, "label"]
        except KeyError:
            # not all sequences in the graph file are also in the assembly
            # fasta; such contigs have no label and are left out of the counts
            continue

        predicted_positive = row["predicted_label"] in positive_predictions
        actual_positive = true_label in positive_truth_labels

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    actual_positives = tp + fn
    actual_negatives = tn + fp
    predicted_positives = tp + fp
    total = tp + tn + fp + fn

    # Every ratio falls back to NaN when its denominator is empty, so one
    # degenerate isolate cannot abort the whole loop.
    recall = tp / actual_positives if actual_positives else np.nan  # TP / (TP + FN)
    specificity = tn / actual_negatives if actual_negatives else np.nan  # TN / (TN + FP)
    precision = tp / predicted_positives if predicted_positives else np.nan  # TP / (TP + FP)
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all

    # harmonic mean of precision and recall; NaN inputs propagate to NaN
    recall_plus_precision = recall + precision
    f1_score = (
        2 * ((recall * precision) / recall_plus_precision)
        if recall_plus_precision
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # One single-row metrics table per isolate, written next to its prediction file.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_plasmid.csv", index=False
    )


0 0 67 9
isolate: ecol-SAMN15147962
recall: 0.0
specificity: 1.0
precision: nan
accuracy: 0.881578947368421
f1_score: nan
0 0 59 8
isolate: ecol-SAMN15147963
recall: 0.0
specificity: 1.0
precision: nan
accuracy: 0.8805970149253731
f1_score: nan
1 0 122 49
isolate: ecol-SAMN15147970
recall: 0.02
specificity: 1.0
precision: 1.0
accuracy: 0.7151162790697675
f1_score: 0.0392156862745098
2 0 94 18
isolate: ecol-SAMN15147972
recall: 0.1
specificity: 1.0
precision: 1.0
accuracy: 0.8421052631578947
f1_score: 0.18181818181818182
8 32 64 1
isolate: ecol-SAMN15147977
recall: 0.8888888888888888
specificity: 0.6666666666666666
precision: 0.2
accuracy: 0.6857142857142857
f1_score: 0.326530612244898
0 0 52 18
isolate: ecol-SAMN15147983
recall: 0.0
specificity: 1.0
precision: nan
accuracy: 0.7428571428571429
f1_score: nan
0 0 43 1
isolate: ecol-SAMN15147990
recall: 0.0
specificity: 1.0
precision: nan
accuracy: 0.9772727272727273
f1_score: nan
0 0 42 2
isolate: ecol-SAMN15147997
recall: 0.0
specificity

In [30]:
# metrics using all input sequences
#
# Positive class: chromosome. An "Ambiguous" prediction and an "ambiguous"
# alignment label are both counted as chromosome.
positive_predictions = ("Chromosome", "Ambiguous")
positive_truth_labels = ("chromosome", "ambiguous")

for prediction_file in prediction_files:

    # Platform-aware file name; splitting on "\\" alone only works on Windows.
    file_name = os.path.basename(prediction_file)

    # Metrics tables written by this notebook match the same glob pattern; skip them.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Isolate name = file name minus its last four "_"-separated fields.
    isolate = "_".join(file_name.split("_")[:-4])

    minimap2_df = pd.read_csv(
        f"../data/alignment_files/{species}/{isolate}"
        "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = fp = tn = fn = 0

    for contig_name, row in prediction_df.iterrows():
        # Filters that are currently disabled: skipping contigs whose
        # "short_read_contig_length" is <= 1000 and contigs labelled "ambiguous".
        try:
            true_label = minimap2_df.loc[contig_name, "label"]
        except KeyError:
            # not all sequences in the graph file are also in the assembly
            # fasta; such contigs have no label and are left out of the counts
            continue

        predicted_positive = row["predicted_label"] in positive_predictions
        actual_positive = true_label in positive_truth_labels

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    actual_positives = tp + fn
    actual_negatives = tn + fp
    predicted_positives = tp + fp
    total = tp + tn + fp + fn

    # Every ratio falls back to NaN when its denominator is empty, so one
    # degenerate isolate cannot abort the whole loop.
    recall = tp / actual_positives if actual_positives else np.nan  # TP / (TP + FN)
    specificity = tn / actual_negatives if actual_negatives else np.nan  # TN / (TN + FP)
    precision = tp / predicted_positives if predicted_positives else np.nan  # TP / (TP + FP)
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all

    # harmonic mean of precision and recall; NaN inputs propagate to NaN
    recall_plus_precision = recall + precision
    f1_score = (
        2 * ((recall * precision) / recall_plus_precision)
        if recall_plus_precision
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # One single-row metrics table per isolate, written next to its prediction file.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_chromosome.csv",
        index=False,
    )


67 9 0 0
isolate: ecol-SAMN15147962
recall: 1.0
specificity: 0.0
precision: 0.881578947368421
accuracy: 0.881578947368421
f1_score: 0.9370629370629371
59 8 0 0
isolate: ecol-SAMN15147963
recall: 1.0
specificity: 0.0
precision: 0.8805970149253731
accuracy: 0.8805970149253731
f1_score: 0.9365079365079364
130 41 1 0
isolate: ecol-SAMN15147970
recall: 1.0
specificity: 0.023809523809523808
precision: 0.7602339181286549
accuracy: 0.7616279069767442
f1_score: 0.8637873754152824
96 16 2 0
isolate: ecol-SAMN15147972
recall: 1.0
specificity: 0.1111111111111111
precision: 0.8571428571428571
accuracy: 0.8596491228070176
f1_score: 0.923076923076923
62 7 1 35
isolate: ecol-SAMN15147977
recall: 0.6391752577319587
specificity: 0.125
precision: 0.8985507246376812
accuracy: 0.6
f1_score: 0.7469879518072289
52 18 0 0
isolate: ecol-SAMN15147983
recall: 1.0
specificity: 0.0
precision: 0.7428571428571429
accuracy: 0.7428571428571429
f1_score: 0.8524590163934427
43 1 0 0
isolate: ecol-SAMN15147990
recall: 1.

__K. pneumoniae plasgraph model on E. coli__

In [31]:
# Settings for this section: `isolate_prefix` is the file-name fragment used in
# the prediction-file glob pattern, `species` is the sub-directory of
# ../data/alignment_files/ that holds the alignment labels.
isolate_prefix = "ecol-"
species = "e_coli_test_set"

In [32]:
# Sorted so that isolates are processed (and printed) in the same order on
# every filesystem; glob itself gives no ordering guarantee.
prediction_files = sorted(
    glob("../results/cross_species/kpneumoniae_model_on_ecoli/*" + isolate_prefix + "*.csv")
)

In [33]:
# metrics using all input sequences
#
# Positive class: plasmid. An "Ambiguous" prediction and an "ambiguous"
# alignment label are both counted as plasmid.
positive_predictions = ("Plasmid", "Ambiguous")
positive_truth_labels = ("plasmid", "ambiguous")

for prediction_file in prediction_files:

    # Platform-aware file name; splitting on "\\" alone only works on Windows.
    file_name = os.path.basename(prediction_file)

    # Metrics tables written by this notebook match the same glob pattern; skip them.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Isolate name = file name minus its last four "_"-separated fields.
    isolate = "_".join(file_name.split("_")[:-4])

    minimap2_df = pd.read_csv(
        f"../data/alignment_files/{species}/{isolate}"
        "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = fp = tn = fn = 0

    for contig_name, row in prediction_df.iterrows():
        # Filters that are currently disabled: skipping contigs whose
        # "short_read_contig_length" is <= 1000 and contigs labelled "ambiguous".
        try:
            true_label = minimap2_df.loc[contig_name, "label"]
        except KeyError:
            # not all sequences in the graph file are also in the assembly
            # fasta; such contigs have no label and are left out of the counts
            continue

        predicted_positive = row["predicted_label"] in positive_predictions
        actual_positive = true_label in positive_truth_labels

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    actual_positives = tp + fn
    actual_negatives = tn + fp
    predicted_positives = tp + fp
    total = tp + tn + fp + fn

    # Every ratio falls back to NaN when its denominator is empty, so one
    # degenerate isolate cannot abort the whole loop.
    recall = tp / actual_positives if actual_positives else np.nan  # TP / (TP + FN)
    specificity = tn / actual_negatives if actual_negatives else np.nan  # TN / (TN + FP)
    precision = tp / predicted_positives if predicted_positives else np.nan  # TP / (TP + FP)
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all

    # harmonic mean of precision and recall; NaN inputs propagate to NaN
    recall_plus_precision = recall + precision
    f1_score = (
        2 * ((recall * precision) / recall_plus_precision)
        if recall_plus_precision
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # One single-row metrics table per isolate, written next to its prediction file.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_plasmid.csv", index=False
    )


8 10 57 1
isolate: ecol-SAMN15147962
recall: 0.8888888888888888
specificity: 0.8507462686567164
precision: 0.4444444444444444
accuracy: 0.8552631578947368
f1_score: 0.5925925925925926
1 1 58 7
isolate: ecol-SAMN15147963
recall: 0.125
specificity: 0.9830508474576272
precision: 0.5
accuracy: 0.8805970149253731
f1_score: 0.2
34 37 85 16
isolate: ecol-SAMN15147970
recall: 0.68
specificity: 0.6967213114754098
precision: 0.4788732394366197
accuracy: 0.6918604651162791
f1_score: 0.5619834710743802
20 3 91 0
isolate: ecol-SAMN15147972
recall: 1.0
specificity: 0.9680851063829787
precision: 0.8695652173913043
accuracy: 0.9736842105263158
f1_score: 0.9302325581395349
9 18 78 0
isolate: ecol-SAMN15147977
recall: 1.0
specificity: 0.8125
precision: 0.3333333333333333
accuracy: 0.8285714285714286
f1_score: 0.5
6 5 47 12
isolate: ecol-SAMN15147983
recall: 0.3333333333333333
specificity: 0.9038461538461539
precision: 0.5454545454545454
accuracy: 0.7571428571428571
f1_score: 0.41379310344827586
1 1 42 0

In [34]:
# metrics using all input sequences
# Positive class in this cell: chromosome. "Ambiguous" predictions and
# "ambiguous" alignment labels are both counted as positive.


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else np.nan


for prediction_file in prediction_files:

    # skip files with "metrics" in their path (the metric CSVs written at the
    # end of this loop are stored next to the prediction files)
    if "metrics" in prediction_file:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # Strip the directory part for both "\\" and "/" separators; splitting on
    # "\\" alone leaves the whole path in the isolate name on non-Windows paths.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]
    # the isolate name is everything before the last four "_"-separated fields
    isolate = "_".join(file_name.split("_")[0:-4])

    minimap2_df = pd.read_csv(
        "../data/alignment_files/"
        + species
        + "/"
        + isolate
        + "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    for contig_name, row in prediction_df.iterrows():
        # not all sequences in the graph file are also in the assembly fasta;
        # those contigs have no alignment label and are skipped. Checking the
        # index explicitly (instead of catching every KeyError) keeps a missing
        # "predicted_label" / "label" column from being silently ignored.
        if contig_name not in minimap2_df.index:
            continue

        # optional filters, currently disabled:
        # if minimap2_df.loc[contig_name,"short_read_contig_length"] <= 1000:
        #    continue
        # if minimap2_df.loc[contig_name,"label"] == "ambiguous":
        #    continue

        predicted_positive = row["predicted_label"] in ("Chromosome", "Ambiguous")
        actual_positive = minimap2_df.loc[contig_name, "label"] in (
            "chromosome",
            "ambiguous",
        )

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    recall = _safe_ratio(tp, tp + fn)  # TP / P = TP / (TP + FN)
    specificity = _safe_ratio(tn, tn + fp)  # TN / N = TN / (TN + FP)
    precision = _safe_ratio(tp, tp + fp)  # TP / (TP + FP)
    # T / all = (TP + TN) / (TP + TN + FP + FN); NaN when no contig was counted
    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
    # harmonic mean of precision and recall; NaN propagates from either input
    f1_score = 2 * _safe_ratio(recall * precision, recall + precision)

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    result_df = pd.DataFrame(
        [[isolate, recall, specificity, precision, accuracy, f1_score, tp, fp, fn, tn]],
        columns=[
            "isolate",
            "recall",
            "specificity",
            "precision",
            "accuracy",
            "f1_score",
            "tp",
            "fp",
            "fn",
            "tn",
        ],
    )

    # prediction_file[:-4] drops the ".csv" extension before adding the suffix
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_chromosome.csv",
        index=False,
    )


57 3 6 10
isolate: ecol-SAMN15147962
recall: 0.8507462686567164
specificity: 0.6666666666666666
precision: 0.95
accuracy: 0.8289473684210527
f1_score: 0.8976377952755905
59 8 0 0
isolate: ecol-SAMN15147963
recall: 1.0
specificity: 0.0
precision: 0.8805970149253731
accuracy: 0.8805970149253731
f1_score: 0.9365079365079364
108 15 27 22
isolate: ecol-SAMN15147970
recall: 0.8307692307692308
specificity: 0.6428571428571429
precision: 0.8780487804878049
accuracy: 0.7848837209302325
f1_score: 0.8537549407114625
94 7 11 2
isolate: ecol-SAMN15147972
recall: 0.9791666666666666
specificity: 0.6111111111111112
precision: 0.9306930693069307
accuracy: 0.9210526315789473
f1_score: 0.9543147208121827
89 5 3 8
isolate: ecol-SAMN15147977
recall: 0.9175257731958762
specificity: 0.375
precision: 0.9468085106382979
accuracy: 0.8761904761904762
f1_score: 0.9319371727748691
49 14 4 3
isolate: ecol-SAMN15147983
recall: 0.9423076923076923
specificity: 0.2222222222222222
precision: 0.7777777777777778
accuracy: 