In [1]:
import pandas as pd
import numpy as np

from glob import glob


__E. coli mlplasmids model on K. pneumoniae__

In [2]:
# Selection of the isolates evaluated by the cells below.
# Prefix shared by the prediction file names of these isolates.
isolate_prefix = "kpne-"
# Sub-directory of ../data/alignment_files/ holding the alignment labels.
species = "k_pneumoniae"

In [3]:
# Collect every CSV of the selected isolates. The list is sorted because the
# order returned by glob depends on the file system, which would otherwise
# make the order of the printed results vary between machines.
prediction_files = sorted(
    glob(
        "../results/mlplasmids_cross_species/ecoli_model_on_kpneumoniae/*"
        + isolate_prefix
        + "*.csv"
    )
)

In [4]:
# Per-isolate metrics for the "Plasmid" class, using all input sequences.
# Contigs labelled "ambiguous" in the alignment file are counted as plasmid
# (positive class); no contig-length filter is applied.

for prediction_file in prediction_files:

    # Accept both "\" and "/" as separators so the file name is found on any OS.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]

    # The glob pattern also matches the metrics files written by this notebook.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # The isolate identifier is the part of the file name before the first "_".
    isolate = file_name.split("_")[0]

    minimap2_df = pd.read_csv(
        f"../data/alignment_files/{species}/{isolate}"
        "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = fp = tn = fn = 0

    for index, row in prediction_df.iterrows():
        try:
            label = minimap2_df.loc[index, "label"]
        except KeyError:
            # Not all sequences in the graph file are also in the assembly
            # fasta; sequences without a label are left out of the counts.
            continue

        predicted_positive = row["Prediction"] == "Plasmid"
        actual_positive = label == "plasmid" or label == "ambiguous"

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Each ratio falls back to NaN when its denominator is zero, so a single
    # isolate lacking one of the classes does not abort the whole loop.
    recall = tp / (tp + fn) if tp + fn else np.nan  # TP / P = TP / (TP + FN)
    specificity = tn / (tn + fp) if tn + fp else np.nan  # TN / N = TN / (TN + FP)
    precision = tp / (tp + fp) if tp + fp else np.nan  # TP / (TP + FP)
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all
    # Harmonic mean of precision and recall; NaN when both are zero or undefined.
    f1_score = (
        2 * ((recall * precision) / (recall + precision))
        if recall + precision
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    # One-row table; the dict order fixes the column order of the CSV.
    result_df = pd.DataFrame(
        [
            {
                "isolate": isolate,
                "recall": recall,
                "specificity": specificity,
                "precision": precision,
                "accuracy": accuracy,
                "f1_score": f1_score,
                "tp": tp,
                "fp": fp,
                "fn": fn,
                "tn": tn,
            }
        ]
    )

    # Written next to the prediction file, with the ".csv" suffix replaced.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_plasmid.csv", index=False
    )

25 30 46 33
isolate: kpne-SAMN21366037
recall: 0.43103448275862066
specificity: 0.6052631578947368
precision: 0.45454545454545453
accuracy: 0.5298507462686567
f1_score: 0.44247787610619466
5 13 23 7
isolate: kpne-SAMN21366038
recall: 0.4166666666666667
specificity: 0.6388888888888888
precision: 0.2777777777777778
accuracy: 0.5833333333333334
f1_score: 0.33333333333333337
10 6 21 10
isolate: kpne-SAMN21366043
recall: 0.5
specificity: 0.7777777777777778
precision: 0.625
accuracy: 0.6595744680851063
f1_score: 0.5555555555555556
14 16 29 19
isolate: kpne-SAMN21366050
recall: 0.42424242424242425
specificity: 0.6444444444444445
precision: 0.4666666666666667
accuracy: 0.5512820512820513
f1_score: 0.4444444444444445
15 23 39 8
isolate: kpne-SAMN21366063
recall: 0.6521739130434783
specificity: 0.6290322580645161
precision: 0.39473684210526316
accuracy: 0.6352941176470588
f1_score: 0.49180327868852464
15 18 29 7
isolate: kpne-SAMN21366069
recall: 0.6818181818181818
specificity: 0.617021276595744

In [5]:
# Per-isolate metrics for the "Chromosome" class, using all input sequences.
# Contigs labelled "ambiguous" in the alignment file are counted as chromosome
# (positive class); no contig-length filter is applied.

for prediction_file in prediction_files:

    # Accept both "\" and "/" as separators so the file name is found on any OS.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]

    # The glob pattern also matches the metrics files written by this notebook.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # The isolate identifier is the part of the file name before the first "_".
    isolate = file_name.split("_")[0]

    minimap2_df = pd.read_csv(
        f"../data/alignment_files/{species}/{isolate}"
        "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = fp = tn = fn = 0

    for index, row in prediction_df.iterrows():
        try:
            label = minimap2_df.loc[index, "label"]
        except KeyError:
            # Not all sequences in the graph file are also in the assembly
            # fasta; sequences without a label are left out of the counts.
            continue

        predicted_positive = row["Prediction"] == "Chromosome"
        actual_positive = label == "chromosome" or label == "ambiguous"

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Each ratio falls back to NaN when its denominator is zero, so a single
    # isolate lacking one of the classes does not abort the whole loop.
    recall = tp / (tp + fn) if tp + fn else np.nan  # TP / P = TP / (TP + FN)
    specificity = tn / (tn + fp) if tn + fp else np.nan  # TN / N = TN / (TN + FP)
    precision = tp / (tp + fp) if tp + fp else np.nan  # TP / (TP + FP)
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all
    # Harmonic mean of precision and recall; NaN when both are zero or undefined.
    f1_score = (
        2 * ((recall * precision) / (recall + precision))
        if recall + precision
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    # One-row table; the dict order fixes the column order of the CSV.
    result_df = pd.DataFrame(
        [
            {
                "isolate": isolate,
                "recall": recall,
                "specificity": specificity,
                "precision": precision,
                "accuracy": accuracy,
                "f1_score": f1_score,
                "tp": tp,
                "fp": fp,
                "fn": fn,
                "tn": tn,
            }
        ]
    )

    # Written next to the prediction file, with the ".csv" suffix replaced.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_chromosome.csv",
        index=False,
    )


48 31 23 32
isolate: kpne-SAMN21366037
recall: 0.6
specificity: 0.42592592592592593
precision: 0.6075949367088608
accuracy: 0.5298507462686567
f1_score: 0.6037735849056604
24 6 5 13
isolate: kpne-SAMN21366038
recall: 0.6486486486486487
specificity: 0.45454545454545453
precision: 0.8
accuracy: 0.6041666666666666
f1_score: 0.7164179104477612
21 10 10 6
isolate: kpne-SAMN21366043
recall: 0.7777777777777778
specificity: 0.5
precision: 0.6774193548387096
accuracy: 0.6595744680851063
f1_score: 0.7241379310344828
30 18 13 17
isolate: kpne-SAMN21366050
recall: 0.6382978723404256
specificity: 0.41935483870967744
precision: 0.625
accuracy: 0.5512820512820513
f1_score: 0.631578947368421
39 8 15 23
isolate: kpne-SAMN21366063
recall: 0.6290322580645161
specificity: 0.6521739130434783
precision: 0.8297872340425532
accuracy: 0.6352941176470588
f1_score: 0.7155963302752293
31 5 13 20
isolate: kpne-SAMN21366069
recall: 0.6078431372549019
specificity: 0.7222222222222222
precision: 0.8611111111111112
acc

__E. faecium mlplasmids model on K. pneumoniae__

In [6]:
# Selection of the isolates evaluated by the cells below.
# Prefix shared by the prediction file names of these isolates.
isolate_prefix = "kpne-"
# Sub-directory of ../data/alignment_files/ holding the alignment labels.
species = "k_pneumoniae"

In [7]:
# Collect every CSV of the selected isolates. The list is sorted because the
# order returned by glob depends on the file system, which would otherwise
# make the order of the printed results vary between machines.
prediction_files = sorted(
    glob(
        "../results/mlplasmids_cross_species/efaecium_model_on_kpneumoniae/*"
        + isolate_prefix
        + "*.csv"
    )
)

In [8]:
# Per-isolate metrics for the "Plasmid" class, using all input sequences.
# Contigs labelled "ambiguous" in the alignment file are counted as plasmid
# (positive class); no contig-length filter is applied.

for prediction_file in prediction_files:

    # Accept both "\" and "/" as separators so the file name is found on any OS.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]

    # The glob pattern also matches the metrics files written by this notebook.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # The isolate identifier is the part of the file name before the first "_".
    isolate = file_name.split("_")[0]

    minimap2_df = pd.read_csv(
        f"../data/alignment_files/{species}/{isolate}"
        "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = fp = tn = fn = 0

    for index, row in prediction_df.iterrows():
        try:
            label = minimap2_df.loc[index, "label"]
        except KeyError:
            # Not all sequences in the graph file are also in the assembly
            # fasta; sequences without a label are left out of the counts.
            continue

        predicted_positive = row["Prediction"] == "Plasmid"
        actual_positive = label == "plasmid" or label == "ambiguous"

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Each ratio falls back to NaN when its denominator is zero, so a single
    # isolate lacking one of the classes does not abort the whole loop.
    recall = tp / (tp + fn) if tp + fn else np.nan  # TP / P = TP / (TP + FN)
    specificity = tn / (tn + fp) if tn + fp else np.nan  # TN / N = TN / (TN + FP)
    precision = tp / (tp + fp) if tp + fp else np.nan  # TP / (TP + FP)
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all
    # Harmonic mean of precision and recall; NaN when both are zero or undefined.
    f1_score = (
        2 * ((recall * precision) / (recall + precision))
        if recall + precision
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    # One-row table; the dict order fixes the column order of the CSV.
    result_df = pd.DataFrame(
        [
            {
                "isolate": isolate,
                "recall": recall,
                "specificity": specificity,
                "precision": precision,
                "accuracy": accuracy,
                "f1_score": f1_score,
                "tp": tp,
                "fp": fp,
                "fn": fn,
                "tn": tn,
            }
        ]
    )

    # Written next to the prediction file, with the ".csv" suffix replaced.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_plasmid.csv", index=False
    )


20 34 42 38
isolate: kpne-SAMN21366037
recall: 0.3448275862068966
specificity: 0.5526315789473685
precision: 0.37037037037037035
accuracy: 0.4626865671641791
f1_score: 0.35714285714285715
5 18 18 7
isolate: kpne-SAMN21366038
recall: 0.4166666666666667
specificity: 0.5
precision: 0.21739130434782608
accuracy: 0.4791666666666667
f1_score: 0.2857142857142857
11 16 11 9
isolate: kpne-SAMN21366043
recall: 0.55
specificity: 0.4074074074074074
precision: 0.4074074074074074
accuracy: 0.46808510638297873
f1_score: 0.46808510638297873
18 22 23 15
isolate: kpne-SAMN21366050
recall: 0.5454545454545454
specificity: 0.5111111111111111
precision: 0.45
accuracy: 0.5256410256410257
f1_score: 0.4931506849315069
6 32 30 17
isolate: kpne-SAMN21366063
recall: 0.2608695652173913
specificity: 0.4838709677419355
precision: 0.15789473684210525
accuracy: 0.4235294117647059
f1_score: 0.19672131147540983
11 21 26 11
isolate: kpne-SAMN21366069
recall: 0.5
specificity: 0.5531914893617021
precision: 0.34375
accuracy

In [9]:
# Per-isolate metrics for the "Chromosome" class, using all input sequences.
# Contigs labelled "ambiguous" in the alignment file are counted as chromosome
# (positive class); no contig-length filter is applied.

for prediction_file in prediction_files:

    # Accept both "\" and "/" as separators so the file name is found on any OS.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]

    # The glob pattern also matches the metrics files written by this notebook.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # The isolate identifier is the part of the file name before the first "_".
    isolate = file_name.split("_")[0]

    minimap2_df = pd.read_csv(
        f"../data/alignment_files/{species}/{isolate}"
        "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = fp = tn = fn = 0

    for index, row in prediction_df.iterrows():
        try:
            label = minimap2_df.loc[index, "label"]
        except KeyError:
            # Not all sequences in the graph file are also in the assembly
            # fasta; sequences without a label are left out of the counts.
            continue

        predicted_positive = row["Prediction"] == "Chromosome"
        actual_positive = label == "chromosome" or label == "ambiguous"

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Each ratio falls back to NaN when its denominator is zero, so a single
    # isolate lacking one of the classes does not abort the whole loop.
    recall = tp / (tp + fn) if tp + fn else np.nan  # TP / P = TP / (TP + FN)
    specificity = tn / (tn + fp) if tn + fp else np.nan  # TN / N = TN / (TN + FP)
    precision = tp / (tp + fp) if tp + fp else np.nan  # TP / (TP + FP)
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all
    # Harmonic mean of precision and recall; NaN when both are zero or undefined.
    f1_score = (
        2 * ((recall * precision) / (recall + precision))
        if recall + precision
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    # One-row table; the dict order fixes the column order of the CSV.
    result_df = pd.DataFrame(
        [
            {
                "isolate": isolate,
                "recall": recall,
                "specificity": specificity,
                "precision": precision,
                "accuracy": accuracy,
                "f1_score": f1_score,
                "tp": tp,
                "fp": fp,
                "fn": fn,
                "tn": tn,
            }
        ]
    )

    # Written next to the prediction file, with the ".csv" suffix replaced.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_chromosome.csv",
        index=False,
    )


44 36 18 36
isolate: kpne-SAMN21366037
recall: 0.55
specificity: 0.3333333333333333
precision: 0.55
accuracy: 0.4626865671641791
f1_score: 0.55
19 6 5 18
isolate: kpne-SAMN21366038
recall: 0.5135135135135135
specificity: 0.45454545454545453
precision: 0.76
accuracy: 0.5
f1_score: 0.6129032258064517
11 9 11 16
isolate: kpne-SAMN21366043
recall: 0.4074074074074074
specificity: 0.55
precision: 0.55
accuracy: 0.46808510638297873
f1_score: 0.46808510638297873
24 14 17 23
isolate: kpne-SAMN21366050
recall: 0.5106382978723404
specificity: 0.5483870967741935
precision: 0.631578947368421
accuracy: 0.5256410256410257
f1_score: 0.5647058823529411
30 17 6 32
isolate: kpne-SAMN21366063
recall: 0.4838709677419355
specificity: 0.2608695652173913
precision: 0.6382978723404256
accuracy: 0.4235294117647059
f1_score: 0.5504587155963303
28 9 9 23
isolate: kpne-SAMN21366069
recall: 0.5490196078431373
specificity: 0.5
precision: 0.7567567567567568
accuracy: 0.5362318840579711
f1_score: 0.6363636363636364
26

__E. faecium mlplasmids on E. coli__

In [10]:
# Selection of the isolates evaluated by the cells below.
# Prefix shared by the prediction file names of these isolates.
isolate_prefix = "ecol-"
# Sub-directory of ../data/alignment_files/ holding the alignment labels.
species = "e_coli_test_set"

In [11]:
# Collect every CSV of the selected isolates. The list is sorted because the
# order returned by glob depends on the file system, which would otherwise
# make the order of the printed results vary between machines.
prediction_files = sorted(
    glob(
        "../results/mlplasmids_cross_species/efaecium_model_on_ecoli/*"
        + isolate_prefix
        + "*.csv"
    )
)

In [12]:
# Per-isolate metrics for the "Plasmid" class, using all input sequences.
# Contigs labelled "ambiguous" in the alignment file are counted as plasmid
# (positive class); no contig-length filter is applied.

for prediction_file in prediction_files:

    # Accept both "\" and "/" as separators so the file name is found on any OS.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]

    # The glob pattern also matches the metrics files written by this notebook.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # The isolate identifier is the part of the file name before the first "_".
    isolate = file_name.split("_")[0]

    minimap2_df = pd.read_csv(
        f"../data/alignment_files/{species}/{isolate}"
        "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = fp = tn = fn = 0

    for index, row in prediction_df.iterrows():
        try:
            label = minimap2_df.loc[index, "label"]
        except KeyError:
            # Not all sequences in the graph file are also in the assembly
            # fasta; sequences without a label are left out of the counts.
            continue

        predicted_positive = row["Prediction"] == "Plasmid"
        actual_positive = label == "plasmid" or label == "ambiguous"

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Each ratio falls back to NaN when its denominator is zero, so a single
    # isolate lacking one of the classes does not abort the whole loop.
    recall = tp / (tp + fn) if tp + fn else np.nan  # TP / P = TP / (TP + FN)
    specificity = tn / (tn + fp) if tn + fp else np.nan  # TN / N = TN / (TN + FP)
    precision = tp / (tp + fp) if tp + fp else np.nan  # TP / (TP + FP)
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all
    # Harmonic mean of precision and recall; NaN when both are zero or undefined.
    f1_score = (
        2 * ((recall * precision) / (recall + precision))
        if recall + precision
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    # One-row table; the dict order fixes the column order of the CSV.
    result_df = pd.DataFrame(
        [
            {
                "isolate": isolate,
                "recall": recall,
                "specificity": specificity,
                "precision": precision,
                "accuracy": accuracy,
                "f1_score": f1_score,
                "tp": tp,
                "fp": fp,
                "fn": fn,
                "tn": tn,
            }
        ]
    )

    # Written next to the prediction file, with the ".csv" suffix replaced.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_plasmid.csv", index=False
    )


0 21 47 9
isolate: ecol-SAMN15147962
recall: 0.0
specificity: 0.6911764705882353
precision: 0.0
accuracy: 0.6103896103896104
f1_score: nan
0 19 41 8
isolate: ecol-SAMN15147963
recall: 0.0
specificity: 0.6833333333333333
precision: 0.0
accuracy: 0.6029411764705882
f1_score: nan
21 48 74 29
isolate: ecol-SAMN15147970
recall: 0.42
specificity: 0.6065573770491803
precision: 0.30434782608695654
accuracy: 0.5523255813953488
f1_score: 0.3529411764705883
9 41 53 11
isolate: ecol-SAMN15147972
recall: 0.45
specificity: 0.5638297872340425
precision: 0.18
accuracy: 0.543859649122807
f1_score: 0.2571428571428572
5 35 61 4
isolate: ecol-SAMN15147977
recall: 0.5555555555555556
specificity: 0.6354166666666666
precision: 0.125
accuracy: 0.6285714285714286
f1_score: 0.20408163265306123
2 21 31 16
isolate: ecol-SAMN15147983
recall: 0.1111111111111111
specificity: 0.5961538461538461
precision: 0.08695652173913043
accuracy: 0.4714285714285714
f1_score: 0.0975609756097561
0 17 26 1
isolate: ecol-SAMN1514799

In [13]:
# Per-isolate metrics for the "Chromosome" class, using all input sequences.
# Contigs labelled "ambiguous" in the alignment file are counted as chromosome
# (positive class); no contig-length filter is applied.

for prediction_file in prediction_files:

    # Accept both "\" and "/" as separators so the file name is found on any OS.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]

    # The glob pattern also matches the metrics files written by this notebook.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # The isolate identifier is the part of the file name before the first "_".
    isolate = file_name.split("_")[0]

    minimap2_df = pd.read_csv(
        f"../data/alignment_files/{species}/{isolate}"
        "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = fp = tn = fn = 0

    for index, row in prediction_df.iterrows():
        try:
            label = minimap2_df.loc[index, "label"]
        except KeyError:
            # Not all sequences in the graph file are also in the assembly
            # fasta; sequences without a label are left out of the counts.
            continue

        predicted_positive = row["Prediction"] == "Chromosome"
        actual_positive = label == "chromosome" or label == "ambiguous"

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Each ratio falls back to NaN when its denominator is zero, so a single
    # isolate lacking one of the classes does not abort the whole loop.
    recall = tp / (tp + fn) if tp + fn else np.nan  # TP / P = TP / (TP + FN)
    specificity = tn / (tn + fp) if tn + fp else np.nan  # TN / N = TN / (TN + FP)
    precision = tp / (tp + fp) if tp + fp else np.nan  # TP / (TP + FP)
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all
    # Harmonic mean of precision and recall; NaN when both are zero or undefined.
    f1_score = (
        2 * ((recall * precision) / (recall + precision))
        if recall + precision
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    # One-row table; the dict order fixes the column order of the CSV.
    result_df = pd.DataFrame(
        [
            {
                "isolate": isolate,
                "recall": recall,
                "specificity": specificity,
                "precision": precision,
                "accuracy": accuracy,
                "f1_score": f1_score,
                "tp": tp,
                "fp": fp,
                "fn": fn,
                "tn": tn,
            }
        ]
    )

    # Written next to the prediction file, with the ".csv" suffix replaced.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_chromosome.csv",
        index=False,
    )


47 9 0 21
isolate: ecol-SAMN15147962
recall: 0.6911764705882353
specificity: 0.0
precision: 0.8392857142857143
accuracy: 0.6103896103896104
f1_score: 0.7580645161290324
41 8 0 19
isolate: ecol-SAMN15147963
recall: 0.6833333333333333
specificity: 0.0
precision: 0.8367346938775511
accuracy: 0.6029411764705882
f1_score: 0.7522935779816514
80 23 19 50
isolate: ecol-SAMN15147970
recall: 0.6153846153846154
specificity: 0.4523809523809524
precision: 0.7766990291262136
accuracy: 0.5755813953488372
f1_score: 0.6866952789699571
55 9 9 41
isolate: ecol-SAMN15147972
recall: 0.5729166666666666
specificity: 0.5
precision: 0.859375
accuracy: 0.5614035087719298
f1_score: 0.6875
61 4 4 36
isolate: ecol-SAMN15147977
recall: 0.6288659793814433
specificity: 0.5
precision: 0.9384615384615385
accuracy: 0.6190476190476191
f1_score: 0.7530864197530864
33 14 4 19
isolate: ecol-SAMN15147983
recall: 0.6346153846153846
specificity: 0.2222222222222222
precision: 0.7021276595744681
accuracy: 0.5285714285714286
f1_s

__K. pneumoniae mlplasmids on E. coli__

In [14]:
# Selection of the isolates evaluated by the cells below.
# Prefix shared by the prediction file names of these isolates.
isolate_prefix = "ecol-"
# Sub-directory of ../data/alignment_files/ holding the alignment labels.
species = "e_coli_test_set"

In [15]:
# Collect every CSV of the selected isolates. The list is sorted because the
# order returned by glob depends on the file system, which would otherwise
# make the order of the printed results vary between machines.
prediction_files = sorted(
    glob(
        "../results/mlplasmids_cross_species/kpneumoniae_model_on_ecoli/*"
        + isolate_prefix
        + "*.csv"
    )
)

In [16]:
# Per-isolate metrics for the "Plasmid" class, using all input sequences.
# Contigs labelled "ambiguous" in the alignment file are counted as plasmid
# (positive class); no contig-length filter is applied.

for prediction_file in prediction_files:

    # Accept both "\" and "/" as separators so the file name is found on any OS.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]

    # The glob pattern also matches the metrics files written by this notebook.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # The isolate identifier is the part of the file name before the first "_".
    isolate = file_name.split("_")[0]

    minimap2_df = pd.read_csv(
        f"../data/alignment_files/{species}/{isolate}"
        "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = fp = tn = fn = 0

    for index, row in prediction_df.iterrows():
        try:
            label = minimap2_df.loc[index, "label"]
        except KeyError:
            # Not all sequences in the graph file are also in the assembly
            # fasta; sequences without a label are left out of the counts.
            continue

        predicted_positive = row["Prediction"] == "Plasmid"
        actual_positive = label == "plasmid" or label == "ambiguous"

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Each ratio falls back to NaN when its denominator is zero, so a single
    # isolate lacking one of the classes does not abort the whole loop.
    recall = tp / (tp + fn) if tp + fn else np.nan  # TP / P = TP / (TP + FN)
    specificity = tn / (tn + fp) if tn + fp else np.nan  # TN / N = TN / (TN + FP)
    precision = tp / (tp + fp) if tp + fp else np.nan  # TP / (TP + FP)
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all
    # Harmonic mean of precision and recall; NaN when both are zero or undefined.
    f1_score = (
        2 * ((recall * precision) / (recall + precision))
        if recall + precision
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    # One-row table; the dict order fixes the column order of the CSV.
    result_df = pd.DataFrame(
        [
            {
                "isolate": isolate,
                "recall": recall,
                "specificity": specificity,
                "precision": precision,
                "accuracy": accuracy,
                "f1_score": f1_score,
                "tp": tp,
                "fp": fp,
                "fn": fn,
                "tn": tn,
            }
        ]
    )

    # Written next to the prediction file, with the ".csv" suffix replaced.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_plasmid.csv", index=False
    )


8 17 51 1
isolate: ecol-SAMN15147962
recall: 0.8888888888888888
specificity: 0.75
precision: 0.32
accuracy: 0.7662337662337663
f1_score: 0.47058823529411764
8 11 49 0
isolate: ecol-SAMN15147963
recall: 1.0
specificity: 0.8166666666666667
precision: 0.42105263157894735
accuracy: 0.8382352941176471
f1_score: 0.5925925925925926
49 46 76 1
isolate: ecol-SAMN15147970
recall: 0.98
specificity: 0.6229508196721312
precision: 0.5157894736842106
accuracy: 0.7267441860465116
f1_score: 0.6758620689655174
18 24 70 2
isolate: ecol-SAMN15147972
recall: 0.9
specificity: 0.7446808510638298
precision: 0.42857142857142855
accuracy: 0.7719298245614035
f1_score: 0.5806451612903225
9 23 73 0
isolate: ecol-SAMN15147977
recall: 1.0
specificity: 0.7604166666666666
precision: 0.28125
accuracy: 0.780952380952381
f1_score: 0.43902439024390244
17 12 40 1
isolate: ecol-SAMN15147983
recall: 0.9444444444444444
specificity: 0.7692307692307693
precision: 0.5862068965517241
accuracy: 0.8142857142857143
f1_score: 0.72340

In [17]:
# Per-isolate metrics for the "Chromosome" class, using all input sequences.
# Contigs labelled "ambiguous" in the alignment file are counted as chromosome
# (positive class); no contig-length filter is applied.

for prediction_file in prediction_files:

    # Accept both "\" and "/" as separators so the file name is found on any OS.
    file_name = prediction_file.replace("\\", "/").split("/")[-1]

    # The glob pattern also matches the metrics files written by this notebook.
    if "metrics" in file_name:
        continue

    prediction_df = pd.read_csv(prediction_file, index_col=0)

    # The isolate identifier is the part of the file name before the first "_".
    isolate = file_name.split("_")[0]

    minimap2_df = pd.read_csv(
        f"../data/alignment_files/{species}/{isolate}"
        "_alignment_labelled_ambiguity_cutoff_1.csv",
        index_col=0,
    )

    tp = fp = tn = fn = 0

    for index, row in prediction_df.iterrows():
        try:
            label = minimap2_df.loc[index, "label"]
        except KeyError:
            # Not all sequences in the graph file are also in the assembly
            # fasta; sequences without a label are left out of the counts.
            continue

        predicted_positive = row["Prediction"] == "Chromosome"
        actual_positive = label == "chromosome" or label == "ambiguous"

        if predicted_positive and actual_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        else:
            tn += 1

    print(tp, fp, tn, fn)

    # Each ratio falls back to NaN when its denominator is zero, so a single
    # isolate lacking one of the classes does not abort the whole loop.
    recall = tp / (tp + fn) if tp + fn else np.nan  # TP / P = TP / (TP + FN)
    specificity = tn / (tn + fp) if tn + fp else np.nan  # TN / N = TN / (TN + FP)
    precision = tp / (tp + fp) if tp + fp else np.nan  # TP / (TP + FP)
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else np.nan  # (TP + TN) / all
    # Harmonic mean of precision and recall; NaN when both are zero or undefined.
    f1_score = (
        2 * ((recall * precision) / (recall + precision))
        if recall + precision
        else np.nan
    )

    print("isolate:", isolate)
    print("recall:", recall)
    print("specificity:", specificity)
    print("precision:", precision)
    print("accuracy:", accuracy)
    print("f1_score:", f1_score)

    # One-row table; the dict order fixes the column order of the CSV.
    result_df = pd.DataFrame(
        [
            {
                "isolate": isolate,
                "recall": recall,
                "specificity": specificity,
                "precision": precision,
                "accuracy": accuracy,
                "f1_score": f1_score,
                "tp": tp,
                "fp": fp,
                "fn": fn,
                "tn": tn,
            }
        ]
    )

    # Written next to the prediction file, with the ".csv" suffix replaced.
    result_df.to_csv(
        prediction_file[:-4] + "_metrics_all_ambiguous_equal_chromosome.csv",
        index=False,
    )


51 1 8 17
isolate: ecol-SAMN15147962
recall: 0.75
specificity: 0.8888888888888888
precision: 0.9807692307692307
accuracy: 0.7662337662337663
f1_score: 0.8499999999999999
49 0 8 11
isolate: ecol-SAMN15147963
recall: 0.8166666666666667
specificity: 1.0
precision: 1.0
accuracy: 0.8382352941176471
f1_score: 0.8990825688073394
76 1 41 54
isolate: ecol-SAMN15147970
recall: 0.5846153846153846
specificity: 0.9761904761904762
precision: 0.987012987012987
accuracy: 0.6802325581395349
f1_score: 0.7342995169082125
70 2 16 26
isolate: ecol-SAMN15147972
recall: 0.7291666666666666
specificity: 0.8888888888888888
precision: 0.9722222222222222
accuracy: 0.7543859649122807
f1_score: 0.8333333333333333
73 0 8 24
isolate: ecol-SAMN15147977
recall: 0.7525773195876289
specificity: 1.0
precision: 1.0
accuracy: 0.7714285714285715
f1_score: 0.8588235294117647
40 1 17 12
isolate: ecol-SAMN15147983
recall: 0.7692307692307693
specificity: 0.9444444444444444
precision: 0.975609756097561
accuracy: 0.814285714285714