In [1]:
import pandas as pd
import numpy as np

In [2]:
import allel # ref - https://scikit-allel.readthedocs.io/en/stable/io.html#variant-call-format-vcf

In [3]:
# Input locations: one bgzipped VCF per variant caller, all under the same folder.
vcf_folder = "../dataset/"
(
    vcf_file_freebayes,
    vcf_file_mutect,
    vcf_file_vardict,
    vcf_file_varscan,
) = (
    "syn2-freebayes.vcf.gz",
    "syn2-mutect2.vcf.gz",
    "syn2-vardict.vcf.gz",
    "syn2-varscan.vcf.gz",
)

In [4]:
# Parse the freebayes VCF into a DataFrame; no `fields` argument is passed, so
# scikit-allel's default column selection is used.
df_freebayes = allel.vcf_to_dataframe(f"{vcf_folder}{vcf_file_freebayes}")

In [5]:
# Preview the first rows of the freebayes calls.
df_freebayes.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,10177,rs367896724,A,AC,,,317.100006,False
1,1,10352,rs555500075;rs145072688,T,TA,,,436.5,False
2,1,10583,rs58108140,G,A,,,99.0,False
3,1,10622,.,T,G,,,26.200001,False
4,1,10649,.,G,A,,,19.1,False


In [6]:
# Parse the Mutect2 VCF. In the preview below QUAL is empty for every row shown.
df_mutect = allel.vcf_to_dataframe(f"{vcf_folder}{vcf_file_mutect}")
df_mutect.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,137172,.,T,G,,,,True
1,1,237752,rs374471060,A,G,,,,False
2,1,237762,.,G,A,,,,False
3,1,355088,.,TCTCCCCTG,T,,,,False
4,1,355100,.,A,C,,,,False


In [7]:
# Parse the VarDict VCF and preview it.
df_vardict = allel.vcf_to_dataframe(f"{vcf_folder}{vcf_file_vardict}")
df_vardict.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,10177,rs367896724,A,AC,,,159.0,False
1,1,10231,rs200279319,C,A,,,114.0,False
2,1,10329,rs150969722,AC,A,,,146.0,False
3,1,10519,rs62636508,G,C,,,60.0,False
4,1,10583,rs58108140,G,A,,,92.0,False


In [8]:
# Parse the VarScan VCF. In the preview below QUAL is 0.0 for every row shown.
df_varscan = allel.vcf_to_dataframe(f"{vcf_folder}{vcf_file_varscan}")
df_varscan.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,10177,rs367896724,A,AC,,,0.0,False
1,1,10230,rs775928745,AC,A,,,0.0,False
2,1,10231,rs200279319,C,A,,,0.0,False
3,1,10234,rs145599635,C,T,,,0.0,False
4,1,10247,.,TA,T,,,0.0,False


In [9]:
# Row/column counts of each caller's table, one per line.
print(
    f"Real freebayes shape = {df_freebayes.shape}",
    f"Real Mutect shape = {df_mutect.shape}",
    f"Real Vardict shape = {df_vardict.shape}",
    f"Real Varscan shape = {df_varscan.shape}",
    sep="\n",
)

Real freebayes shape = (4986115, 9)
Real Mutect shape = (114403, 9)
Real Vardict shape = (5002549, 9)
Real Varscan shape = (4753923, 9)


In [10]:
# Outer-join freebayes and Mutect2 calls on genomic position. Every non-key
# column exists in both frames, so each gets a caller-specific suffix.
m1 = df_freebayes.merge(
    df_mutect,
    how="outer",
    on=["CHROM", "POS"],
    suffixes=("_freebayes", "_mutect"),
)

In [11]:
# Size of the freebayes + Mutect2 outer join.
m1.shape

(5088347, 16)

In [12]:
# Add the VarDict calls. m1's non-key columns already carry a caller suffix, so
# VarDict's columns arrive under their bare names and are tagged afterwards.
m2 = m1.merge(df_vardict, how="outer", on=["CHROM", "POS"])
m2 = m2.rename(
    columns={
        "ID": "ID_vardict",
        "REF": "REF_vardict",
        "ALT_1": "ALT_1_vardict",
        "ALT_2": "ALT_2_vardict",
        "ALT_3": "ALT_3_vardict",
        "QUAL": "QUAL_vardict",
        "FILTER_PASS": "FILTER_PASS_vardict",
    }
)

In [13]:
# Preview the three-caller join (freebayes, Mutect2, VarDict).
m2.head()

Unnamed: 0,CHROM,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,FILTER_PASS_freebayes,ID_mutect,...,ALT_3_mutect,QUAL_mutect,FILTER_PASS_mutect,ID_vardict,REF_vardict,ALT_1_vardict,ALT_2_vardict,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict
0,1,10177,rs367896724,A,AC,,,317.100006,False,,...,,,,rs367896724,A,AC,,,159.0,False
1,1,10352,rs555500075;rs145072688,T,TA,,,436.5,False,,...,,,,,,,,,,
2,1,10583,rs58108140,G,A,,,99.0,False,,...,,,,rs58108140,G,A,,,92.0,False
3,1,10622,.,T,G,,,26.200001,False,,...,,,,,,,,,,
4,1,10649,.,G,A,,,19.1,False,,...,,,,,,,,,,


In [14]:
# Size of the three-caller outer join.
m2.shape

(5480261, 23)

In [15]:
# Add the VarScan calls the same way: join on position, then tag the bare
# VarScan column names with a caller suffix.
m3 = m2.merge(df_varscan, how="outer", on=["CHROM", "POS"])
m3 = m3.rename(
    columns={
        "ID": "ID_varscan",
        "REF": "REF_varscan",
        "ALT_1": "ALT_1_varscan",
        "ALT_2": "ALT_2_varscan",
        "ALT_3": "ALT_3_varscan",
        "QUAL": "QUAL_varscan",
        "FILTER_PASS": "FILTER_PASS_varscan",
    }
)
m3.shape

(5783210, 30)

In [16]:
# Preview the four-caller join.
m3.head()

Unnamed: 0,CHROM,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,FILTER_PASS_freebayes,ID_mutect,...,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan
0,1,10177,rs367896724,A,AC,,,317.100006,False,,...,,159.0,False,rs367896724,A,AC,,,0.0,False
1,1,10352,rs555500075;rs145072688,T,TA,,,436.5,False,,...,,,,rs555500075;rs145072688,T,TA,,,0.0,False
2,1,10583,rs58108140,G,A,,,99.0,False,,...,,92.0,False,,,,,,,
3,1,10622,.,T,G,,,26.200001,False,,...,,,,,,,,,,
4,1,10649,.,G,A,,,19.1,False,,...,,,,,,,,,,


In [17]:
# Show only the position keys and the per-caller PASS flags. Flags are missing
# wherever a caller did not report that position.
m3.head().loc[
    :,
    [
        "CHROM",
        "POS",
        "FILTER_PASS_freebayes",
        "FILTER_PASS_mutect",
        "FILTER_PASS_vardict",
        "FILTER_PASS_varscan",
    ],
]

Unnamed: 0,CHROM,POS,FILTER_PASS_freebayes,FILTER_PASS_mutect,FILTER_PASS_vardict,FILTER_PASS_varscan
0,1,10177,False,,False,False
1,1,10352,False,,,False
2,1,10583,False,,False,
3,1,10622,False,,,
4,1,10649,False,,,


In [18]:
# Plain assignment: df_merged is a second name for the same object as m3 (no copy
# is made), so in-place edits to df_merged are also visible through m3.
df_merged = m3

In [19]:
# After the outer merges, a caller that never reported a position has no
# FILTER_PASS value there; treat "not reported" as "did not pass".
# TODO: add a third category instead of False?
#
# The filled columns are assigned back to the frame. Calling
# `df_merged[col].fillna(False, inplace=True)` operates on a column selection:
# pandas 2.x warns about it as chained assignment, and under Copy-on-Write it
# leaves df_merged unchanged. `.astype(bool)` pins a real boolean dtype
# regardless of how the pandas version downcasts object columns.
filter_pass_cols = [
    "FILTER_PASS_freebayes",
    "FILTER_PASS_vardict",
    "FILTER_PASS_mutect",
    "FILTER_PASS_varscan",
]
df_merged[filter_pass_cols] = df_merged[filter_pass_cols].fillna(False).astype(bool)
df_merged.head()[["CHROM", "POS", "FILTER_PASS_freebayes", "FILTER_PASS_mutect", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]]

Unnamed: 0,CHROM,POS,FILTER_PASS_freebayes,FILTER_PASS_mutect,FILTER_PASS_vardict,FILTER_PASS_varscan
0,1,10177,False,False,False,False
1,1,10352,False,False,False,False
2,1,10583,False,False,False,False
3,1,10622,False,False,False,False
4,1,10649,False,False,False,False


In [20]:
# Load the truth set: a headerless, tab-separated file with three columns.
# CHROM is read explicitly as a string. Left to type inference, a file holding
# only numeric chromosome names would be parsed as int64 and could not be merged
# with the string CHROM column of the caller tables.
truth = pd.read_csv(
    vcf_folder + "syn2_truth.bed",
    sep="\t",
    header=None,
    names=["CHROM", "POS_START", "POS_END"],
    dtype={"CHROM": str},
)
truth.head()

Unnamed: 0,CHROM,POS_START,POS_END
0,1,1787808,1787808
1,1,1819379,1819379
2,1,2115269,2115269
3,1,2422541,2422541
4,1,2494323,2494323


In [21]:
# Count truth rows whose start and end coincide. This equals the row count of
# `truth`, i.e. every truth interval is a single position.
truth["POS_START"].eq(truth["POS_END"]).sum()

4332

In [22]:
# Number of truth variants (rows) and columns.
truth.shape

(4332, 3)

In [23]:
# Inner join of the truth positions with the merged caller table.
# The result has fewer rows than the truth file, which indicates that some true
# variants were not reported by any of the four variant callers.
pd.merge(
    truth,
    df_merged,
    left_on=["CHROM", "POS_START"],
    right_on=["CHROM", "POS"],
)

Unnamed: 0,CHROM,POS_START,POS_END,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,...,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan
0,1,1787808,1787808,1787808,.,C,T,,,339.399994,...,,143.0,True,.,C,T,,,0.0,True
1,1,1819379,1819379,1819379,rs968266291,A,C,,,1611.099976,...,,,False,rs968266291,A,C,,,0.0,True
2,1,2115269,2115269,2115269,.,A,T,,,210.100006,...,,118.0,True,.,A,T,,,0.0,True
3,1,2422541,2422541,2422541,.,G,C,,,404.100006,...,,148.0,True,.,G,C,,,0.0,True
4,1,2494323,2494323,2494323,rs556710728,C,T,,,220.600006,...,,115.0,True,rs556710728,C,T,,,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4309,X,153631641,153631641,153631641,.,A,T,,,338.399994,...,,141.0,True,.,A,T,,,0.0,True
4310,X,154104082,154104082,154104082,.,A,T,,,68.300003,...,,87.0,True,.,A,T,,,0.0,True
4311,X,154417237,154417237,154417237,.,A,C,,,242.100006,...,,124.0,True,.,A,C,,,0.0,False
4312,X,154795622,154795622,154795622,.,G,C,,,307.200012,...,,139.0,True,.,G,C,,,0.0,True


In [24]:
# Outer join so that both caller-only positions and truth-only positions are
# kept. POS_START/POS_END are missing where a position is not in the truth set,
# and POS is missing where no caller reported a truth position.
df_merged_with_truth = truth.merge(
    df_merged,
    how="outer",
    left_on=["CHROM", "POS_START"],
    right_on=["CHROM", "POS"],
)
df_merged_with_truth.head()

Unnamed: 0,CHROM,POS_START,POS_END,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,...,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan
0,1,1787808.0,1787808.0,1787808.0,.,C,T,,,339.399994,...,,143.0,True,.,C,T,,,0.0,True
1,1,1819379.0,1819379.0,1819379.0,rs968266291,A,C,,,1611.099976,...,,,False,rs968266291,A,C,,,0.0,True
2,1,2115269.0,2115269.0,2115269.0,.,A,T,,,210.100006,...,,118.0,True,.,A,T,,,0.0,True
3,1,2422541.0,2422541.0,2422541.0,.,G,C,,,404.100006,...,,148.0,True,.,G,C,,,0.0,True
4,1,2494323.0,2494323.0,2494323.0,rs556710728,C,T,,,220.600006,...,,115.0,True,rs556710728,C,T,,,0.0,True


In [25]:
# Size of the truth + callers outer join.
df_merged_with_truth.shape

(5783228, 32)

In [26]:
def set_true_label(row):
    """Return True when `row` holds a value and False when it is missing.

    Despite the parameter name, `row` is a single scalar (the notebook applies
    this to the POS_START column, which is only populated for truth variants).

    Missing values are detected with `pd.isna`, so NaN, None and pd.NA are all
    treated as "not in the truth set". The previous `str(row) == "nan"` check
    only recognised float NaN.
    """
    return not pd.isna(row)

In [27]:
# A position is a true variant exactly when the truth join populated POS_START.
# `.notna()` gives the same boolean column as applying set_true_label to each
# value, without a Python-level call for every one of the millions of rows.
df_merged_with_truth["truth"] = df_merged_with_truth["POS_START"].notna()

In [28]:
# Index labels of rows with no POS, i.e. truth positions that none of the four
# callers reported.
extras_in_truth = df_merged_with_truth.loc[df_merged_with_truth["POS"].isna()].index
extras_in_truth

Int64Index([ 169,  213,  398,  482, 1007, 1009, 1215, 1951, 2283, 2397, 3029,
            3240, 3543, 3980, 3981, 3984, 4205, 4206],
           dtype='int64')

In [29]:
# Remove the truth-only rows in place.
# NOTE(review): these rows are true variants that every caller missed; dropping
# them removes those misses from all later confusion matrices and reports.
# Re-running this cell without rebuilding the frame raises KeyError, because the
# labels are already gone.
df_merged_with_truth.drop(index=extras_in_truth, inplace=True)

In [30]:
# Missing-value count per column after dropping the truth-only rows.
df_merged_with_truth.isna().sum()

CHROM                          0
POS_START                5778896
POS_END                  5778896
POS                            0
ID_freebayes              797095
REF_freebayes             797095
ALT_1_freebayes           797095
ALT_2_freebayes          5706825
ALT_3_freebayes          5778167
QUAL_freebayes            797101
FILTER_PASS_freebayes          0
ID_mutect                5668742
REF_mutect               5668742
ALT_1_mutect             5668742
ALT_2_mutect             5783210
ALT_3_mutect             5783210
QUAL_mutect              5783210
FILTER_PASS_mutect             0
ID_vardict                761313
REF_vardict               761313
ALT_1_vardict             761313
ALT_2_vardict            5783210
ALT_3_vardict            5783210
QUAL_vardict              761417
FILTER_PASS_vardict            0
ID_varscan               1012561
REF_varscan              1012561
ALT_1_varscan            1012561
ALT_2_varscan            5783210
ALT_3_varscan            5783210
QUAL_varsc

In [31]:
# Mean-impute QUAL for freebayes and VarDict at positions they did not report.
# The result is assigned back to the frame. Calling fillna(..., inplace=True) on
# `df[col]` is a chained in-place update: pandas 2.x warns about it, and under
# Copy-on-Write it leaves the frame unchanged.
df_merged_with_truth["QUAL_freebayes"] = df_merged_with_truth["QUAL_freebayes"].fillna(
    df_merged_with_truth["QUAL_freebayes"].mean()
)
df_merged_with_truth["QUAL_vardict"] = df_merged_with_truth["QUAL_vardict"].fillna(
    df_merged_with_truth["QUAL_vardict"].mean()
)

In [32]:
# Count missing Mutect2 QUAL values; the result equals the row count of the
# frame, i.e. the column is entirely empty.
df_merged_with_truth["QUAL_mutect"].isna().sum()

5783210

In [33]:
# Correlation between the freebayes and VarDict QUAL columns. This runs after
# the mean imputation above, so imputed values are included in the coefficient.
df_merged_with_truth[["QUAL_freebayes", "QUAL_vardict"]].corr()

Unnamed: 0,QUAL_freebayes,QUAL_vardict
QUAL_freebayes,1.0,0.457193
QUAL_vardict,0.457193,1.0


#### Confusion matrix comparing against truth

In [34]:
from sklearn.metrics import confusion_matrix

In [35]:
# freebayes PASS flag versus truth, unpacked as (tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(
    df_merged_with_truth["truth"].tolist(),
    df_merged_with_truth["FILTER_PASS_freebayes"].tolist(),
).ravel()
(tn, fp, fn, tp)

(5768656, 10240, 112, 4202)

In [36]:
# Mutect2 PASS flag versus truth, unpacked as (tn, fp, fn, tp).
# Mutect2 has the fewest false positives of the four callers.
tn, fp, fn, tp = confusion_matrix(
    df_merged_with_truth["truth"].tolist(),
    df_merged_with_truth["FILTER_PASS_mutect"].tolist(),
).ravel()
(tn, fp, fn, tp)

(5768887, 10009, 58, 4256)

In [37]:
# VarDict PASS flag versus truth, unpacked as (tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(
    df_merged_with_truth["truth"].tolist(),
    df_merged_with_truth["FILTER_PASS_vardict"].tolist(),
).ravel()
(tn, fp, fn, tp)

(5757671, 21225, 70, 4244)

In [38]:
# VarScan PASS flag versus truth, unpacked as (tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(
    df_merged_with_truth["truth"].tolist(),
    df_merged_with_truth["FILTER_PASS_varscan"].tolist(),
).ravel()
(tn, fp, fn, tp)

(5725010, 53886, 1375, 2939)

In [39]:
def predict_label_three_best(row):
    """Rule-based consensus call for one position.

    `row` is indexed by the four FILTER_PASS_* column names. The position is
    called True only when Mutect2 passed it and at least two of freebayes,
    VarDict and VarScan passed it as well; otherwise False.
    """
    supporting_callers = (
        "FILTER_PASS_freebayes",
        "FILTER_PASS_vardict",
        "FILTER_PASS_varscan",
    )
    # Mutect2 acts as a veto; the other flags are not even read when it is False.
    if row["FILTER_PASS_mutect"] == False:
        return False
    votes = sum(int(row[caller]) for caller in supporting_callers)
    return votes >= 2

In [40]:
# Consensus rule: Mutect2 must pass AND at least two of the other three callers
# must pass. This is the same rule predict_label_three_best applies per row,
# written as a column-wise boolean expression so it does not make one Python
# call for each of the millions of rows.
supporting_votes = (
    df_merged_with_truth[["FILTER_PASS_freebayes", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]]
    .astype(bool)
    .sum(axis=1)
)
df_merged_with_truth["three_best_mutect_true"] = (
    df_merged_with_truth["FILTER_PASS_mutect"].astype(bool) & (supporting_votes >= 2)
)

In [41]:
from sklearn.model_selection import train_test_split

# 80/20 split of the four PASS flags (features) and the truth label, with a
# fixed seed. The split is not stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    df_merged_with_truth[
        [
            "FILTER_PASS_mutect",
            "FILTER_PASS_freebayes",
            "FILTER_PASS_vardict",
            "FILTER_PASS_varscan",
        ]
    ],
    df_merged_with_truth["truth"],
    test_size=0.2,
    random_state=42,
)
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(4626568, 4)
(1156642, 4)
(4626568,)
(1156642,)


In [42]:
from pgmpy.models import BayesianNetwork

In [43]:
# Network structure: each caller's PASS flag is a direct parent of "truth".
# Edges from CHROM, POS, QUAL_freebayes and QUAL_vardict were tried earlier and
# are currently left out.
bn = BayesianNetwork(
    [
        (caller_flag, "truth")
        for caller_flag in (
            "FILTER_PASS_mutect",
            "FILTER_PASS_freebayes",
            "FILTER_PASS_vardict",
            "FILTER_PASS_varscan",
        )
    ]
)

In [44]:
from pgmpy.estimators import MaximumLikelihoodEstimator

In [45]:
# Training table: the four PASS flags plus the label, aligned on the row index.
train_df = X_train[
    ["FILTER_PASS_mutect", "FILTER_PASS_freebayes", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]
].assign(truth=y_train)
print(train_df.shape)
train_df.head()

(4626568, 5)


Unnamed: 0,FILTER_PASS_mutect,FILTER_PASS_freebayes,FILTER_PASS_vardict,FILTER_PASS_varscan,truth
2187294,False,False,False,False,False
3817528,False,False,False,False,False
1241520,False,False,False,False,False
1563987,False,False,False,False,False
3695583,False,False,False,False,False


In [46]:
# Estimate the network's conditional probability tables from the training rows
# using maximum-likelihood estimation.
bn.fit(train_df, estimator=MaximumLikelihoodEstimator)

In [47]:
# Show the fitted conditional probability table of the "truth" node.
print(bn.get_cpds("truth"))

+-----------------------+-----+-----------------------------+
| FILTER_PASS_freebayes | ... | FILTER_PASS_freebayes(True) |
+-----------------------+-----+-----------------------------+
| FILTER_PASS_mutect    | ... | FILTER_PASS_mutect(True)    |
+-----------------------+-----+-----------------------------+
| FILTER_PASS_vardict   | ... | FILTER_PASS_vardict(True)   |
+-----------------------+-----+-----------------------------+
| FILTER_PASS_varscan   | ... | FILTER_PASS_varscan(True)   |
+-----------------------+-----+-----------------------------+
| truth(False)          | ... | 0.021120689655172414        |
+-----------------------+-----+-----------------------------+
| truth(True)           | ... | 0.9788793103448276          |
+-----------------------+-----+-----------------------------+


In [48]:
# Held-out table: the four PASS flags plus the label, aligned on the row index.
test_df = X_test[
    ["FILTER_PASS_mutect", "FILTER_PASS_freebayes", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]
].assign(truth=y_test)
print(test_df.shape)
test_df.head()

(1156642, 5)


Unnamed: 0,FILTER_PASS_mutect,FILTER_PASS_freebayes,FILTER_PASS_vardict,FILTER_PASS_varscan,truth
2043465,False,False,False,False,False
296498,False,False,False,False,False
5488717,False,False,False,False,False
2020585,False,False,False,False,False
2066775,False,False,False,False,False


In [49]:
from pgmpy.inference import VariableElimination
from ipywidgets import FloatProgress

In [50]:
# Inference engine over the fitted network; get_preds reads this global.
bn_infer = VariableElimination(bn)

In [51]:
from IPython.display import clear_output

In [52]:
def get_preds(row):
    """Predict the truth label for one row from the four caller PASS flags.

    Queries the module-level `bn_infer` engine for the distribution of "truth"
    given the row's FILTER_PASS_* values and returns True when the entry at
    index 1 of the resulting factor exceeds 0.5, else False.
    """
    evidence = {
        flag: row[flag]
        for flag in (
            "FILTER_PASS_freebayes",
            "FILTER_PASS_mutect",
            "FILTER_PASS_vardict",
            "FILTER_PASS_varscan",
        )
    }
    posterior = bn_infer.query(variables=["truth"], evidence=evidence, show_progress=False)
    # NOTE(review): index 1 is presumably the truth=True state; confirm the
    # state ordering of the fitted network.
    return bool(posterior.values[1] > 0.5)

In [53]:
# get_preds reads only the four PASS flags, so rows with the same flag
# combination always get the same prediction. Query the network once per
# distinct combination (at most 16 for boolean flags) and broadcast the answers
# back, instead of running one inference query per test row.
evidence_cols = [
    "FILTER_PASS_freebayes",
    "FILTER_PASS_mutect",
    "FILTER_PASS_vardict",
    "FILTER_PASS_varscan",
]
unique_evidence = test_df[evidence_cols].drop_duplicates()
unique_evidence = unique_evidence.assign(preds=unique_evidence.apply(get_preds, axis=1))
# A left merge keeps the row order of test_df, so the values can be assigned
# positionally.
test_df["preds"] = (
    test_df[evidence_cols]
    .merge(unique_evidence, on=evidence_cols, how="left")["preds"]
    .to_numpy()
)

#### Metric evaluation

In [54]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [55]:
from sklearn.metrics import classification_report

In [56]:
# Report for the rule-based consensus column, computed on the full table
# (not only on the held-out split).
print(classification_report(df_merged_with_truth["truth"], df_merged_with_truth["three_best_mutect_true"]))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   5778896
        True       0.88      0.97      0.93      4314

    accuracy                           1.00   5783210
   macro avg       0.94      0.99      0.96   5783210
weighted avg       1.00      1.00      1.00   5783210



In [57]:
# Report for the Bayesian-network predictions on the held-out test split only.
print(classification_report(test_df["truth"], test_df["preds"]))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1155762
        True       0.92      0.97      0.95       880

    accuracy                           1.00   1156642
   macro avg       0.96      0.99      0.97   1156642
weighted avg       1.00      1.00      1.00   1156642



In [58]:
# Final size of the merged table, including the added truth and consensus columns.
df_merged_with_truth.shape

(5783210, 34)