In [1]:
import pandas as pd
import numpy as np

In [2]:
import allel # ref - https://scikit-allel.readthedocs.io/en/stable/io.html#variant-call-format-vcf

In [3]:
vcf_folder = "../dataset/"
vcf_file_freebayes = "syn3-freebayes.vcf.gz"
vcf_file_mutect = "syn3-mutect2.vcf.gz"
vcf_file_vardict = "syn3-vardict.vcf.gz"
vcf_file_varscan = "syn3-varscan.vcf.gz"

In [4]:
df_freebayes = allel.vcf_to_dataframe(vcf_folder + vcf_file_freebayes)

In [5]:
df_freebayes.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,10177,rs367896724,A,AC,,,233.199997,False
1,1,10352,rs555500075;rs145072688,T,TA,,,330.600006,False
2,1,10583,rs58108140,G,A,,,97.699997,False
3,1,10623,.,T,C,,,9.6,True
4,1,10649,.,G,A,,,16.700001,False


In [6]:
df_mutect = allel.vcf_to_dataframe(vcf_folder + vcf_file_mutect)
df_mutect.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,137221,.,T,G,,,,False
1,1,237762,.,G,A,,,,False
2,1,252853,rs9699802,T,C,,,,False
3,1,252873,rs9700554,G,A,,,,False
4,1,526346,.,C,A,,,,False


In [7]:
df_vardict = allel.vcf_to_dataframe(vcf_folder + vcf_file_vardict)
df_vardict.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,10231,rs200279319,C,A,,,111.0,False
1,1,10329,rs150969722,AC,A,,,160.0,False
2,1,10352,rs555500075;rs145072688,T,TA,,,148.0,False
3,1,10469,rs370233998,C,G,,,83.0,False
4,1,10519,rs62636508,G,C,,,70.0,False


In [8]:
df_varscan = allel.vcf_to_dataframe(vcf_folder + vcf_file_varscan)
df_varscan.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,10177,rs367896724,A,AC,,,0.0,False
1,1,10230,rs775928745,AC,A,,,0.0,False
2,1,10234,rs145599635,C,T,,,0.0,False
3,1,10247,.,TA,T,,,0.0,False
4,1,10248,rs148908337,A,T,,,0.0,False


In [9]:
print(f"Real freebayes shape = {df_freebayes.shape}")
print(f"Real Mutect shape = {df_mutect.shape}")
print(f"Real Vardict shape = {df_vardict.shape}")
print(f"Real Varscan shape = {df_varscan.shape}")

Real freebayes shape = (4869534, 9)
Real Mutect shape = (120023, 9)
Real Vardict shape = (4911379, 9)
Real Varscan shape = (4702513, 9)


In [10]:
m1 = pd.merge(df_freebayes, df_mutect, on = ["CHROM", "POS"], how="outer", suffixes = ("_freebayes", "_mutect"))

In [11]:
m1.shape

(4971952, 16)

In [12]:
m2 = pd.merge(m1, df_vardict, on = ["CHROM", "POS"], how="outer")
m2.rename(columns={"ID": "ID_vardict", "REF": "REF_vardict", "ALT_1": "ALT_1_vardict", "ALT_2": "ALT_2_vardict", \
                  "ALT_3": "ALT_3_vardict", "QUAL": "QUAL_vardict", "FILTER_PASS": "FILTER_PASS_vardict"}, inplace=True)

In [13]:
m2.head()

Unnamed: 0,CHROM,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,FILTER_PASS_freebayes,ID_mutect,...,ALT_3_mutect,QUAL_mutect,FILTER_PASS_mutect,ID_vardict,REF_vardict,ALT_1_vardict,ALT_2_vardict,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict
0,1,10177,rs367896724,A,AC,,,233.199997,False,,...,,,,,,,,,,
1,1,10352,rs555500075;rs145072688,T,TA,,,330.600006,False,,...,,,,rs555500075;rs145072688,T,TA,,,148.0,False
2,1,10583,rs58108140,G,A,,,97.699997,False,,...,,,,rs58108140,G,A,,,92.0,False
3,1,10623,.,T,C,,,9.6,True,,...,,,,.,T,C,,,67.0,False
4,1,10649,.,G,A,,,16.700001,False,,...,,,,.,G,A,,,66.0,False


In [14]:
m2.shape

(5332326, 23)

In [15]:
m3 = pd.merge(m2, df_varscan, on = ["CHROM", "POS"], how="outer")
m3.rename(columns={"ID": "ID_varscan", "REF": "REF_varscan", "ALT_1": "ALT_1_varscan", "ALT_2": "ALT_2_varscan", \
                  "ALT_3": "ALT_3_varscan", "QUAL": "QUAL_varscan", "FILTER_PASS": "FILTER_PASS_varscan"}, inplace=True)
m3.shape

(5633087, 30)

In [16]:
m3.head()

Unnamed: 0,CHROM,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,FILTER_PASS_freebayes,ID_mutect,...,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan
0,1,10177,rs367896724,A,AC,,,233.199997,False,,...,,,,rs367896724,A,AC,,,0.0,False
1,1,10352,rs555500075;rs145072688,T,TA,,,330.600006,False,,...,,148.0,False,rs555500075;rs145072688,T,TA,,,0.0,False
2,1,10583,rs58108140,G,A,,,97.699997,False,,...,,92.0,False,,,,,,,
3,1,10623,.,T,C,,,9.6,True,,...,,67.0,False,,,,,,,
4,1,10649,.,G,A,,,16.700001,False,,...,,66.0,False,,,,,,,


In [17]:
m3.head()[["CHROM", "POS", "FILTER_PASS_freebayes", "FILTER_PASS_mutect", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]]

Unnamed: 0,CHROM,POS,FILTER_PASS_freebayes,FILTER_PASS_mutect,FILTER_PASS_vardict,FILTER_PASS_varscan
0,1,10177,False,,,False
1,1,10352,False,,False,False
2,1,10583,False,,False,
3,1,10623,True,,False,
4,1,10649,False,,False,


In [18]:
df_merged = m3

In [19]:
# add third category instead of False?
df_merged["FILTER_PASS_freebayes"].fillna(False, inplace=True)
df_merged["FILTER_PASS_vardict"].fillna(False, inplace=True)
df_merged["FILTER_PASS_mutect"].fillna(False, inplace=True)
df_merged["FILTER_PASS_varscan"].fillna(False, inplace=True)
df_merged.head()[["CHROM", "POS", "FILTER_PASS_freebayes", "FILTER_PASS_mutect", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]]

Unnamed: 0,CHROM,POS,FILTER_PASS_freebayes,FILTER_PASS_mutect,FILTER_PASS_vardict,FILTER_PASS_varscan
0,1,10177,False,False,False,False
1,1,10352,False,False,False,False
2,1,10583,False,False,False,False
3,1,10623,True,False,False,False
4,1,10649,False,False,False,False


In [20]:
truth = pd.read_csv(vcf_folder + "syn3_truth.bed", sep = "\t", header = None)
truth.columns = ["CHROM", "POS_START", "POS_END"]
truth.head()

Unnamed: 0,CHROM,POS_START,POS_END
0,1,836343,836343
1,1,1259680,1259680
2,1,2165452,2165452
3,1,2361895,2361895
4,1,2427551,2427551


In [21]:
(truth.POS_START == truth.POS_END).sum()

7903

In [22]:
truth.shape

(7903, 3)

In [23]:
truth.merge(df_merged, left_on = ["CHROM", "POS_START"], right_on = ["CHROM", "POS"])
# this is smaller than the actual truth file indicating the all 4 variant callers missed to classify some true variants

Unnamed: 0,CHROM,POS_START,POS_END,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,...,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan
0,1,836343,836343,836343,.,C,A,,,399.200012,...,,147.0,True,.,C,A,,,0.0,False
1,1,1259680,1259680,1259680,.,G,A,,,77.599998,...,,90.0,True,,,,,,,False
2,1,2165452,2165452,2165452,.,A,T,,,90.800003,...,,109.0,True,.,A,T,,,0.0,True
3,1,2361895,2361895,2361895,.,T,C,,,382.299988,...,,120.0,True,.,T,C,,,0.0,False
4,1,2427551,2427551,2427551,.,A,T,,,185.399994,...,,119.0,True,.,A,T,,,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7762,X,153972172,153972172,153972172,.,T,C,,,128.100006,...,,93.0,True,.,T,C,,,0.0,False
7763,X,154251600,154251600,154251600,.,C,A,,,6.300000,...,,51.0,True,.,C,A,,,0.0,True
7764,X,154622153,154622153,154622153,.,A,C,,,156.899994,...,,110.0,True,.,A,C,,,0.0,True
7765,X,154635072,154635072,154635072,.,T,C,,,48.000000,...,,84.0,True,.,T,C,,,0.0,True


In [24]:
df_merged_with_truth = pd.merge(truth, df_merged, left_on=["CHROM", "POS_START"], right_on = ["CHROM", "POS"], how="outer")
df_merged_with_truth.head()

Unnamed: 0,CHROM,POS_START,POS_END,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,...,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan
0,1,836343.0,836343.0,836343.0,.,C,A,,,399.200012,...,,147.0,True,.,C,A,,,0.0,False
1,1,1259680.0,1259680.0,1259680.0,.,G,A,,,77.599998,...,,90.0,True,,,,,,,False
2,1,2165452.0,2165452.0,2165452.0,.,A,T,,,90.800003,...,,109.0,True,.,A,T,,,0.0,True
3,1,2361895.0,2361895.0,2361895.0,.,T,C,,,382.299988,...,,120.0,True,.,T,C,,,0.0,False
4,1,2427551.0,2427551.0,2427551.0,.,A,T,,,185.399994,...,,119.0,True,.,A,T,,,0.0,True


In [25]:
df_merged_with_truth.shape

(5633223, 32)

In [26]:
def set_true_label(row):
    if str(row) == "nan":
        return False
    else:
        return True

In [27]:
df_merged_with_truth["truth"] = df_merged_with_truth["POS_START"].apply(lambda row:set_true_label(row))
df_merged_with_truth.sum()

CHROM                    1111111111111111111111111111111111111111111111...
POS_START                                                   636609463103.0
POS_END                                                     636609463103.0
POS                                                      420676957930107.0
QUAL_freebayes                                                5242052608.0
FILTER_PASS_freebayes                                                21491
ALT_2_mutect                                                           0.0
ALT_3_mutect                                                           0.0
QUAL_mutect                                                            0.0
FILTER_PASS_mutect                                                   21983
ALT_2_vardict                                                          0.0
ALT_3_vardict                                                          0.0
QUAL_vardict                                                   660507072.0
FILTER_PASS_vardict      

In [28]:
def decide_prediction(row):
    if row["FILTER_PASS_freebayes"] + row["FILTER_PASS_mutect"] + row["FILTER_PASS_vardict"] + row["FILTER_PASS_varscan"] >= 2:
        return True
    else:
        return False

In [29]:
df_merged_with_truth["preds"] = df_merged_with_truth.apply(lambda row: decide_prediction(row), axis = 1)

In [30]:
df_merged_with_truth["preds"].sum()

19980

In [31]:
extras_in_truth = df_merged_with_truth[df_merged_with_truth.POS.isna()].index

In [32]:
df_merged_with_truth.drop(extras_in_truth, inplace=True)

In [33]:
def predict_label_three_best(row):
    if row["FILTER_PASS_mutect"] == False:
        return False
    if int(row["FILTER_PASS_freebayes"]) + int(row["FILTER_PASS_vardict"]) + int(row["FILTER_PASS_varscan"]) >= 2:
        return True
    return False
df_merged_with_truth["three_best_mutect_true"] = df_merged_with_truth[["FILTER_PASS_freebayes", "FILTER_PASS_mutect", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]].apply(predict_label_three_best, axis = 1)

In [34]:
from sklearn.metrics import f1_score, precision_score, recall_score
print(f'F1 score = {f1_score(df_merged_with_truth["truth"], df_merged_with_truth["preds"])}')
print(f'Precision = {precision_score(df_merged_with_truth["truth"], df_merged_with_truth["preds"])}')
print(f'Recall = {recall_score(df_merged_with_truth["truth"], df_merged_with_truth["preds"])}')

F1 score = 0.5397340253000324
Precision = 0.3747747747747748
Recall = 0.9640787949015064


In [35]:
from sklearn.metrics import classification_report
print(classification_report(df_merged_with_truth["truth"], df_merged_with_truth["preds"]))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   5625320
        True       0.37      0.96      0.54      7767

    accuracy                           1.00   5633087
   macro avg       0.69      0.98      0.77   5633087
weighted avg       1.00      1.00      1.00   5633087



In [36]:
print(classification_report(df_merged_with_truth["truth"], df_merged_with_truth["three_best_mutect_true"]))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   5625320
        True       0.56      0.93      0.70      7767

    accuracy                           1.00   5633087
   macro avg       0.78      0.96      0.85   5633087
weighted avg       1.00      1.00      1.00   5633087

