In [1]:
import pandas as pd
import numpy as np

In [2]:
import allel # ref - https://scikit-allel.readthedocs.io/en/stable/io.html#variant-call-format-vcf

In [3]:
# Folder and file names of the four per-caller VCF files that are loaded in the
# cells below (one gzipped VCF per variant caller).
vcf_folder = "../dataset/"
vcf_file_freebayes = "real1-freebayes.vcf.gz"
vcf_file_mutect = "real1-mutect2.vcf.gz"
vcf_file_vardict = "real1-vardict.vcf.gz"
vcf_file_varscan = "real1-varscan.vcf.gz"

In [4]:
# Load the FreeBayes calls into a DataFrame. As the preview in the next cell shows,
# the frame has CHROM, POS, ID, REF, ALT_1..ALT_3, QUAL and FILTER_PASS columns.
df_freebayes = allel.vcf_to_dataframe(vcf_folder + vcf_file_freebayes)

In [5]:
df_freebayes.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,10177,rs367896724,A,AC,,,46.599998,False
1,1,10583,rs58108140,G,A,,,271.100006,False
2,1,12783,rs62635284,G,A,,,2044.599976,False
3,1,13116,rs62635286,T,G,,,1762.699951,False
4,1,13118,rs62028691,A,G,,,1762.699951,False


In [6]:
df_mutect = allel.vcf_to_dataframe(vcf_folder + vcf_file_mutect)
df_mutect.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,13110,rs540538026,G,A,,,,True
1,1,13649,rs879707275,G,C,,,,False
2,1,13813,.,T,G,,,,False
3,1,13838,rs28428499,C,T,,,,False
4,1,15015,.,G,C,,,,True


In [7]:
df_vardict = allel.vcf_to_dataframe(vcf_folder + vcf_file_vardict)
df_vardict.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,10230,rs775928745,AC,A,,,106.0,False
1,1,10231,rs200279319,C,A,,,75.0,False
2,1,10250,.,ACCCTA,CCCT,,,42.0,False
3,1,10583,rs58108140,G,A,,,122.0,False
4,1,12783,rs62635284,G,A,,,190.0,False


In [8]:
df_varscan = allel.vcf_to_dataframe(vcf_folder + vcf_file_varscan)
df_varscan.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,10146,rs779258992,AC,A,,,0.0,False
1,1,10153,.,A,AC,,,0.0,False
2,1,10177,rs367896724,A,AC,,,0.0,False
3,1,10230,rs775928745,AC,A,,,0.0,False
4,1,10237,.,A,AC,,,0.0,False


In [9]:
print(f"Real freebayes shape = {df_freebayes.shape}")
print(f"Real Mutect shape = {df_mutect.shape}")
print(f"Real Vardict shape = {df_vardict.shape}")
print(f"Real Varscan shape = {df_varscan.shape}")

Real freebayes shape = (4990758, 9)
Real Mutect shape = (112953, 9)
Real Vardict shape = (4830315, 9)
Real Varscan shape = (4920452, 9)


In [49]:
# Chromosome/contig names that appear in the calls of every one of the four callers.
common_chrom_names = set.intersection(
    *(
        set(calls.CHROM.unique())
        for calls in (df_freebayes, df_mutect, df_vardict, df_varscan)
    )
)

In [52]:
set(df_freebayes.CHROM.unique()) - common_chrom_names

{'GL000191.1',
 'GL000196.1',
 'GL000197.1',
 'GL000200.1',
 'GL000201.1',
 'GL000202.1',
 'GL000203.1',
 'GL000206.1',
 'GL000207.1',
 'GL000209.1',
 'GL000210.1',
 'GL000215.1',
 'GL000223.1',
 'GL000227.1',
 'GL000238.1',
 'GL000242.1',
 'GL000246.1',
 'GL000249.1'}

In [53]:
set(df_mutect.CHROM.unique()) - common_chrom_names

{'MT'}

In [54]:
set(df_vardict.CHROM.unique()) - common_chrom_names

{'GL000191.1',
 'GL000196.1',
 'GL000197.1',
 'GL000201.1',
 'GL000202.1',
 'GL000203.1',
 'GL000206.1',
 'GL000207.1',
 'GL000209.1',
 'GL000210.1',
 'GL000215.1',
 'GL000223.1',
 'GL000227.1',
 'GL000238.1',
 'GL000242.1',
 'GL000246.1',
 'GL000249.1'}

In [55]:
set(df_varscan.CHROM.unique()) - common_chrom_names

{'GL000191.1',
 'GL000196.1',
 'GL000197.1',
 'GL000200.1',
 'GL000201.1',
 'GL000202.1',
 'GL000203.1',
 'GL000206.1',
 'GL000207.1',
 'GL000209.1',
 'GL000210.1',
 'GL000215.1',
 'GL000223.1',
 'GL000227.1',
 'GL000238.1',
 'GL000242.1',
 'GL000246.1',
 'GL000249.1',
 'MT'}

In [10]:
# Outer-join the FreeBayes and Mutect2 calls on (CHROM, POS); the suffixes tag every
# other (overlapping) column with the caller it came from.
# NOTE(review): REF/ALT are not part of the join key, so a position that a caller
# lists more than once would be paired with every matching row of the other frame —
# confirm that positions are unique per caller or that this is intended.
m1 = pd.merge(df_freebayes, df_mutect, on = ["CHROM", "POS"], how="outer", suffixes = ("_freebayes", "_mutect"))

In [11]:
m1.shape

(5097215, 16)

In [12]:
# m1's columns already carry caller suffixes, so VarDict's columns come out of the
# merge unsuffixed; they are tagged explicitly in the same expression.
m2 = m1.merge(df_vardict, how="outer", on=["CHROM", "POS"]).rename(
    columns={
        "ID": "ID_vardict",
        "REF": "REF_vardict",
        "ALT_1": "ALT_1_vardict",
        "ALT_2": "ALT_2_vardict",
        "ALT_3": "ALT_3_vardict",
        "QUAL": "QUAL_vardict",
        "FILTER_PASS": "FILTER_PASS_vardict",
    }
)

In [13]:
m2.head()

Unnamed: 0,CHROM,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,FILTER_PASS_freebayes,ID_mutect,...,ALT_3_mutect,QUAL_mutect,FILTER_PASS_mutect,ID_vardict,REF_vardict,ALT_1_vardict,ALT_2_vardict,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict
0,1,10177,rs367896724,A,AC,,,46.599998,False,,...,,,,,,,,,,
1,1,10583,rs58108140,G,A,,,271.100006,False,,...,,,,rs58108140,G,A,,,122.0,False
2,1,12783,rs62635284,G,A,,,2044.599976,False,,...,,,,rs62635284,G,A,,,190.0,False
3,1,13116,rs62635286,T,G,,,1762.699951,False,,...,,,,rs62635286,T,G,,,230.0,False
4,1,13118,rs62028691,A,G,,,1762.699951,False,,...,,,,rs62028691,A,G,,,233.0,False


In [14]:
m2.shape

(5412760, 23)

In [15]:
# m2's columns already carry caller suffixes, so VarScan's columns come out of the
# merge unsuffixed; they are tagged explicitly in the same expression.
m3 = m2.merge(df_varscan, how="outer", on=["CHROM", "POS"]).rename(
    columns={
        "ID": "ID_varscan",
        "REF": "REF_varscan",
        "ALT_1": "ALT_1_varscan",
        "ALT_2": "ALT_2_varscan",
        "ALT_3": "ALT_3_varscan",
        "QUAL": "QUAL_varscan",
        "FILTER_PASS": "FILTER_PASS_varscan",
    }
)
m3.shape

(5790219, 30)

In [16]:
m3.head()

Unnamed: 0,CHROM,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,FILTER_PASS_freebayes,ID_mutect,...,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan
0,1,10177,rs367896724,A,AC,,,46.599998,False,,...,,,,rs367896724,A,AC,,,0.0,False
1,1,10583,rs58108140,G,A,,,271.100006,False,,...,,122.0,False,rs58108140,G,A,,,0.0,False
2,1,12783,rs62635284,G,A,,,2044.599976,False,,...,,190.0,False,rs62635284,G,A,,,0.0,False
3,1,13116,rs62635286,T,G,,,1762.699951,False,,...,,230.0,False,,,,,,,
4,1,13118,rs62028691,A,G,,,1762.699951,False,,...,,233.0,False,,,,,,,


In [17]:
m3.head()[["CHROM", "POS", "FILTER_PASS_freebayes", "FILTER_PASS_mutect", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]]

Unnamed: 0,CHROM,POS,FILTER_PASS_freebayes,FILTER_PASS_mutect,FILTER_PASS_vardict,FILTER_PASS_varscan
0,1,10177,False,,,False
1,1,10583,False,,False,False
2,1,12783,False,,False,False
3,1,13116,False,,False,
4,1,13118,False,,False,


In [18]:
df_merged = m3

In [19]:
# add third category instead of False?
df_merged["FILTER_PASS_freebayes"].fillna(False, inplace=True)
df_merged["FILTER_PASS_vardict"].fillna(False, inplace=True)
df_merged["FILTER_PASS_mutect"].fillna(False, inplace=True)
df_merged["FILTER_PASS_varscan"].fillna(False, inplace=True)
df_merged.head()[["CHROM", "POS", "FILTER_PASS_freebayes", "FILTER_PASS_mutect", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]]

Unnamed: 0,CHROM,POS,FILTER_PASS_freebayes,FILTER_PASS_mutect,FILTER_PASS_vardict,FILTER_PASS_varscan
0,1,10177,False,False,False,False
1,1,10583,False,False,False,False
2,1,12783,False,False,False,False
3,1,13116,False,False,False,False
4,1,13118,False,False,False,False


In [20]:
# The truth set is read as a headerless, tab-separated file and its columns are
# named right after loading.
truth = pd.read_csv(
    vcf_folder + "real1_truth.bed",
    sep="\t",
    header=None,
).set_axis(["CHROM", "POS_START", "POS_END"], axis="columns")
truth.head()

Unnamed: 0,CHROM,POS_START,POS_END
0,1,2180985,2180985
1,1,5035185,5035185
2,1,8881322,8881322
3,1,8929624,8929624
4,1,9196716,9196716


In [21]:
(truth.POS_START == truth.POS_END).sum()

1319

In [22]:
truth.shape

(1319, 3)

In [23]:
truth.merge(df_merged, left_on = ["CHROM", "POS_START"], right_on = ["CHROM", "POS"])
# This result has fewer rows than the truth file, which indicates that some true
# variants were not reported by any of the four variant callers.

Unnamed: 0,CHROM,POS_START,POS_END,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,...,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan
0,1,2180985,2180985,2180985,.,A,G,,,1301.400024,...,,193.0,True,.,A,G,,,0.0,False
1,1,5035185,5035185,5035185,rs578164071,C,T,,,279.399994,...,,130.0,True,rs578164071,C,T,,,0.0,True
2,1,8881322,8881322,8881322,.,G,A,,,123.900002,...,,93.0,True,.,G,A,,,0.0,False
3,1,8929624,8929624,8929624,.,A,G,,,93.900002,...,,86.0,True,.,A,G,,,0.0,False
4,1,9196716,9196716,9196716,,,,,,,...,,47.0,True,.,C,T,,,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1303,X,141680928,141680928,141680928,rs1047604339,C,A,,,39.400002,...,,62.0,True,rs1047604339,C,A,,,0.0,True
1304,X,142275497,142275497,142275497,.,T,C,,,360.600006,...,,131.0,True,.,T,C,,,0.0,False
1305,X,144737796,144737796,144737796,.,C,G,,,112.199997,...,,97.0,True,.,C,G,,,0.0,True
1306,Y,15154839,15154839,15154839,.,G,C,,,379.500000,...,,127.0,True,.,G,C,,,0.0,False


In [24]:
# Outer merge: keeps every called position as well as truth positions that have no
# matching call (those rows end up with NaN in POS); rows without a truth match get
# NaN in POS_START / POS_END.
df_merged_with_truth = pd.merge(truth, df_merged, left_on=["CHROM", "POS_START"], right_on = ["CHROM", "POS"], how="outer")
df_merged_with_truth.head()

Unnamed: 0,CHROM,POS_START,POS_END,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,...,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan
0,1,2180985.0,2180985.0,2180985.0,.,A,G,,,1301.400024,...,,193.0,True,.,A,G,,,0.0,False
1,1,5035185.0,5035185.0,5035185.0,rs578164071,C,T,,,279.399994,...,,130.0,True,rs578164071,C,T,,,0.0,True
2,1,8881322.0,8881322.0,8881322.0,.,G,A,,,123.900002,...,,93.0,True,.,G,A,,,0.0,False
3,1,8929624.0,8929624.0,8929624.0,.,A,G,,,93.900002,...,,86.0,True,.,A,G,,,0.0,False
4,1,9196716.0,9196716.0,9196716.0,,,,,,,...,,47.0,True,.,C,T,,,0.0,True


In [25]:
df_merged_with_truth.shape

(5790230, 32)

In [26]:
def set_true_label(row):
    """Return True when a truth-set position is present, False when it is missing.

    Parameters
    ----------
    row : scalar
        A single cell of the ``POS_START`` column (despite the name it is one
        value, not a whole row), as handed over by ``Series.apply``.

    Returns
    -------
    bool
        ``False`` for any missing-value marker (``NaN``, ``None``, ``pd.NA``),
        ``True`` for everything else.
    """
    # pd.isna recognises every pandas missing-value marker, whereas comparing
    # str(value) with "nan" only catches float NaN and misfires on the text "nan".
    return not pd.isna(row)

In [27]:
# A row is a true variant exactly when the truth file contributed a POS_START to it.
# notna() derives the label for the whole column at once instead of calling a Python
# function once per row.
df_merged_with_truth["truth"] = df_merged_with_truth["POS_START"].notna()
df_merged_with_truth.sum()

CHROM                    1111111111111111111111111111111111111111111111...
POS_START                                                   105629169575.0
POS_END                                                     105629169575.0
POS                                                      430122941474015.0
QUAL_freebayes                                                6039412224.0
FILTER_PASS_freebayes                                                13370
ALT_2_mutect                                                           0.0
ALT_3_mutect                                                           0.0
QUAL_mutect                                                            0.0
FILTER_PASS_mutect                                                    7133
ALT_2_vardict                                                          0.0
ALT_3_vardict                                                          0.0
QUAL_vardict                                                   717389888.0
FILTER_PASS_vardict      

In [28]:
def decide_prediction(row, min_votes=2):
    """Vote over the four callers' PASS flags.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``FILTER_PASS_freebayes``, ``FILTER_PASS_mutect``,
        ``FILTER_PASS_vardict`` and ``FILTER_PASS_varscan``.
    min_votes : int, default 2
        Number of callers that must have flagged the position as PASS.

    Returns
    -------
    bool
        ``True`` when at least ``min_votes`` flags are set.
    """
    caller_columns = (
        "FILTER_PASS_freebayes",
        "FILTER_PASS_mutect",
        "FILTER_PASS_vardict",
        "FILTER_PASS_varscan",
    )
    votes = 0
    for column in caller_columns:
        flag = row[column]
        # Count explicitly instead of adding the flags: numpy booleans add as
        # logical OR (the sum would never exceed 1), and a single NaN would turn
        # the whole sum into NaN. Missing flags count as "did not pass".
        if pd.notna(flag) and bool(flag):
            votes += 1
    return votes >= min_votes

In [29]:
# Row-wise vote over the callers; the function is handed to apply directly.
df_merged_with_truth["preds"] = df_merged_with_truth.apply(decide_prediction, axis=1)

In [30]:
df_merged_with_truth["preds"].sum()

7867

In [31]:
# Index labels of the rows whose POS is NaN, i.e. rows that came only from the truth
# file in the outer merge and have no matching call.
extras_in_truth = df_merged_with_truth[df_merged_with_truth.POS.isna()].index

In [32]:
# Remove the truth-only rows in place. Going by the recorded shapes this drops 11 rows
# (5790230 -> 5790219), so the truth variants that no caller reported are excluded
# from the confusion matrices and recall figures computed further down.
# NOTE(review): the drop mutates the frame, so re-running only this cell would look
# for labels that are already gone — re-run from the merge cell instead.
df_merged_with_truth.drop(extras_in_truth, inplace=True)

In [37]:
df_merged_with_truth.isna().sum()

CHROM                          0
POS_START                5788911
POS_END                  5788911
POS                            0
ID_freebayes              799461
REF_freebayes             799461
ALT_1_freebayes           799461
ALT_2_freebayes          5662164
ALT_3_freebayes          5780010
QUAL_freebayes            799462
FILTER_PASS_freebayes          0
ID_mutect                5677201
REF_mutect               5677201
ALT_1_mutect             5677201
ALT_2_mutect             5790219
ALT_3_mutect             5790219
QUAL_mutect              5790219
FILTER_PASS_mutect             0
ID_vardict                933993
REF_vardict               933993
ALT_1_vardict             933993
ALT_2_vardict            5790219
ALT_3_vardict            5790219
QUAL_vardict              934423
FILTER_PASS_vardict            0
ID_varscan                844188
REF_varscan               844188
ALT_1_varscan             844188
ALT_2_varscan            5790219
ALT_3_varscan            5790219
QUAL_varsc

#### Confusion matrix comparing against truth

In [35]:
from sklearn.metrics import confusion_matrix

In [40]:
# Truth labels vs. FreeBayes' PASS flag. For binary labels the flattened confusion
# matrix is ordered tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(list(df_merged_with_truth.truth), list(df_merged_with_truth.FILTER_PASS_freebayes)).ravel()
tn, fp, fn, tp

(5776680, 12231, 169, 1139)

In [41]:
# least false positives
tn, fp, fn, tp = confusion_matrix(list(df_merged_with_truth.truth), list(df_merged_with_truth.FILTER_PASS_mutect)).ravel()
tn, fp, fn, tp

(5782998, 5913, 88, 1220)

In [42]:
# Truth labels vs. VarDict's PASS flag (flattened order: tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(list(df_merged_with_truth.truth), list(df_merged_with_truth.FILTER_PASS_vardict)).ravel()
tn, fp, fn, tp

(5770963, 17948, 161, 1147)

In [43]:
# Truth labels vs. VarScan's PASS flag (flattened order: tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(list(df_merged_with_truth.truth), list(df_merged_with_truth.FILTER_PASS_varscan)).ravel()
tn, fp, fn, tp

(5725961, 62950, 1057, 251)

In [56]:
# Truth labels vs. the "at least two callers PASS" vote stored in the preds column
# (flattened order: tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(list(df_merged_with_truth.truth), list(df_merged_with_truth.preds)).ravel()
tn, fp, fn, tp

(5782257, 6654, 95, 1213)

In [62]:
from scipy.stats import chi2_contingency

# Chi-square statistic of each caller's truth-vs-PASS confusion matrix; only the
# statistic (the first element of the result) is kept.
truth_labels = list(df_merged_with_truth.truth)
chi2_fb = chi2_contingency(
    confusion_matrix(truth_labels, list(df_merged_with_truth.FILTER_PASS_freebayes))
)[0]
chi2_mu = chi2_contingency(
    confusion_matrix(truth_labels, list(df_merged_with_truth.FILTER_PASS_mutect))
)[0]
chi2_vd = chi2_contingency(
    confusion_matrix(truth_labels, list(df_merged_with_truth.FILTER_PASS_vardict))
)[0]
chi2_vs = chi2_contingency(
    confusion_matrix(truth_labels, list(df_merged_with_truth.FILTER_PASS_varscan))
)[0]

In [63]:
def predict_label(row):
    """Weighted vote of the four callers' PASS flags.

    Each caller's weight is its chi-square statistic divided by the sum of all four
    statistics, read from the module-level ``chi2_fb``, ``chi2_mu``, ``chi2_vd`` and
    ``chi2_vs`` on every call.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the four ``FILTER_PASS_*`` flags.

    Returns
    -------
    bool
        ``True`` when the weighted sum of the flags is strictly greater than 0.5.
    """
    total_chi2 = chi2_fb + chi2_mu + chi2_vd + chi2_vs
    # Terms are added in the order freebayes, mutect, vardict, varscan.
    weighted_vote = (
        ((chi2_fb / total_chi2) * row["FILTER_PASS_freebayes"])
        + ((chi2_mu / total_chi2) * row["FILTER_PASS_mutect"])
        + ((chi2_vd / total_chi2) * row["FILTER_PASS_vardict"])
        + ((chi2_vs / total_chi2) * row["FILTER_PASS_varscan"])
    )
    return bool(weighted_vote > 0.5)

In [67]:
# Row-wise chi-square-weighted vote over the four PASS flags.
# NOTE(review): in the recorded outputs the confusion matrix of this column
# ([[5782998, 5913], [88, 1220]]) is identical to the one for FILTER_PASS_mutect
# alone, which suggests Mutect2's weight exceeds 0.5 and decides the vote by itself —
# worth confirming.
df_merged_with_truth["weighted_preds_chi2_based"] = df_merged_with_truth[["FILTER_PASS_freebayes", "FILTER_PASS_mutect", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]].apply(predict_label, axis = 1)

In [70]:
def predict_label_three_best(row):
    """Require Mutect2's PASS flag plus at least two of the other three callers.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the four ``FILTER_PASS_*`` flags.

    Returns
    -------
    bool
        ``False`` when ``FILTER_PASS_mutect`` equals ``False``; otherwise whether at
        least two of the freebayes / vardict / varscan flags are set.
    """
    mutect_flag = row["FILTER_PASS_mutect"]
    if mutect_flag == False:
        return False
    supporting_callers = 0
    for column in ("FILTER_PASS_freebayes", "FILTER_PASS_vardict", "FILTER_PASS_varscan"):
        # int() turns each flag into 0/1 before it is counted.
        supporting_callers += int(row[column])
    return supporting_callers >= 2

In [71]:
# Row-wise rule: Mutect2 must PASS and at least two of the other three callers too.
df_merged_with_truth["three_best_mutect_true"] = df_merged_with_truth[["FILTER_PASS_freebayes", "FILTER_PASS_mutect", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]].apply(predict_label_three_best, axis = 1)

In [74]:
def predict_label_four_best(row):
    """Unanimous vote: True only when all four callers flag the position as PASS.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the four ``FILTER_PASS_*`` flags. The flags are expected to be
        free of NaN (the notebook fills missing flags with ``False`` beforehand).

    Returns
    -------
    bool
        ``False`` when ``FILTER_PASS_mutect`` equals ``False``; otherwise whether all
        of the freebayes / vardict / varscan flags are set.
    """
    if row["FILTER_PASS_mutect"] == False:
        return False
    # Convert each flag with int() before adding, as predict_label_three_best does:
    # numpy booleans add as logical OR, so np.True_ + np.True_ + np.True_ stays
    # True and the comparison with 3 could never succeed.
    votes = (
        int(row["FILTER_PASS_freebayes"])
        + int(row["FILTER_PASS_vardict"])
        + int(row["FILTER_PASS_varscan"])
    )
    return votes >= 3

In [75]:
# Row-wise rule: Mutect2 must PASS and all three of the other callers too.
df_merged_with_truth["four_best_mutect_true"] = df_merged_with_truth[["FILTER_PASS_freebayes", "FILTER_PASS_mutect", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]].apply(predict_label_four_best, axis = 1)

#### Chromosome based variations

We split the dataset by chromosome to see whether any variant-calling method tends to perform better on specific chromosomes.

In [59]:
chr1_df = df_merged_with_truth[df_merged_with_truth.CHROM == "1"]
# One flattened confusion matrix per caller for chromosome 1, printed in the order
# freebayes, mutect, vardict, varscan.
for pass_column in (
    "FILTER_PASS_freebayes",
    "FILTER_PASS_mutect",
    "FILTER_PASS_vardict",
    "FILTER_PASS_varscan",
):
    print(confusion_matrix(list(chr1_df.truth), list(chr1_df[pass_column])).ravel())

[439964   1027     13     83]
[440403    588      6     90]
[439561   1430     12     84]
[436313   4678     74     22]


#### Metric evaluation

In [73]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Scores of the "at least two callers PASS" vote (preds) against the truth labels.
y_true = df_merged_with_truth["truth"]
y_pred = df_merged_with_truth["preds"]
print(f'F1 score = {f1_score(y_true, y_pred)}')
print(f'Precision = {precision_score(y_true, y_pred)}')
print(f'Recall = {recall_score(y_true, y_pred)}')

F1 score = 0.26441416893732966
Precision = 0.15418838184822678
Recall = 0.9273700305810397


In [65]:
from sklearn.metrics import classification_report

In [75]:
print(classification_report(df_merged_with_truth["truth"], df_merged_with_truth["preds"]))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   5788911
        True       0.15      0.93      0.26      1308

    accuracy                           1.00   5790219
   macro avg       0.58      0.96      0.63   5790219
weighted avg       1.00      1.00      1.00   5790219



In [68]:
print(classification_report(df_merged_with_truth["truth"], df_merged_with_truth["weighted_preds_chi2_based"]))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   5788911
        True       0.17      0.93      0.29      1308

    accuracy                           1.00   5790219
   macro avg       0.59      0.97      0.64   5790219
weighted avg       1.00      1.00      1.00   5790219



In [69]:
confusion_matrix(list(df_merged_with_truth["truth"]), list(df_merged_with_truth["weighted_preds_chi2_based"]))

array([[5782998,    5913],
       [     88,    1220]])

In [72]:
print(classification_report(df_merged_with_truth["truth"], df_merged_with_truth["three_best_mutect_true"]))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   5788911
        True       0.72      0.83      0.78      1308

    accuracy                           1.00   5790219
   macro avg       0.86      0.92      0.89   5790219
weighted avg       1.00      1.00      1.00   5790219



In [73]:
confusion_matrix(list(df_merged_with_truth["truth"]), list(df_merged_with_truth["three_best_mutect_true"]))

array([[5788493,     418],
       [    216,    1092]])

In [76]:
print(classification_report(df_merged_with_truth["truth"], df_merged_with_truth["four_best_mutect_true"]))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   5788911
        True       0.65      0.13      0.22      1308

    accuracy                           1.00   5790219
   macro avg       0.83      0.57      0.61   5790219
weighted avg       1.00      1.00      1.00   5790219



In [77]:
confusion_matrix(list(df_merged_with_truth["truth"]), list(df_merged_with_truth["four_best_mutect_true"]))

array([[5788818,      93],
       [   1134,     174]])