In [1]:
import pandas as pd
import numpy as np

In [2]:
import allel # ref - https://scikit-allel.readthedocs.io/en/stable/io.html#variant-call-format-vcf

In [3]:
# Folder that holds the input VCFs, relative to the notebook's working directory.
vcf_folder = "../dataset/"

# File name of the gzipped VCF for each caller, bound in one unpacking assignment.
(
    vcf_file_freebayes,
    vcf_file_mutect,
    vcf_file_vardict,
    vcf_file_varscan,
) = (
    "real2_freebayes_rest.vcf.gz",
    "real2_mutect_rest.vcf.gz",
    "real2_vardict_rest.vcf.gz",
    "real2_varscan_rest.vcf.gz",
)

In [4]:
# Parse the freebayes VCF into a pandas DataFrame; no `fields` argument is given,
# so scikit-allel's default field selection is used.
df_freebayes = allel.vcf_to_dataframe(vcf_folder + vcf_file_freebayes)

In [5]:
# Preview the first rows of the freebayes calls.
df_freebayes.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,6,69321,.,G,A,,,18.9,True
1,6,73924,.,AAGAGAAAG,AAAAG,A,,44.700001,False
2,6,73928,rs11242695,G,A,,,44.700001,False
3,6,86583,.,G,A,,,152.699997,False
4,6,100908,.,C,T,,,15.2,False


In [6]:
# Parse the Mutect VCF into a DataFrame and preview the first rows.
# In the preview rendered below, QUAL is empty for every Mutect row shown.
df_mutect = allel.vcf_to_dataframe(vcf_folder + vcf_file_mutect)
df_mutect.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,6,90274,.,A,G,,,,False
1,6,90285,rs396444,A,T,,,,False
2,6,101018,.,T,C,,,,False
3,6,121056,rs62392200,G,A,,,,False
4,6,215004,.,A,G,,,,False


In [7]:
# Parse the Vardict VCF into a DataFrame and preview the first rows.
df_vardict = allel.vcf_to_dataframe(vcf_folder + vcf_file_vardict)
df_vardict.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,6,69321,.,G,A,,,58.0,False
1,6,69405,rs879530989,C,T,,,97.0,False
2,6,73924,rs201634483,AAGAG,A,,,125.0,False
3,6,86583,.,G,A,,,123.0,False
4,6,88753,.,T,G,,,30.0,False


In [8]:
# Parse the Varscan VCF into a DataFrame and preview the first rows.
# In the preview rendered below, QUAL is 0.0 for every Varscan row shown.
df_varscan = allel.vcf_to_dataframe(vcf_folder + vcf_file_varscan)
df_varscan.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,6,63629,.,CAGCAGTGT,C,,,0.0,False
1,6,69321,.,G,A,,,0.0,False
2,6,69405,rs879530989,C,T,,,0.0,False
3,6,73924,rs201634483,AAGAG,A,,,0.0,False
4,6,86488,.,G,A,,,0.0,False


In [9]:
# Report (rows, columns) for each caller's frame, one line per caller.
shape_report = [
    f"Real freebayes shape = {df_freebayes.shape}",
    f"Real Mutect shape = {df_mutect.shape}",
    f"Real Vardict shape = {df_vardict.shape}",
    f"Real Varscan shape = {df_varscan.shape}",
]
print("\n".join(shape_report))

Real freebayes shape = (3202219, 9)
Real Mutect shape = (134415, 9)
Real Vardict shape = (3234298, 9)
Real Varscan shape = (3327378, 9)


In [10]:
# Outer-join freebayes and Mutect calls on genomic position. Every non-key column
# exists in both frames, so each one gets the caller's suffix.
# NOTE(review): the join key ignores REF/ALT, so several records at the same
# CHROM/POS would be paired combinatorially - confirm that is acceptable.
m1 = df_freebayes.merge(
    df_mutect,
    how="outer",
    on=["CHROM", "POS"],
    suffixes=("_freebayes", "_mutect"),
)

In [11]:
# (rows, columns) of the freebayes/Mutect outer join.
m1.shape

(3330333, 16)

In [12]:
# Outer-join the Vardict calls onto the freebayes+Mutect table. The Vardict
# columns are tagged with "_vardict" before the join; m1's columns already carry
# caller suffixes, so no names collide and no automatic suffixes are added.
vardict_column_names = {
    "ID": "ID_vardict",
    "REF": "REF_vardict",
    "ALT_1": "ALT_1_vardict",
    "ALT_2": "ALT_2_vardict",
    "ALT_3": "ALT_3_vardict",
    "QUAL": "QUAL_vardict",
    "FILTER_PASS": "FILTER_PASS_vardict",
}
m2 = m1.merge(
    df_vardict.rename(columns=vardict_column_names),
    how="outer",
    on=["CHROM", "POS"],
)

In [13]:
# Preview the three-caller table (freebayes, Mutect, Vardict).
m2.head()

Unnamed: 0,CHROM,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,FILTER_PASS_freebayes,ID_mutect,...,ALT_3_mutect,QUAL_mutect,FILTER_PASS_mutect,ID_vardict,REF_vardict,ALT_1_vardict,ALT_2_vardict,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict
0,6,69321,.,G,A,,,18.9,True,,...,,,,.,G,A,,,58.0,False
1,6,73924,.,AAGAGAAAG,AAAAG,A,,44.700001,False,,...,,,,rs201634483,AAGAG,A,,,125.0,False
2,6,73928,rs11242695,G,A,,,44.700001,False,,...,,,,,,,,,,
3,6,86583,.,G,A,,,152.699997,False,,...,,,,.,G,A,,,123.0,False
4,6,100908,.,C,T,,,15.2,False,,...,,,,.,C,T,,,81.0,False


In [14]:
# (rows, columns) after adding the Vardict calls.
m2.shape

(3558115, 23)

In [15]:
# Outer-join the Varscan calls onto the three-caller table. The Varscan columns
# are tagged with "_varscan" before the join; m2's columns already carry caller
# suffixes, so no names collide.
varscan_column_names = {
    "ID": "ID_varscan",
    "REF": "REF_varscan",
    "ALT_1": "ALT_1_varscan",
    "ALT_2": "ALT_2_varscan",
    "ALT_3": "ALT_3_varscan",
    "QUAL": "QUAL_varscan",
    "FILTER_PASS": "FILTER_PASS_varscan",
}
m3 = m2.merge(
    df_varscan.rename(columns=varscan_column_names),
    how="outer",
    on=["CHROM", "POS"],
)
m3.shape

(3846560, 30)

In [16]:
# Preview the full four-caller table.
m3.head()

Unnamed: 0,CHROM,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,FILTER_PASS_freebayes,ID_mutect,...,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan
0,6,69321,.,G,A,,,18.9,True,,...,,58.0,False,.,G,A,,,0.0,False
1,6,73924,.,AAGAGAAAG,AAAAG,A,,44.700001,False,,...,,125.0,False,rs201634483,AAGAG,A,,,0.0,False
2,6,73928,rs11242695,G,A,,,44.700001,False,,...,,,,,,,,,,
3,6,86583,.,G,A,,,152.699997,False,,...,,123.0,False,,,,,,,
4,6,100908,.,C,T,,,15.2,False,,...,,81.0,False,.,C,T,,,0.0,False


In [17]:
# Show only the join key and each caller's FILTER_PASS flag for the first rows.
# Empty cells in the rendered table are missing values left by the outer joins.
m3.head()[["CHROM", "POS", "FILTER_PASS_freebayes", "FILTER_PASS_mutect", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]]

Unnamed: 0,CHROM,POS,FILTER_PASS_freebayes,FILTER_PASS_mutect,FILTER_PASS_vardict,FILTER_PASS_varscan
0,6,69321,True,,False,False
1,6,73924,False,,False,False
2,6,73928,False,,,
3,6,86583,False,,False,
4,6,100908,False,,False,False


In [18]:
# df_merged is a second name for the same DataFrame object as m3 (no copy is made),
# so any in-place change made through df_merged is also visible through m3.
df_merged = m3

In [19]:
# TODO: add a third category ("not reported") instead of folding missing into False?
# After the outer joins, a caller that has no record at a position has a missing
# FILTER_PASS value; here "not reported" is treated as "did not pass".
#
# The filled columns are assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated and does not update the parent frame under pandas Copy-on-Write.
# The explicit astype(bool) keeps the flags as a real boolean dtype.
filter_pass_cols = [
    "FILTER_PASS_freebayes",
    "FILTER_PASS_vardict",
    "FILTER_PASS_mutect",
    "FILTER_PASS_varscan",
]
df_merged[filter_pass_cols] = df_merged[filter_pass_cols].fillna(False).astype(bool)
df_merged[["CHROM", "POS", "FILTER_PASS_freebayes", "FILTER_PASS_mutect", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]].head()

Unnamed: 0,CHROM,POS,FILTER_PASS_freebayes,FILTER_PASS_mutect,FILTER_PASS_vardict,FILTER_PASS_varscan
0,6,69321,True,False,False,False
1,6,73924,False,False,False,False
2,6,73928,False,False,False,False
3,6,86583,False,False,False,False
4,6,100908,False,False,False,False


In [20]:
# Another alias for the same DataFrame object (no copy is made).
# NOTE(review): despite the name, no truth labels are joined in the cells visible
# here - confirm whether a truth-merge step is missing or the name is a leftover.
df_merged_with_truth = df_merged
df_merged_with_truth.head()

Unnamed: 0,CHROM,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,FILTER_PASS_freebayes,ID_mutect,...,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan
0,6,69321,.,G,A,,,18.9,True,,...,,58.0,False,.,G,A,,,0.0,False
1,6,73924,.,AAGAGAAAG,AAAAG,A,,44.700001,False,,...,,125.0,False,rs201634483,AAGAG,A,,,0.0,False
2,6,73928,rs11242695,G,A,,,44.700001,False,,...,,,False,,,,,,,False
3,6,86583,.,G,A,,,152.699997,False,,...,,123.0,False,,,,,,,False
4,6,100908,.,C,T,,,15.2,False,,...,,81.0,False,.,C,T,,,0.0,False


In [21]:
# (rows, columns) of the merged table.
df_merged_with_truth.shape

(3846560, 30)

In [22]:
# Count merged rows by their freebayes FILTER_PASS flag.
df_merged_with_truth["FILTER_PASS_freebayes"].value_counts()

False    3837094
True        9466
Name: FILTER_PASS_freebayes, dtype: int64

In [23]:
# Count merged rows by their Mutect FILTER_PASS flag.
df_merged_with_truth["FILTER_PASS_mutect"].value_counts()

False    3838446
True        8114
Name: FILTER_PASS_mutect, dtype: int64

In [24]:
# Count merged rows by their Vardict FILTER_PASS flag.
df_merged_with_truth["FILTER_PASS_vardict"].value_counts()

False    3822766
True       23794
Name: FILTER_PASS_vardict, dtype: int64

In [25]:
# Count merged rows by their Varscan FILTER_PASS flag.
df_merged_with_truth["FILTER_PASS_varscan"].value_counts()

False    3792766
True       53794
Name: FILTER_PASS_varscan, dtype: int64

In [27]:
# Element-wise "at least one of Varscan / Vardict / freebayes passed".
# On boolean Series `+` is a logical OR (the result is bool, not a vote count) and
# makes pandas warn that it is "evaluating in Python space"; `|` states the intent
# directly and gives the same result without the warning.
(
    df_merged_with_truth["FILTER_PASS_varscan"]
    | df_merged_with_truth["FILTER_PASS_vardict"]
    | df_merged_with_truth["FILTER_PASS_freebayes"]
)

  f"evaluating in Python space because the {repr(op_str)} "


0           True
1          False
2          False
3          False
4          False
           ...  
3846555    False
3846556    False
3846557    False
3846558    False
3846559     True
Length: 3846560, dtype: bool

In [30]:
def predict_label_three_best(row):
    """Combine the four callers' FILTER_PASS flags of one row into a single label.

    Mutect acts as a gate: a row whose ``FILTER_PASS_mutect`` equals ``False`` is
    rejected outright. Otherwise the row is accepted when at least two of the
    freebayes, Vardict and Varscan flags are set.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys ``FILTER_PASS_mutect``,
        ``FILTER_PASS_freebayes``, ``FILTER_PASS_vardict`` and
        ``FILTER_PASS_varscan`` (for example a DataFrame row). The three
        non-Mutect values must be convertible with ``int()``.

    Returns
    -------
    bool
        ``True`` when Mutect passed and at least two other callers passed.
    """
    # Kept as an equality test so that only an explicit False short-circuits.
    if row["FILTER_PASS_mutect"] == False:
        return False
    supporting_callers = (
        "FILTER_PASS_freebayes",
        "FILTER_PASS_vardict",
        "FILTER_PASS_varscan",
    )
    votes = sum(int(row[column]) for column in supporting_callers)
    return votes >= 2

In [31]:
# Vectorised form of the rule implemented row-by-row in predict_label_three_best:
# Mutect must pass AND at least two of freebayes / Vardict / Varscan must pass.
# Column arithmetic avoids a Python-level call per row (apply(axis=1)) on a frame
# with millions of rows; the helper function remains the readable reference.
supporting_cols = ["FILTER_PASS_freebayes", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]
supporting_votes = df_merged_with_truth[supporting_cols].astype(int).sum(axis=1)
df_merged_with_truth["three_best_mutect_true"] = (
    df_merged_with_truth["FILTER_PASS_mutect"].astype(bool) & (supporting_votes >= 2)
)

In [32]:
# Count merged rows by the combined prediction.
df_merged_with_truth["three_best_mutect_true"].value_counts()

False    3845606
True         954
Name: three_best_mutect_true, dtype: int64

In [33]:
# Output path for the predictions.
# NOTE(review): this name is not used by the to_csv call visible in this notebook,
# which writes to "../dataset/real2_part2.bed" - confirm which path is intended.
bed_file_name = "../dataset/real2_part2_threeway_mutect_weighed.bed"

In [36]:
# Keep only the positions predicted True. A single .loc selection plus an
# explicit copy makes real2_preds_df an independent frame, so adding or renaming
# columns on it later cannot act on (or warn about) a view of the merged table.
real2_preds_df = df_merged_with_truth.loc[
    df_merged_with_truth["three_best_mutect_true"], ["CHROM", "POS"]
].copy()

In [38]:
# The interval end is set equal to the position itself (start == end).
# NOTE(review): standard BED intervals are 0-based and half-open while VCF POS is
# 1-based - confirm that start == end == POS is what the downstream consumer expects.
real2_preds_df["POS_END"] = real2_preds_df["POS"]

In [39]:
# Give the three columns their output names (positional rename: CHROM, POS, POS_END).
real2_preds_df.columns = ['CHROM', 'POS_START', 'POS_END']
# Print the (rows, columns) of the prediction table, then preview it.
print(real2_preds_df.shape)
real2_preds_df.head()

(954, 3)


Unnamed: 0,CHROM,POS_START,POS_END
14023,6,6524419,6524419
15375,6,7163472,7163472
15437,6,7255456,7255456
16010,6,7509535,7509535
24774,6,12542457,12542457


In [40]:
# Write the predictions as tab-separated text with no header row and no index column.
real2_preds_df.to_csv("../dataset/real2_part2.bed", sep="\t", index=False, header=False)