In [1]:
import pandas as pd
import numpy as np

In [2]:
import allel # ref - https://scikit-allel.readthedocs.io/en/stable/io.html#variant-call-format-vcf

In [3]:
# Folder (relative to the notebook) that holds the caller VCFs and the truth BED.
vcf_folder = "../dataset/"
# One VCF per variant caller for the "chr1to5" part of the real2 sample.
vcf_file_freebayes = "real2_freebayes_chr1to5.vcf.gz"
vcf_file_mutect = "real2_mutect_chr1to5.vcf.gz"
vcf_file_vardict = "real2_vardict_chr1to5.vcf.gz"
vcf_file_varscan = "real2_varscan_chr1to5.vcf.gz"

In [4]:
# Parse the FreeBayes VCF into a DataFrame (no `fields` argument, so scikit-allel picks the columns).
df_freebayes = allel.vcf_to_dataframe(f"{vcf_folder}{vcf_file_freebayes}")

In [5]:
df_freebayes.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,10146,rs779258992,AC,A,,,32.900002,False
1,1,10177,rs367896724,A,AC,,,314.799988,False
2,1,10352,rs555500075;rs145072688,T,TA,,,1229.400024,False
3,1,10623,.,T,C,,,120.5,False
4,1,10629,.,GGC,G,,,120.5,False


In [6]:
# Parse the Mutect VCF into a DataFrame and preview the first rows.
df_mutect = allel.vcf_to_dataframe(f"{vcf_folder}{vcf_file_mutect}")
df_mutect.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,13079,rs78890234,C,G,,,,False
1,1,13110,rs540538026,G,A,,,,False
2,1,14542,rs1045833,A,G,,,,False
3,1,14574,rs28503599,A,G,,,,False
4,1,14815,.,C,T,,,,True


In [7]:
# Parse the VarDict VCF into a DataFrame and preview the first rows.
df_vardict = allel.vcf_to_dataframe(f"{vcf_folder}{vcf_file_vardict}")
df_vardict.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,10175,.,TAAC,T,,,147.0,False
1,1,10177,rs367896724,A,AC,,,179.0,False
2,1,10250,rs199706086,A,C,,,124.0,False
3,1,10291,rs145427775,C,T,,,156.0,True
4,1,10329,rs150969722,AC,A,,,183.0,False


In [8]:
# Parse the VarScan VCF into a DataFrame and preview the first rows.
df_varscan = allel.vcf_to_dataframe(f"{vcf_folder}{vcf_file_varscan}")
df_varscan.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,FILTER_PASS
0,1,10146,rs779258992,AC,A,,,0.0,True
1,1,10165,rs796884232,A,AC,,,0.0,False
2,1,10177,rs367896724,A,AC,,,0.0,False
3,1,10230,rs775928745,AC,A,,,0.0,False
4,1,10234,rs145599635,C,T,,,0.0,False


In [9]:
# Report (rows, columns) for each caller's table, one line per caller.
for caller_label, caller_df in (
    ("freebayes", df_freebayes),
    ("Mutect", df_mutect),
    ("Vardict", df_vardict),
    ("Varscan", df_varscan),
):
    print(f"Real {caller_label} shape = {caller_df.shape}")

Real freebayes shape = (1762508, 9)
Real Mutect shape = (55744, 9)
Real Vardict shape = (1762739, 9)
Real Varscan shape = (1772390, 9)


In [10]:
# Outer-join FreeBayes and Mutect calls on genomic position; the remaining
# (non-key) columns get a per-caller suffix so they stay distinguishable.
m1 = df_freebayes.merge(
    df_mutect,
    how="outer",
    on=["CHROM", "POS"],
    suffixes=("_freebayes", "_mutect"),
)

In [11]:
m1.shape

(1815683, 16)

In [12]:
# Outer-join the VarDict calls onto the FreeBayes+Mutect table. The left side is
# already suffixed, so VarDict's columns arrive unsuffixed and are tagged here.
vardict_column_names = {
    column: f"{column}_vardict"
    for column in ("ID", "REF", "ALT_1", "ALT_2", "ALT_3", "QUAL", "FILTER_PASS")
}
m2 = m1.merge(df_vardict, how="outer", on=["CHROM", "POS"]).rename(columns=vardict_column_names)

In [13]:
m2.head()

Unnamed: 0,CHROM,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,FILTER_PASS_freebayes,ID_mutect,...,ALT_3_mutect,QUAL_mutect,FILTER_PASS_mutect,ID_vardict,REF_vardict,ALT_1_vardict,ALT_2_vardict,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict
0,1,10146,rs779258992,AC,A,,,32.900002,False,,...,,,,,,,,,,
1,1,10177,rs367896724,A,AC,,,314.799988,False,,...,,,,rs367896724,A,AC,,,179.0,False
2,1,10352,rs555500075;rs145072688,T,TA,,,1229.400024,False,,...,,,,rs555500075;rs145072688,T,TA,,,202.0,False
3,1,10623,.,T,C,,,120.5,False,,...,,,,.,T,C,,,113.0,False
4,1,10629,.,GGC,G,,,120.5,False,,...,,,,.,GGC,G,,,109.0,False


In [14]:
m2.shape

(1912772, 23)

In [15]:
# Outer-join the VarScan calls onto the three-caller table and tag VarScan's
# unsuffixed columns, then show the resulting shape.
varscan_column_names = {
    column: f"{column}_varscan"
    for column in ("ID", "REF", "ALT_1", "ALT_2", "ALT_3", "QUAL", "FILTER_PASS")
}
m3 = m2.merge(df_varscan, how="outer", on=["CHROM", "POS"]).rename(columns=varscan_column_names)
m3.shape

(2003776, 30)

In [16]:
m3.head()

Unnamed: 0,CHROM,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,FILTER_PASS_freebayes,ID_mutect,...,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan
0,1,10146,rs779258992,AC,A,,,32.900002,False,,...,,,,rs779258992,AC,A,,,0.0,True
1,1,10177,rs367896724,A,AC,,,314.799988,False,,...,,179.0,False,rs367896724,A,AC,,,0.0,False
2,1,10352,rs555500075;rs145072688,T,TA,,,1229.400024,False,,...,,202.0,False,rs555500075;rs145072688,T,TA,,,0.0,False
3,1,10623,.,T,C,,,120.5,False,,...,,113.0,False,.,T,C,,,0.0,False
4,1,10629,.,GGC,G,,,120.5,False,,...,,109.0,False,.,GGC,G,,,0.0,False


In [17]:
m3.head()[["CHROM", "POS", "FILTER_PASS_freebayes", "FILTER_PASS_mutect", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]]

Unnamed: 0,CHROM,POS,FILTER_PASS_freebayes,FILTER_PASS_mutect,FILTER_PASS_vardict,FILTER_PASS_varscan
0,1,10146,False,,,True
1,1,10177,False,,False,False
2,1,10352,False,,False,False
3,1,10623,False,,False,False
4,1,10629,False,,False,False


In [18]:
# Alias, not a copy: df_merged and m3 are the same DataFrame object, so
# in-place changes made through one name are visible through the other.
df_merged = m3

In [19]:
# A missing FILTER_PASS means the caller has no record at this position (the
# value came from the outer join); it is treated the same as "did not pass".
# Open question from the original author: add a third category instead of False?
#
# The filled columns are assigned back to the frame. Calling
# `df[col].fillna(..., inplace=True)` acts on a temporary Series and does not
# update the frame under pandas Copy-on-Write (and warns on pandas 2.x).
filter_pass_cols = [
    "FILTER_PASS_freebayes",
    "FILTER_PASS_vardict",
    "FILTER_PASS_mutect",
    "FILTER_PASS_varscan",
]
df_merged[filter_pass_cols] = df_merged[filter_pass_cols].fillna(False)
df_merged.head()[["CHROM", "POS", "FILTER_PASS_freebayes", "FILTER_PASS_mutect", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]]

Unnamed: 0,CHROM,POS,FILTER_PASS_freebayes,FILTER_PASS_mutect,FILTER_PASS_vardict,FILTER_PASS_varscan
0,1,10146,False,False,False,True
1,1,10177,False,False,False,False
2,1,10352,False,False,False,False
3,1,10623,False,False,False,False
4,1,10629,False,False,False,False


In [20]:
# Load the headerless, tab-separated truth BED and name its three columns.
truth_bed_path = f"{vcf_folder}real2_truth_chr1to5.bed"
truth = pd.read_csv(truth_bed_path, header=None, sep="\t")
truth.columns = ["CHROM", "POS_START", "POS_END"]
truth.head()

Unnamed: 0,CHROM,POS_START,POS_END
0,1,2171787,2171787
1,1,9414323,9414323
2,1,13852321,13852321
3,1,14995104,14995104
4,1,20818544,20818544


In [21]:
(truth.POS_START == truth.POS_END).sum()

491

In [22]:
truth.shape

(491, 3)

In [24]:
# Cast CHROM to string — presumably so it matches the CHROM dtype of the caller
# tables when the two are joined on it (verify against df_merged.dtypes).
truth["CHROM"] = truth["CHROM"].astype(str)

In [25]:
# Inner join (the default `how`) of truth sites onto the merged caller table.
# Display only: the result is not stored.
truth.merge(df_merged, left_on = ["CHROM", "POS_START"], right_on = ["CHROM", "POS"])
# The result has fewer rows than the truth file, which means that some true
# variants were not reported by any of the 4 variant callers.

Unnamed: 0,CHROM,POS_START,POS_END,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,...,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan
0,1,2171787,2171787,2171787,.,G,A,,,968.900024,...,,178.0,True,.,G,A,,,0.0,False
1,1,9414323,9414323,9414323,,,,,,,...,,,False,,,,,,,False
2,1,13852321,13852321,13852321,rs530448496,C,T,,,104.900002,...,,,False,rs530448496,C,T,,,0.0,True
3,1,14995104,14995104,14995104,.,C,T,,,1435.199951,...,,202.0,True,.,C,T,,,0.0,False
4,1,20818544,20818544,20818544,rs148257007,G,A,,,1494.599976,...,,196.0,True,rs148257007,G,A,,,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458,5,172996949,172996949,172996949,.,G,A,,,1192.400024,...,,196.0,True,.,G,A,,,0.0,False
459,5,173169047,173169047,173169047,.,C,A,,,1185.699951,...,,191.0,True,.,C,A,,,0.0,False
460,5,176198923,176198923,176198923,.,C,T,,,943.400024,...,,177.0,True,.,C,T,,,0.0,False
461,5,178004393,178004393,178004393,.,C,T,,,2065.199951,...,,219.0,True,.,C,T,,,0.0,False


In [26]:
# Outer join keeps every caller row and every truth row; rows without a truth
# match get NaN in POS_START / POS_END (which is why those columns become float).
df_merged_with_truth = truth.merge(
    df_merged,
    how="outer",
    left_on=["CHROM", "POS_START"],
    right_on=["CHROM", "POS"],
)
df_merged_with_truth.head()

Unnamed: 0,CHROM,POS_START,POS_END,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,...,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan
0,1,2171787.0,2171787.0,2171787.0,.,G,A,,,968.900024,...,,178.0,True,.,G,A,,,0.0,False
1,1,9414323.0,9414323.0,9414323.0,,,,,,,...,,,False,,,,,,,False
2,1,13852321.0,13852321.0,13852321.0,rs530448496,C,T,,,104.900002,...,,,False,rs530448496,C,T,,,0.0,True
3,1,14995104.0,14995104.0,14995104.0,.,C,T,,,1435.199951,...,,202.0,True,.,C,T,,,0.0,False
4,1,20818544.0,20818544.0,20818544.0,rs148257007,G,A,,,1494.599976,...,,196.0,True,rs148257007,G,A,,,0.0,False


In [27]:
df_merged_with_truth.shape

(2003804, 32)

In [28]:
def set_true_label(row):
    """Return True when `row` holds a value, False when it is missing.

    Despite the parameter name, `row` is a single cell value (here a
    POS_START entry), not a whole DataFrame row.

    Missingness is decided with `pd.isna`, so NaN, None, pd.NA and NaT all
    count as missing. The previous `str(row) == "nan"` test only recognised
    float NaN and reported None / pd.NA as present.
    """
    return not pd.isna(row)

In [29]:
# A row is a true variant exactly when the outer join found a truth-BED match,
# i.e. POS_START is not missing. `notna()` does this in one vectorized pass
# instead of calling a Python function once per row.
df_merged_with_truth["truth"] = df_merged_with_truth["POS_START"].notna()

In [30]:
# Index labels of rows that exist only on the truth side of the outer join
# (POS comes from the caller tables, so it is NaN when no caller has a record).
uncalled_truth_mask = df_merged_with_truth["POS"].isna()
extras_in_truth = df_merged_with_truth.index[uncalled_truth_mask]
extras_in_truth

Int64Index([ 25,  54,  67,  89,  95, 111, 118, 119, 127, 165, 196, 228, 235,
            240, 261, 314, 315, 322, 323, 371, 384, 426, 442, 443, 444, 446,
            456, 489],
           dtype='int64')

In [31]:
# Remove the truth-only rows collected in `extras_in_truth`.
# NOTE(review): these are true variants that no caller reported; once dropped,
# they are not counted as misses in any metric computed from this frame.
df_merged_with_truth = df_merged_with_truth.drop(index=extras_in_truth)

In [32]:
df_merged_with_truth.isna().sum()

CHROM                          0
POS_START                2003313
POS_END                  2003313
POS                            0
ID_freebayes              241268
REF_freebayes             241268
ALT_1_freebayes           241268
ALT_2_freebayes          1981204
ALT_3_freebayes          2002388
QUAL_freebayes            241269
FILTER_PASS_freebayes          0
ID_mutect                1948009
REF_mutect               1948009
ALT_1_mutect             1948009
ALT_2_mutect             2003776
ALT_3_mutect             2003776
QUAL_mutect              2003776
FILTER_PASS_mutect             0
ID_vardict                232074
REF_vardict               232074
ALT_1_vardict             232074
ALT_2_vardict            2003776
ALT_3_vardict            2003776
QUAL_vardict              233948
FILTER_PASS_vardict            0
ID_varscan                222831
REF_varscan               222831
ALT_1_varscan             222831
ALT_2_varscan            2003776
ALT_3_varscan            2003776
QUAL_varsc

In [33]:
# Mean-impute the two QUAL columns that have usable values. Each filled column
# is assigned back to the frame: `df[col].fillna(..., inplace=True)` acts on a
# temporary Series and does not update the frame under pandas Copy-on-Write
# (and warns on pandas 2.x).
for qual_col in ("QUAL_freebayes", "QUAL_vardict"):
    qual_mean = df_merged_with_truth[qual_col].mean()
    df_merged_with_truth[qual_col] = df_merged_with_truth[qual_col].fillna(qual_mean)

In [34]:
df_merged_with_truth["QUAL_mutect"].isna().sum()

2003776

In [35]:
df_merged_with_truth[["QUAL_freebayes", "QUAL_vardict"]].corr()

Unnamed: 0,QUAL_freebayes,QUAL_vardict
QUAL_freebayes,1.0,0.649869
QUAL_vardict,0.649869,1.0


#### Confusion matrix comparing against truth

In [36]:
from sklearn.metrics import confusion_matrix

In [37]:
# Truth vs. the FreeBayes PASS flag. Pinning `labels` guarantees a 2x2 matrix in
# [False, True] order, so the four-way unpacking still works when a class is absent.
tn, fp, fn, tp = confusion_matrix(
    df_merged_with_truth["truth"].tolist(),
    df_merged_with_truth["FILTER_PASS_freebayes"].tolist(),
    labels=[False, True],
).ravel()
tn, fp, fn, tp

(1999708, 3605, 91, 372)

In [38]:
# Truth vs. the Mutect PASS flag (Mutect has the fewest false positives of the
# four callers). Pinning `labels` guarantees a 2x2 matrix in [False, True]
# order, so the four-way unpacking still works when a class is absent.
tn, fp, fn, tp = confusion_matrix(
    df_merged_with_truth["truth"].tolist(),
    df_merged_with_truth["FILTER_PASS_mutect"].tolist(),
    labels=[False, True],
).ravel()
tn, fp, fn, tp

(1999932, 3381, 25, 438)

In [39]:
# Truth vs. the VarDict PASS flag. Pinning `labels` guarantees a 2x2 matrix in
# [False, True] order, so the four-way unpacking still works when a class is absent.
tn, fp, fn, tp = confusion_matrix(
    df_merged_with_truth["truth"].tolist(),
    df_merged_with_truth["FILTER_PASS_vardict"].tolist(),
    labels=[False, True],
).ravel()
tn, fp, fn, tp

(1993064, 10249, 82, 381)

In [40]:
# Truth vs. the VarScan PASS flag. Pinning `labels` guarantees a 2x2 matrix in
# [False, True] order, so the four-way unpacking still works when a class is absent.
tn, fp, fn, tp = confusion_matrix(
    df_merged_with_truth["truth"].tolist(),
    df_merged_with_truth["FILTER_PASS_varscan"].tolist(),
    labels=[False, True],
).ravel()
tn, fp, fn, tp

(1985391, 17922, 339, 124)

In [41]:
from sklearn.model_selection import train_test_split
# Features are the four callers' PASS flags; the target is the truth label.
# 80/20 split with a fixed seed for repeatability.
# NOTE(review): the split is not stratified although positives are very rare —
# consider passing `stratify=` so both splits keep the same positive rate.
caller_features = df_merged_with_truth[
    ["FILTER_PASS_mutect", "FILTER_PASS_freebayes", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]
]
truth_labels = df_merged_with_truth["truth"]
X_train, X_test, y_train, y_test = train_test_split(
    caller_features, truth_labels, test_size=0.2, random_state=42
)
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(1603020, 4)
(400756, 4)
(1603020,)
(400756,)


In [42]:
from pgmpy.models import BayesianNetwork

In [43]:
# Network structure: each caller's PASS flag is a direct parent of `truth`.
# Candidate parents not currently wired in: CHROM, POS, QUAL_freebayes, QUAL_vardict.
caller_flags = [
    "FILTER_PASS_mutect",
    "FILTER_PASS_freebayes",
    "FILTER_PASS_vardict",
    "FILTER_PASS_varscan",
]
bn = BayesianNetwork([(flag, "truth") for flag in caller_flags])

In [44]:
from pgmpy.estimators import MaximumLikelihoodEstimator

In [45]:
# Training table for the network: the four caller flags plus the truth label,
# aligned on the row index.
train_feature_columns = ["FILTER_PASS_mutect", "FILTER_PASS_freebayes", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]
train_df = X_train.reindex(columns=train_feature_columns).assign(truth=y_train)
print(train_df.shape)
train_df.head()

(1603020, 5)


Unnamed: 0,FILTER_PASS_mutect,FILTER_PASS_freebayes,FILTER_PASS_vardict,FILTER_PASS_varscan,truth
750498,False,False,False,False,False
1683615,False,False,False,False,False
179412,False,False,False,False,False
1221789,False,False,False,False,False
991916,False,False,False,False,False


In [46]:
# Estimate the network's conditional probability tables from the training rows
# using maximum-likelihood estimation.
bn.fit(
    data=train_df,
    estimator=MaximumLikelihoodEstimator
)

In [47]:
print(bn.get_cpds("truth"))

+-----------------------+-----+-----------------------------+
| FILTER_PASS_freebayes | ... | FILTER_PASS_freebayes(True) |
+-----------------------+-----+-----------------------------+
| FILTER_PASS_mutect    | ... | FILTER_PASS_mutect(True)    |
+-----------------------+-----+-----------------------------+
| FILTER_PASS_vardict   | ... | FILTER_PASS_vardict(True)   |
+-----------------------+-----+-----------------------------+
| FILTER_PASS_varscan   | ... | FILTER_PASS_varscan(True)   |
+-----------------------+-----+-----------------------------+
| truth(False)          | ... | 0.33064516129032256         |
+-----------------------+-----+-----------------------------+
| truth(True)           | ... | 0.6693548387096774          |
+-----------------------+-----+-----------------------------+


In [48]:
# Hold-out table: the four caller flags plus the truth label, aligned on the
# row index.
test_feature_columns = ["FILTER_PASS_mutect", "FILTER_PASS_freebayes", "FILTER_PASS_vardict", "FILTER_PASS_varscan"]
test_df = X_test.reindex(columns=test_feature_columns).assign(truth=y_test)
print(test_df.shape)
test_df.head()

(400756, 5)


Unnamed: 0,FILTER_PASS_mutect,FILTER_PASS_freebayes,FILTER_PASS_vardict,FILTER_PASS_varscan,truth
1502033,False,False,False,False,False
97401,False,False,False,False,False
357617,False,False,False,False,False
126488,False,False,False,False,False
1560417,False,False,False,False,False


In [49]:
from pgmpy.inference import VariableElimination
from ipywidgets import FloatProgress

In [50]:
# Inference engine over the fitted network; `get_preds` reads this global.
bn_infer = VariableElimination(bn)

In [51]:
from IPython.display import clear_output

In [52]:
# Answers already computed by `get_preds`, keyed by the evidence values and tied
# to the inference engine that produced them.
_GET_PREDS_CACHE = {"engine": None, "by_evidence": {}}


def get_preds(row):
    """Predict whether the variant in `row` is a true variant.

    Queries the global `bn_infer` engine for the distribution of `truth` given
    the row's four FILTER_PASS_* flags and returns True when the probability
    read from index 1 of the result exceeds 0.5, otherwise False.

    Only a handful of distinct flag combinations can occur, so each answer is
    computed once and reused; the memo is discarded whenever `bn_infer` is
    rebound to a different engine object.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide "FILTER_PASS_freebayes", "FILTER_PASS_mutect",
        "FILTER_PASS_vardict" and "FILTER_PASS_varscan".

    Returns
    -------
    bool
    """
    # Start a fresh memo if the inference engine has been replaced.
    if _GET_PREDS_CACHE["engine"] is not bn_infer:
        _GET_PREDS_CACHE["engine"] = bn_infer
        _GET_PREDS_CACHE["by_evidence"] = {}

    evidence = {
        "FILTER_PASS_freebayes": row["FILTER_PASS_freebayes"],
        "FILTER_PASS_mutect": row["FILTER_PASS_mutect"],
        "FILTER_PASS_vardict": row["FILTER_PASS_vardict"],
        "FILTER_PASS_varscan": row["FILTER_PASS_varscan"],
    }
    evidence_key = tuple(evidence.values())

    answers = _GET_PREDS_CACHE["by_evidence"]
    if evidence_key not in answers:
        prob = bn_infer.query(variables=["truth"], evidence=evidence, show_progress=False)
        # Index 1 is taken to be the truth=True state — assumes the states are
        # ordered [False, True]; TODO confirm against the fitted CPD.
        answers[evidence_key] = bool(prob.values[1] > 0.5)
    return answers[evidence_key]

In [53]:
# Row-wise prediction on the hold-out table: `get_preds` is called once per row.
test_df["preds"] = test_df.apply(get_preds, axis = 1)

#### Metric evaluation

In [54]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [55]:
from sklearn.metrics import classification_report

In [56]:
print(classification_report(test_df["truth"], test_df["preds"]))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00    400673
        True       0.73      0.84      0.78        83

    accuracy                           1.00    400756
   macro avg       0.86      0.92      0.89    400756
weighted avg       1.00      1.00      1.00    400756



#### Predict on real2_part2

In [57]:
# Rebuild the merged caller table for the held-out "rest" files.
# NOTE: this cell rebinds vcf_file_*, df_<caller>, m1-m3 and df_merged, which
# earlier cells bound to the chr1to5 data; re-running those earlier cells after
# this one will see the names changed here.
vcf_folder = "../dataset/"
vcf_file_freebayes = "real2_freebayes_rest.vcf.gz"
vcf_file_mutect = "real2_mutect_rest.vcf.gz"
vcf_file_vardict = "real2_vardict_rest.vcf.gz"
vcf_file_varscan = "real2_varscan_rest.vcf.gz"

df_freebayes = allel.vcf_to_dataframe(vcf_folder + vcf_file_freebayes)
print(f"Real freebayes shape = {df_freebayes.shape}")

df_mutect = allel.vcf_to_dataframe(vcf_folder + vcf_file_mutect)
print(f"Real Mutect shape = {df_mutect.shape}")

df_vardict = allel.vcf_to_dataframe(vcf_folder + vcf_file_vardict)
print(f"Real Vardict shape = {df_vardict.shape}")

df_varscan = allel.vcf_to_dataframe(vcf_folder + vcf_file_varscan)
print(f"Real Varscan shape = {df_varscan.shape}")

# Outer-join the four callers on position. FreeBayes/Mutect columns are
# suffixed by the merge itself; VarDict and VarScan columns arrive unsuffixed
# (no name clash on the left side) and are tagged by the renames below.
m1 = pd.merge(df_freebayes, df_mutect, on = ["CHROM", "POS"], how="outer", suffixes = ("_freebayes", "_mutect"))

m2 = pd.merge(m1, df_vardict, on = ["CHROM", "POS"], how="outer")
m2.rename(columns={"ID": "ID_vardict", "REF": "REF_vardict", "ALT_1": "ALT_1_vardict", "ALT_2": "ALT_2_vardict", \
                  "ALT_3": "ALT_3_vardict", "QUAL": "QUAL_vardict", "FILTER_PASS": "FILTER_PASS_vardict"}, inplace=True)

m3 = pd.merge(m2, df_varscan, on = ["CHROM", "POS"], how="outer")
m3.rename(columns={"ID": "ID_varscan", "REF": "REF_varscan", "ALT_1": "ALT_1_varscan", "ALT_2": "ALT_2_varscan", \
                  "ALT_3": "ALT_3_varscan", "QUAL": "QUAL_varscan", "FILTER_PASS": "FILTER_PASS_varscan"}, inplace=True)

df_merged = m3

# A missing FILTER_PASS means the caller has no record at this position; it is
# treated as "did not pass". (Open question: add a third category instead of False?)
# The filled columns are assigned back to the frame. Calling
# `df[col].fillna(..., inplace=True)` acts on a temporary Series and does not
# update the frame under pandas Copy-on-Write (and warns on pandas 2.x).
filter_pass_cols = [
    "FILTER_PASS_freebayes",
    "FILTER_PASS_vardict",
    "FILTER_PASS_mutect",
    "FILTER_PASS_varscan",
]
df_merged[filter_pass_cols] = df_merged[filter_pass_cols].fillna(False)

# Alias, not a copy: df_merged_test, df_merged and m3 are the same object.
df_merged_test = df_merged
print(df_merged_test.shape)
df_merged_test.head()

Real freebayes shape = (3202219, 9)
Real Mutect shape = (134415, 9)
Real Vardict shape = (3234298, 9)
Real Varscan shape = (3327378, 9)
(3846560, 30)


Unnamed: 0,CHROM,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,FILTER_PASS_freebayes,ID_mutect,...,ALT_3_vardict,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan
0,6,69321,.,G,A,,,18.9,True,,...,,58.0,False,.,G,A,,,0.0,False
1,6,73924,.,AAGAGAAAG,AAAAG,A,,44.700001,False,,...,,125.0,False,rs201634483,AAGAG,A,,,0.0,False
2,6,73928,rs11242695,G,A,,,44.700001,False,,...,,,False,,,,,,,False
3,6,86583,.,G,A,,,152.699997,False,,...,,123.0,False,,,,,,,False
4,6,100908,.,C,T,,,15.2,False,,...,,81.0,False,.,C,T,,,0.0,False


In [58]:
# Row-wise prediction on the held-out table: `get_preds` is called once per row.
df_merged_test["preds"] = df_merged_test.apply(get_preds, axis = 1)

In [59]:
df_merged_test["preds"].value_counts()

False    3845771
True         789
Name: preds, dtype: int64

In [60]:
df_merged_test.head()

Unnamed: 0,CHROM,POS,ID_freebayes,REF_freebayes,ALT_1_freebayes,ALT_2_freebayes,ALT_3_freebayes,QUAL_freebayes,FILTER_PASS_freebayes,ID_mutect,...,QUAL_vardict,FILTER_PASS_vardict,ID_varscan,REF_varscan,ALT_1_varscan,ALT_2_varscan,ALT_3_varscan,QUAL_varscan,FILTER_PASS_varscan,preds
0,6,69321,.,G,A,,,18.9,True,,...,58.0,False,.,G,A,,,0.0,False,False
1,6,73924,.,AAGAGAAAG,AAAAG,A,,44.700001,False,,...,125.0,False,rs201634483,AAGAG,A,,,0.0,False,False
2,6,73928,rs11242695,G,A,,,44.700001,False,,...,,False,,,,,,,False,False
3,6,86583,.,G,A,,,152.699997,False,,...,123.0,False,,,,,,,False,False
4,6,100908,.,C,T,,,15.2,False,,...,81.0,False,.,C,T,,,0.0,False,False


In [61]:
bed_file_name = "../dataset/real2_part2_PGM.bed"

# Keep only the positions predicted as true variants and lay them out as
# CHROM / POS_START / POS_END. The table is built in one `.loc` selection
# followed by rename/assign, so no column is ever assigned onto a
# chained-indexing slice (which triggers SettingWithCopy behaviour).
# NOTE(review): start and end are both set to the VCF POS, mirroring the truth
# BED loaded earlier (POS_START == POS_END on every row). Standard BED is
# 0-based half-open — confirm which convention the consumer of this file expects.
real2_preds_df = (
    df_merged_test.loc[df_merged_test["preds"] == True, ["CHROM", "POS"]]
    .rename(columns={"POS": "POS_START"})
    .assign(POS_END=lambda calls: calls["POS_START"])
)
print(real2_preds_df.shape)
print(real2_preds_df.head())

# Headerless, tab-separated, no index column.
real2_preds_df.to_csv(bed_file_name, sep="\t", index=False, header=False)

(789, 3)
      CHROM  POS_START   POS_END
14023     6    6524419   6524419
15375     6    7163472   7163472
15437     6    7255456   7255456
16010     6    7509535   7509535
24774     6   12542457  12542457
