# Classification Approach

## 1 Imports and Config

### 1.1 Imports

In [None]:
!pip install scikit-allel

import allel



In [None]:
import os
import pickle

import json
import functools
import numpy as np
import pandas as pd
from scipy.stats import mode
import matplotlib.pyplot as plt

import scipy

from google.colab import drive

from sklearn.svm import SVC
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report

In [None]:
# Mount Google Drive in the Colab VM at /content/gdrive.
# NOTE(review): later paths are relative ('gdrive/MyDrive/...'), so they presumably
# rely on the working directory being /content — confirm.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# List the files in the shared data folder (the same folder HOME points to in 1.2).
os.listdir('gdrive/MyDrive/CS4220_Data')

['real1-freebayes.vcf.gz',
 'real1-mutect2.vcf.gz',
 'real1_truth.bed',
 'real1-vardict.vcf.gz',
 'real1-varscan.vcf.gz',
 'syn1_truth.bed',
 'syn1-mutect2.vcf.gz',
 'syn1-freebayes.vcf.gz',
 'syn1-varscan.vcf.gz',
 'syn1-vardict.vcf.gz',
 'syn2_truth.bed',
 'syn2-freebayes.vcf.gz',
 'syn2-mutect2.vcf.gz',
 'syn2-vardict.vcf.gz',
 'syn2-varscan.vcf.gz',
 'syn3_truth.bed',
 'syn3-mutect2.vcf.gz',
 'syn3-freebayes.vcf.gz',
 'syn3-varscan.vcf.gz',
 'syn3-vardict.vcf.gz',
 'syn4_truth.bed',
 'syn4-mutect2.vcf.gz',
 'syn4-freebayes.vcf.gz',
 'syn4-varscan.vcf.gz',
 'syn5_truth.bed',
 'syn4-vardict.vcf.gz',
 'syn5-mutect2.vcf.gz',
 'syn5-freebayes.vcf.gz',
 'syn5-vardict.vcf.gz',
 'syn5-varscan.vcf.gz',
 'real2-mutect2.vcf.gz',
 'real2-freebayes.vcf.gz',
 'real2_truth.bed',
 'real2-varscan.vcf.gz',
 'real2-vardict.vcf.gz',
 'real3-freebayes.vcf.gz',
 'real3-varscan.vcf.gz',
 'real3-vardict.vcf.gz',
 'real3-mutect2.vcf.gz',
 'real3_test.bed',
 'real1-freebayes.csv',
 'real1-mutect2.csv',
 're

### 1.2 Config

In [None]:
# Data folder on the mounted Drive; every VCF, BED and pickle path below is built from it.
HOME = 'gdrive/MyDrive/CS4220_Data'

In [None]:
# Dataset prefixes used in the file names '<dataset>-<method>.vcf.gz'.
datasets = [
    'real1', 'real2', 'real3',
    'syn1', 'syn2', 'syn3', 'syn4', 'syn5',
]
# Variant callers; each one contributes one VCF per dataset.
methods = [
    'freebayes',
    'mutect2',
    'varscan',
    'vardict',
]
# Filled by the loader in 2.1: dataset name -> merged feature table.
dfs = dict()

In [None]:
# VCF fields requested from each caller's file (CHROM and POS are added by the loader).
# Trailing comments list fields that were considered but are currently left out.
cols = {
    'freebayes': [
        'QUAL', 'SOMATIC', 'FILTER_REJECT', 'QR', 'DP', 'FILTER_PASS', 'SRF',
    ],  # 'QA_1','SAF_1','SAR_1'
    'mutect2': [
        'TLOD', 'FILTER_PASS', 'MQ', 'DP', 'NLOD', 'is_snp',
        'FILTER_t_lod_fstar', 'FS', 'ReadPosRankSum', 'MQRankSum',
    ],
    'varscan': [
        'SPV', 'DP', 'SSC', 'is_snp', 'SOMATIC', 'FILTER_REJECT',
        'FILTER_SpvFreq', 'FILTER_PASS', 'SS',
    ],  # 'altlen_1'
    'vardict': [
        'MSI', 'QUAL', 'SSF', 'DP', 'VD', 'FILTER_PASS', 'SOR', 'is_snp', 'SHIFT3',
    ],  # 'AF_1'
}

In [None]:
# Chromosome labels kept by the loader's CHROM filter: '0'..'23' plus 'X' and 'Y'.
# NOTE(review): range(24) also yields '0' and '23'; if only autosomes 1-22 are
# intended this should probably be range(1, 23) — confirm against the VCF contigs.
chroms = list(map(str, range(24))) + ['X', 'Y']

## 2 Get data from files

If you already have the pickle file, skip 2.1 and the save cell in 2.2, and run only the load cell in 2.2.

### 2.1 Load data to dataframe

In [None]:
# Build one merged feature table per dataset and store it in dfs[dataset].
#
# For every caller the requested VCF fields are read, prefixed with the caller
# name, and the four tables are outer-joined on (CHROM, POS). Datasets other
# than 'real3' are additionally labelled from '<dataset>_truth.bed': y = 1 when
# POS matches a truth START_POS on the same chromosome. 'real3' is stored
# unlabelled and keeps CHROM/POS, which the BED export in 4.5 reads.
for dataset in ['real3']:
    temp_dfs = {}
    for method in methods:
        temp_df = allel.vcf_to_dataframe(f'{HOME}/{dataset}-{method}.vcf.gz',
                                         fields=cols[method] + ['CHROM', 'POS'])
        # Missing FILTER_PASS becomes -1 so the column can be stored as int.
        temp_df['FILTER_PASS'] = temp_df['FILTER_PASS'].fillna(-1).astype(int)
        # Prefix feature columns with the caller name; the join keys stay unprefixed.
        temp_df.columns = [col if col in ('CHROM', 'POS') else f'{method}_{col}'
                           for col in temp_df.columns]
        print(f'Read {method} from {dataset}')
        temp_dfs[method] = temp_df
        del temp_df

    # Join order fixes the column order of the feature table (note: vardict before
    # varscan, which differs from the order of `methods`).
    merged = functools.reduce(
        lambda left, right: left.merge(right, how='outer', on=['CHROM', 'POS']),
        [temp_dfs[name] for name in ('freebayes', 'mutect2', 'vardict', 'varscan')],
    )
    del temp_dfs

    if dataset == 'real3':
        combined_df = merged
    else:
        # NOTE(review): header=0 discards the first line of the BED file as a header;
        # confirm the truth files really start with a header row.
        truth_df = pd.read_csv(f'{HOME}/{dataset}_truth.bed', delimiter='\t', header=0,
                               names=['CHROM', 'START_POS', 'END_POS'])
        truth_starts = truth_df[['CHROM', 'START_POS']]
        if dataset == 'real2':
            # real2 is joined on an integer CHROM.
            # NOTE(review): astype('int64') raises if any CHROM is non-numeric — confirm.
            labelled = merged.astype({'CHROM': 'int64'}).merge(
                truth_starts, how='left',
                left_on=['CHROM', 'POS'], right_on=['CHROM', 'START_POS'])
            # Sanity check on one known position.
            print(labelled[labelled['POS'] == 9414323]['CHROM'])
        else:
            labelled = merged.merge(
                truth_starts, how='left',
                left_on=['CHROM', 'POS'], right_on=['CHROM', 'START_POS'])
            labelled = labelled[labelled['CHROM'].isin(chroms)]
        # assign/drop return a new frame, so nothing is written into the filtered
        # slice above (the original in-place writes raised SettingWithCopyWarning).
        combined_df = (
            labelled
            .assign(y=labelled['START_POS'].notna().astype(int))
            .drop(columns=['START_POS', 'CHROM', 'POS'])
        )
    dfs[dataset] = combined_df
    del combined_df

Read freebayes from real3
Read mutect2 from real3
Read varscan from real3
Read vardict from real3


In [None]:
# Work with the merged real3 table built in 2.1 and preview its first rows.
data = dfs['real3']
data.head()

Unnamed: 0,freebayes_QUAL,freebayes_SOMATIC,freebayes_FILTER_REJECT,freebayes_QR,freebayes_DP,freebayes_FILTER_PASS,freebayes_SRF,CHROM,POS,mutect2_TLOD,...,vardict_SHIFT3,varscan_SPV,varscan_DP,varscan_SSC,varscan_is_snp,varscan_SOMATIC,varscan_FILTER_REJECT,varscan_FILTER_SpvFreq,varscan_FILTER_PASS,varscan_SS
0,18.9,True,False,111.0,7.0,1.0,3.0,6,69321,,...,0.0,0.18164,146.0,7.0,True,False,True,False,0.0,1.0
1,44.700001,False,True,-1.0,-1.0,0.0,-1.0,6,73924,,...,2.0,0.56078,24.0,2.0,False,False,True,False,0.0,1.0
2,44.700001,False,True,-1.0,-1.0,0.0,-1.0,6,73928,,...,,,,,,,,,,
3,152.699997,False,True,32.0,14.0,0.0,1.0,6,86583,,...,0.0,,,,,,,,,
4,15.2,False,True,173.0,11.0,0.0,5.0,6,100908,,...,1.0,0.12567,113.0,9.0,True,False,True,False,0.0,1.0


In [None]:
# Class balance of the labels. The loader stores 'real3' without a 'y' column,
# so guard the lookup (the X/y split below uses the same check) instead of
# raising KeyError on unlabelled data.
data['y'].value_counts() if 'y' in data.columns else None

0          6
1          6
2          6
3          6
4          6
          ..
3846555    Y
3846556    Y
3846557    Y
3846558    Y
3846559    Y
Name: CHROM, Length: 3846560, dtype: object

In [None]:
# Features are every column except the label; the label is None when the table is unlabelled.
X = data.drop(columns=['y'], errors='ignore')
y = data.get('y')

### 2.2 Save/Load Data from Pickle

In [None]:
# Cache the feature table and labels on Drive so section 2.1 can be skipped next time.
pickle_data = dict(X=X, y=y)
with open(HOME+'/real3_data.pkl', 'wb') as fp:
    pickle.dump(
        pickle_data,
        fp,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [None]:
# Restore the cached feature table and labels.
# pickle.load can execute arbitrary code: only load files you wrote yourself.
with open(HOME+'/real3_data.pkl', 'rb') as fp:
    obj = pickle.load(fp)
X = obj['X']
y = obj['y']

In [None]:
# Baseline feature set: only each caller's PASS flag.
naive_features = [
    'freebayes_FILTER_PASS',
    'mutect2_FILTER_PASS',
    'varscan_FILTER_PASS',
    'vardict_FILTER_PASS',
]

In [None]:
# The 16 feature columns passed to the model at prediction time (order matters).
imp_features = [
    'mutect2_TLOD',
    'freebayes_SOMATIC',
    'mutect2_FILTER_PASS',
    'varscan_SSC',
    'mutect2_MQ',
    'freebayes_QR',
    'mutect2_DP',
    'mutect2_NLOD',
    'vardict_FILTER_PASS',
    'freebayes_DP',
    'freebayes_SRF',
    'varscan_FILTER_SpvFreq',
    'freebayes_QUAL',
    'mutect2_FILTER_t_lod_fstar',
    'freebayes_FILTER_PASS',
    'mutect2_ReadPosRankSum',
]

In [None]:
# Optional feature selection, left disabled: restricting X here would also drop
# CHROM and POS, which the BED export in 4.5 reads from X.
# X = X[imp_features]
X

Unnamed: 0,freebayes_QUAL,freebayes_SOMATIC,freebayes_FILTER_REJECT,freebayes_QR,freebayes_DP,freebayes_FILTER_PASS,freebayes_SRF,CHROM,POS,mutect2_TLOD,...,vardict_SHIFT3,varscan_SPV,varscan_DP,varscan_SSC,varscan_is_snp,varscan_SOMATIC,varscan_FILTER_REJECT,varscan_FILTER_SpvFreq,varscan_FILTER_PASS,varscan_SS
0,18.900000,1,0,111.0,7.0,1.0,3.0,6,69321,,...,0.0,0.181640,146.0,7.0,True,False,True,False,0.0,1
1,44.700001,0,1,-1.0,-1.0,0.0,-1.0,6,73924,,...,2.0,0.560780,24.0,2.0,False,False,True,False,0.0,1
2,44.700001,0,1,-1.0,-1.0,0.0,-1.0,6,73928,,...,,,,,,,,,,
3,152.699997,0,1,32.0,14.0,0.0,1.0,6,86583,,...,0.0,,,,,,,,,
4,15.200000,0,1,173.0,11.0,0.0,5.0,6,100908,,...,1.0,0.125670,113.0,9.0,True,False,True,False,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3846555,,-1,-1,,,,,Y,59032000,,...,,0.200800,546.0,6.0,True,False,True,False,0.0,1
3846556,,-1,-1,,,,,Y,59032521,,...,,0.000190,383.0,37.0,True,False,True,False,0.0,3
3846557,,-1,-1,,,,,Y,59032848,,...,,0.110420,565.0,9.0,True,False,True,False,0.0,1
3846558,,-1,-1,,,,,Y,59032871,,...,,0.099464,668.0,10.0,True,False,True,False,0.0,1


## 3 Preprocessing

### 3.1 Convert binary columns

In [None]:
# Turn every non-numeric column (booleans, objects with missing values) into
# integers, using -1 for missing. Numeric columns and the three named columns
# are left untouched.
for col in X.columns:
  if np.issubdtype(X[col].dtype, np.number):
    continue
  if col in ('mutect2_TLOD', 'mutect2_NLOD', 'CHROM'):
    continue
  X[col] = X[col].fillna(-1).astype(int)

### 3.2 Remove Inf and NaNs

In [None]:
# Alternative kept for reference (constant imputation through scikit-learn):
# imputer = SimpleImputer(strategy='constant', fill_value=0)
# X = imputer.fit_transform(X)

# Infinite values are first mapped to NaN, then every NaN becomes 0 (both in place).
non_finite = [np.inf, -np.inf]
X.replace(non_finite, np.nan, inplace=True)
X.fillna(value=0, inplace=True)
X.head()

Unnamed: 0,freebayes_QUAL,freebayes_SOMATIC,freebayes_FILTER_REJECT,freebayes_QR,freebayes_DP,freebayes_FILTER_PASS,freebayes_SRF,CHROM,POS,mutect2_TLOD,...,vardict_SHIFT3,varscan_SPV,varscan_DP,varscan_SSC,varscan_is_snp,varscan_SOMATIC,varscan_FILTER_REJECT,varscan_FILTER_SpvFreq,varscan_FILTER_PASS,varscan_SS
0,18.9,1,0,111.0,7.0,1.0,3.0,6,69321,0,...,0.0,0.18164,146.0,7.0,1,0,1,0,0.0,1
1,44.700001,0,1,-1.0,-1.0,0.0,-1.0,6,73924,0,...,2.0,0.56078,24.0,2.0,0,0,1,0,0.0,1
2,44.700001,0,1,-1.0,-1.0,0.0,-1.0,6,73928,0,...,0.0,0.0,0.0,0.0,-1,-1,-1,-1,0.0,-1
3,152.699997,0,1,32.0,14.0,0.0,1.0,6,86583,0,...,0.0,0.0,0.0,0.0,-1,-1,-1,-1,0.0,-1
4,15.2,0,1,173.0,11.0,0.0,5.0,6,100908,0,...,1.0,0.12567,113.0,9.0,1,0,1,0,0.0,1


## 4 Classification Model

### 4.1 Get Classification Model

In [None]:
# Model under test: a random forest with a fixed seed (verbose=1 enables progress output).
clf = RandomForestClassifier(random_state=42, verbose=1)
# Alternative model, currently disabled:
# clf = SVC(verbose=True)

### 4.2 Split data to train/test

In [None]:
def split_data(X, y, train_size=0.75, random_state=42):
    """Split features and labels into stratified train and test parts.

    The split is seeded so that re-running the notebook gives the same
    partition (the original call left it unseeded while the model itself
    was seeded).

    Parameters
    ----------
    X : DataFrame-like
        Feature table; must expose ``.shape``.
    y : pandas.Series
        Labels; also used as the stratification target so both parts keep
        the class ratio. Must expose ``.shape`` and ``.value_counts()``.
    train_size : float, default 0.75
        Fraction of rows placed in the training part.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split``; pass ``None`` for an
        unseeded split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, stratify=y, random_state=random_state
    )
    # Report sizes, then the class counts of each part.
    for part in (X_train, X_test, y_train, y_test):
        print(part.shape)
    for labels in (y_train, y_test):
        print(labels.value_counts().to_dict())
    return (X_train, X_test, y_train, y_test)

In [None]:
# Stratified 75/25 split of the current X and y.
# NOTE(review): split_data stratifies on y and calls y.value_counts(), so this
# presumably only works for labelled datasets (y is None for real3) — confirm.
X_train, X_test, y_train, y_test = split_data(X, y)

(1502832, 16)
(500944, 16)
(1502832,)
(500944,)
{0: 1502486, 1: 346}
{0: 500828, 1: 116}


### 4.3 Fit Model

In [None]:
# Train on the training part; .values.ravel() passes the labels as a flat array.
clf.fit(X_train, y_train.values.ravel())

[LibSVM]

### 4.4 Save/load Model from Pickle

In [None]:
# Persist the fitted model on Drive.
# NOTE(review): the file name says 'syn1_svc' while the active model in 4.1 is a
# RandomForestClassifier, and the load cell below reads 'real1_rf.pkl' — confirm
# which file name is intended.
with open(HOME+'/syn1_svc.pkl', 'wb') as fp:
    pickle.dump(clf, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Replace the in-memory model with a previously saved one.
# pickle.load can execute arbitrary code: only load model files you trust.
# NOTE(review): this reads 'real1_rf.pkl', not the file written by the save cell above.
clf = None
with open(HOME+'/real1_rf.pkl', 'rb') as fp:
    clf = pickle.load(fp)
clf

RandomForestClassifier(random_state=42)

### 4.5 Predict

In [None]:
# Evaluation on the held-out part: prints precision, recall and F1, one per line.
y_pred = clf.predict(X_test)
print(
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    sep='\n',
)

0.9504950495049505
0.8275862068965517
0.8847926267281105


In [None]:
# Evaluation on the full table: prints F1, precision and recall, then the per-class report.
# Note: if clf was fitted on part of this same table, these scores include training rows.
y_pred = clf.predict(X)
print(
    f1_score(y, y_pred),
    precision_score(y, y_pred),
    recall_score(y, y_pred),
    sep='\n',
)

print(classification_report(y, y_pred))

0.8810365135453475
0.9664082687338501
0.8095238095238095
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   2003314
           1       0.97      0.81      0.88       462

    accuracy                           1.00   2003776
   macro avg       0.98      0.90      0.94   2003776
weighted avg       1.00      1.00      1.00   2003776



In [None]:
# Predict for every row using only the imp_features columns.
# NOTE(review): presumably the loaded model was trained on exactly these 16 columns
# in this order — confirm.
y_pred = clf.predict(X[imp_features])

In [None]:
# Build the BED-style result: one row per predicted positive, with POS used as
# both start and end. Rows are selected with .loc and copied, so no column is
# written into a slice of X (the original pattern raised SettingWithCopyWarning).
predicted_positive = np.asarray(y_pred) == 1
res = X.loc[predicted_positive, ['CHROM', 'POS', 'POS']].copy()
res.columns = ['CHROM', 'START_POS', 'END_POS']
res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,CHROM,START_POS,END_POS
14023,6,6524419,6524419
15437,6,7255456,7255456
16010,6,7509535,7509535
27287,6,14028517,14028517
31182,6,16613413,16613413
...,...,...,...
3264954,14,43477063,43477063
3266117,14,85892279,85892279
3301109,19,45132839,45132839
3328223,Y,16587909,16587909


In [None]:
# Write the predicted positions without header or index, space-separated.
# NOTE(review): the path is relative (not under HOME) and the name says 'real2'
# although the table loaded in 2.1 is real3 — confirm the intended file name.
# NOTE(review): the truth BED files are read with a tab delimiter in 2.1, while
# this file is written with spaces — confirm the expected separator.
res.to_csv('real2_part2_predict.bed', header=False, sep=' ', index=False)