# Classification Approach

## 1 Imports and Config

### 1.1 Imports

In [1]:
!pip install scikit-allel

import allel

Collecting scikit-allel
  Downloading scikit_allel-1.3.5-cp37-cp37m-manylinux2010_x86_64.whl (5.7 MB)
[K     |████████████████████████████████| 5.7 MB 5.6 MB/s 
Installing collected packages: scikit-allel
Successfully installed scikit-allel-1.3.5


In [2]:
import os
import pickle

import json
import functools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import scipy

from google.colab import drive

from sklearn.svm import SVC
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report

In [3]:
# Mount Google Drive under /content/gdrive. The data paths used in this notebook are
# relative ('gdrive/...'), so they resolve only while the working directory is /content.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# List the data folder to confirm the Drive mount worked and see which files are available.
os.listdir('gdrive/MyDrive/CS4220_Data')

['real1-freebayes.vcf.gz',
 'real1-mutect2.vcf.gz',
 'real1_truth.bed',
 'real1-vardict.vcf.gz',
 'real1-varscan.vcf.gz',
 'syn1_truth.bed',
 'syn1-mutect2.vcf.gz',
 'syn1-freebayes.vcf.gz',
 'syn1-varscan.vcf.gz',
 'syn1-vardict.vcf.gz',
 'syn2_truth.bed',
 'syn2-freebayes.vcf.gz',
 'syn2-mutect2.vcf.gz',
 'syn2-vardict.vcf.gz',
 'syn2-varscan.vcf.gz',
 'syn3_truth.bed',
 'syn3-mutect2.vcf.gz',
 'syn3-freebayes.vcf.gz',
 'syn3-varscan.vcf.gz',
 'syn3-vardict.vcf.gz',
 'syn4_truth.bed',
 'syn4-mutect2.vcf.gz',
 'syn4-freebayes.vcf.gz',
 'syn4-varscan.vcf.gz',
 'syn5_truth.bed',
 'syn4-vardict.vcf.gz',
 'syn5-mutect2.vcf.gz',
 'syn5-freebayes.vcf.gz',
 'syn5-vardict.vcf.gz',
 'syn5-varscan.vcf.gz',
 'real2-mutect2.vcf.gz',
 'real2-freebayes.vcf.gz',
 'real2_truth.bed',
 'real2-varscan.vcf.gz',
 'real2-vardict.vcf.gz',
 'real3-freebayes.vcf.gz',
 'real3-varscan.vcf.gz',
 'real3-vardict.vcf.gz',
 'real3-mutect2.vcf.gz',
 'real3_test.bed',
 'real1-freebayes.csv',
 'real1-mutect2.csv',
 're

### 1.2 Config

In [5]:
# Folder (relative to the working directory) that holds the VCF/BED inputs and the cached pickle.
HOME = 'gdrive/MyDrive/CS4220_Data'

In [6]:
# Dataset name prefixes as they appear in the file names (e.g. 'syn4-mutect2.vcf.gz').
datasets = ['real1', 'real2', 'real3', 'syn1', 'syn2', 'syn3', 'syn4', 'syn5']
# Variant callers; each dataset has one VCF per caller.
methods = ['freebayes', 'mutect2', 'varscan', 'vardict']
# Filled by the loading loop: dataset name -> merged DataFrame.
dfs = {}

In [7]:
# Per-caller list of VCF fields requested when each caller's VCF is read.
# Names in the trailing comments are fields that are currently left out.
cols = {'freebayes': ['QUAL','SOMATIC','FILTER_REJECT','QR','DP','FILTER_PASS','SRF'], # excluded: 'QA_1','SAF_1','SAR_1'
        'mutect2': ['TLOD','FILTER_PASS','MQ','DP','NLOD','is_snp','FILTER_t_lod_fstar','FS','ReadPosRankSum','MQRankSum'],
        'varscan': ['SPV','DP','SSC','is_snp','SOMATIC','FILTER_REJECT','FILTER_SpvFreq','FILTER_PASS','SS'], # excluded: 'altlen_1'
        'vardict': ['MSI','QUAL','SSF','DP','VD','FILTER_PASS','SOR','is_snp','SHIFT3'] # excluded: 'AF_1'
        }

In [8]:
# Chromosome names kept when filtering calls: autosomes 1-22 plus the sex chromosomes.
# range(1, 23) is used because range(24) also produced the non-existent names '0' and '23'.
chroms = [str(x) for x in range(1, 23)] + ['X', 'Y']

## 2 Get data from files

If the pickle file (`syn4_data.pkl`) already exists in the data folder, skip the VCF loading in 2.1 and 2.2 and run only the cell that reads the pickle.

### 2.1 Load data to dataframe

In [9]:
# Build one merged table per dataset and store it in `dfs[dataset]`.
# For each dataset: read every caller's VCF, prefix its columns with the caller name,
# outer-join the callers on (CHROM, POS), then (when a truth file exists) label each
# position with y = 1 if it appears in the truth BED and y = 0 otherwise.
for dataset in ['syn4']:
    temp_dfs = {}
    for method in methods:
        temp_df = allel.vcf_to_dataframe(f'{HOME}/{dataset}-{method}.vcf.gz',
                                         fields=cols[method] + ['CHROM', 'POS'])
        # Cast FILTER_PASS to int, using -1 where the value is missing.
        temp_df['FILTER_PASS'] = temp_df['FILTER_PASS'].fillna(-1).astype(int)
        # Prefix every feature with its caller so columns stay unique after merging;
        # the join keys CHROM and POS keep their names.
        temp_df.columns = [(f'{method}_{col}' if col not in ['CHROM', 'POS'] else col) for col in temp_df.columns]
        print(f'Read {method} from {dataset}')
        temp_dfs[method] = temp_df
        del temp_df

    # Outer joins keep every position reported by at least one caller.
    # The merge order fixes the column order of the result.
    merged = temp_dfs['freebayes']
    for other in ('mutect2', 'vardict', 'varscan'):
        merged = merged.merge(temp_dfs[other], how='outer', on=['CHROM', 'POS'])
    del temp_dfs  # release the per-caller frames before the next large merge

    if dataset == 'real3':
        # No truth file is read for real3: it is stored unlabelled, with CHROM/POS retained.
        combined_df = merged
    else:
        # CHROM is read as text so it is always comparable with the string chromosome
        # names used in the merge and in `chroms`; with dtype inference the column can
        # come back as integers (or a mix), which never equal the string keys.
        # NOTE(review): header=0 discards the first line of the BED file as a header row;
        # confirm the truth files really start with one, otherwise a truth record is lost.
        truth_df = pd.read_csv(f'{HOME}/{dataset}_truth.bed', delimiter='\t', header=0,
                               names=['CHROM', 'START_POS', 'END_POS'],
                               dtype={'CHROM': str})
        combined_df = merged.merge(truth_df[['CHROM', 'START_POS']], how='left',
                                   left_on=['CHROM', 'POS'], right_on=['CHROM', 'START_POS'])
        # .copy() makes the filtered frame independent, so adding 'y' below no longer
        # raises SettingWithCopyWarning.
        combined_df = combined_df[combined_df['CHROM'].isin(chroms)].copy()
        # A position is a true variant exactly when the left join found a truth row.
        combined_df['y'] = combined_df['START_POS'].notna().astype(int)
        combined_df = combined_df.drop(columns=['START_POS', 'CHROM', 'POS'])
    dfs[dataset] = combined_df
    del combined_df

Read freebayes from syn4
Read mutect2 from syn4
Read varscan from syn4
Read vardict from syn4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [10]:
# Take the merged syn4 table built by the loading loop and preview its first rows.
data = dfs['syn4']
data.head()

Unnamed: 0,freebayes_QUAL,freebayes_SOMATIC,freebayes_FILTER_REJECT,freebayes_QR,freebayes_DP,freebayes_FILTER_PASS,freebayes_SRF,mutect2_TLOD,mutect2_FILTER_PASS,mutect2_MQ,...,varscan_SPV,varscan_DP,varscan_SSC,varscan_is_snp,varscan_SOMATIC,varscan_FILTER_REJECT,varscan_FILTER_SpvFreq,varscan_FILTER_PASS,varscan_SS,y
0,57.5,False,True,-1.0,-1.0,0.0,-1.0,,,,...,,,,,,,,,,0
1,57.5,False,True,-1.0,-1.0,0.0,-1.0,,,,...,,,,,,,,,,0
2,96.699997,False,True,-1.0,-1.0,0.0,-1.0,,,,...,0.7709,22.0,1.0,False,False,True,False,0.0,1.0,0
3,96.699997,False,True,-1.0,-1.0,0.0,-1.0,,,,...,0.95305,15.0,0.0,True,False,True,False,0.0,1.0,0
4,589.200012,False,True,528.0,45.0,0.0,8.0,,,,...,0.84384,36.0,0.0,True,False,True,False,0.0,1.0,0


In [11]:
# Separate the label column from the predictor columns.
y = data['y']
X = data.drop(columns='y')

In [12]:
def split_data(X, y, train_size=0.75, random_state=42):
    """Split features and labels into stratified train and test sets.

    The split is stratified on ``y`` so both parts keep the same class ratio,
    and it is seeded so that repeated runs give the same partition (and hence
    comparable metrics).

    Args:
        X: Feature table.
        y: Label Series aligned with ``X`` (``value_counts`` is called on its parts).
        train_size: Fraction of rows placed in the training set.
        random_state: Seed forwarded to ``train_test_split``; pass ``None`` for
            a different random split on every call.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Side effects:
        Prints the shape of each part, then the class counts of the train and
        test labels.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, stratify=y, random_state=random_state)
    for part in (X_train, X_test, y_train, y_test):
        print(part.shape)
    for labels in (y_train, y_test):
        print(labels.value_counts().to_dict())
    return X_train, X_test, y_train, y_test

In [14]:
# Cache the feature matrix and labels together in one pickle inside the data folder.
pickle_data = dict(X=X, y=y)
with open(HOME+'/syn4_data.pkl', 'wb') as pkl_out:
    pickle.dump(pickle_data, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
# Restore the cached features and labels.
# pickle.load can execute code embedded in the file, so only open pickles you wrote yourself.
with open(HOME+'/syn4_data.pkl', 'rb') as pkl_in:
    obj = pickle.load(pkl_in)
X = obj['X']
y = obj['y']

In [16]:
# Inspect the feature matrix (the notebook shows a truncated view of the frame).
X

Unnamed: 0,freebayes_QUAL,freebayes_SOMATIC,freebayes_FILTER_REJECT,freebayes_QR,freebayes_DP,freebayes_FILTER_PASS,freebayes_SRF,mutect2_TLOD,mutect2_FILTER_PASS,mutect2_MQ,...,vardict_SHIFT3,varscan_SPV,varscan_DP,varscan_SSC,varscan_is_snp,varscan_SOMATIC,varscan_FILTER_REJECT,varscan_FILTER_SpvFreq,varscan_FILTER_PASS,varscan_SS
0,57.500000,False,True,-1.0,-1.0,0.0,-1.0,,,,...,,,,,,,,,,
1,57.500000,False,True,-1.0,-1.0,0.0,-1.0,,,,...,,,,,,,,,,
2,96.699997,False,True,-1.0,-1.0,0.0,-1.0,,,,...,0.0,0.77090,22.0,1.0,False,False,True,False,0.0,1
3,96.699997,False,True,-1.0,-1.0,0.0,-1.0,,,,...,2.0,0.95305,15.0,0.0,True,False,True,False,0.0,1
4,589.200012,False,True,528.0,45.0,0.0,8.0,,,,...,0.0,0.84384,36.0,0.0,True,False,True,False,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5043595,,,,,,,,,,,...,,0.75814,120.0,1.0,True,False,True,False,0.0,1
5043596,,,,,,,,,,,...,,0.79563,102.0,0.0,True,False,True,False,0.0,1
5043597,,,,,,,,,,,...,,0.26755,118.0,5.0,True,False,True,False,0.0,1
5043598,,,,,,,,,,,...,,0.77946,164.0,1.0,True,False,True,False,0.0,1


In [17]:
# Convert every non-numeric column to integers, writing -1 where a value is missing.
# Numeric columns and the two mutect2 LOD columns are left exactly as they are.
passthrough_cols = ['mutect2_TLOD', 'mutect2_NLOD']
for col in X.columns:
    already_numeric = np.issubdtype(X[col].dtype, np.number)
    if already_numeric or col in passthrough_cols:
        continue
    X[col] = X[col].fillna(-1).astype(int)

In [18]:
# Alternative kept for reference: constant-fill imputation through scikit-learn.
# imputer = SimpleImputer(strategy='constant', fill_value=0)
# X = imputer.fit_transform(X)
# Turn +/-inf into NaN first so the fill below covers those cells as well.
X.replace([np.inf, -np.inf], np.nan, inplace = True)
# Fill every remaining missing value with 0. Both calls modify X in place.
X.fillna(0, inplace = True)
X.head()

Unnamed: 0,freebayes_QUAL,freebayes_SOMATIC,freebayes_FILTER_REJECT,freebayes_QR,freebayes_DP,freebayes_FILTER_PASS,freebayes_SRF,mutect2_TLOD,mutect2_FILTER_PASS,mutect2_MQ,...,vardict_SHIFT3,varscan_SPV,varscan_DP,varscan_SSC,varscan_is_snp,varscan_SOMATIC,varscan_FILTER_REJECT,varscan_FILTER_SpvFreq,varscan_FILTER_PASS,varscan_SS
0,57.5,0,1,-1.0,-1.0,0.0,-1.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,-1,-1,-1,-1,0.0,-1
1,57.5,0,1,-1.0,-1.0,0.0,-1.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,-1,-1,-1,-1,0.0,-1
2,96.699997,0,1,-1.0,-1.0,0.0,-1.0,0,0.0,0.0,...,0.0,0.7709,22.0,1.0,0,0,1,0,0.0,1
3,96.699997,0,1,-1.0,-1.0,0.0,-1.0,0,0.0,0.0,...,2.0,0.95305,15.0,0.0,1,0,1,0,0.0,1
4,589.200012,0,1,528.0,45.0,0.0,8.0,0,0.0,0.0,...,0.0,0.84384,36.0,0.0,1,0,1,0,0.0,1


In [19]:
# Model used below: a random forest with a fixed seed.
# The commented-out lines are alternative classifiers that can be swapped in.
clf = RandomForestClassifier(random_state=42)
# clf = GradientBoostingClassifier(random_state=42)
# clf = SVC()

In [20]:
# Stratified train/test split; split_data also prints the shapes and class counts.
X_train, X_test, y_train, y_test = split_data(X, y)

(3727722, 35)
(1242574, 35)
(3727722,)
(1242574,)
{0: 3716179, 1: 11543}
{0: 1238726, 1: 3848}


In [21]:
# Train on the training split; the labels are passed as a flat 1-D NumPy array.
clf.fit(X_train, y_train.values.ravel())

RandomForestClassifier(random_state=42)

In [22]:
# Score the held-out split: prints precision, recall and F1, one value per line.
y_pred = clf.predict(X_test)
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred))

0.9704192992533027
0.8781185031185031
0.9219645293315144
