# Classification Approach

## 1 Imports and Config

### 1.1 Imports

In [65]:
import os
import allel
import pickle

import json
import functools
import numpy as np
import pandas as pd

import scipy

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report

### 1.2 Config

In [2]:
HOME = "cs4220/ass1"
os.listdir()

['syn2',
 'syn1.zip',
 'real2_part2',
 'syn5',
 'syn1',
 'real2_part1',
 'Notebook_1.ipynb',
 'test',
 'real1.zip',
 'real2_part2.zip',
 'real2_part1.zip',
 'syn4',
 'test.zip',
 'syn3',
 'real1',
 '.ipynb_checkpoints',
 'syn4.zip',
 'syn3.zip',
 'syn2.zip',
 'Classification.ipynb',
 'syn5.zip']

In [16]:
datasets = ['real1', 'syn1', 'syn2', 'syn3', 'syn4', 'syn5']
methods = ['freebayes', 'mutect2', 'varscan', 'vardict']
dfs = {}

## 2. Get data from files

Skip 2.1 and 2.2 if you already have the pickle file.

### 2.1 Load data to dataframe

In [17]:
for dataset in datasets:
    temp_dfs = {}
    for method in methods:
        temp_df = allel.vcf_to_dataframe(dataset+'/'+dataset+'-'+method+'.vcf.gz')
        temp_dfs[method] = temp_df
        print(f'Read {method} from {dataset}')
    merged = temp_dfs['freebayes'][['CHROM', 'POS', 'FILTER_PASS']].merge(temp_dfs['mutect2'][['CHROM', 'POS', 'FILTER_PASS']], 
                                                   how='outer', on=['CHROM', 'POS'], 
                                                   suffixes=['_freebayes', '_mutect2']).merge(temp_dfs['vardict'][['CHROM', 'POS', 'FILTER_PASS']], 
                                                   how='outer', on=['CHROM', 'POS']).merge(temp_dfs['varscan'][['CHROM', 'POS', 'FILTER_PASS']], 
                                                   how='outer', on=['CHROM', 'POS'], suffixes=['_vardict', '_varscan'])
    truth_df = pd.read_csv(dataset+'/'+dataset+'_truth.bed', delimiter='\t', header=0, names=['CHROM', 'START_POS', 'END_POS'])
    combined_df = merged.merge(truth_df[['CHROM', 'START_POS']], how='outer', left_on=['CHROM', 'POS'], right_on=['CHROM', 'START_POS'])
    combined_df['y'] = combined_df['START_POS'].notna()

    combined_df['FILTER_PASS_freebayes'] = combined_df['FILTER_PASS_freebayes'].fillna(-1).astype(int)
    combined_df['FILTER_PASS_mutect2'] = combined_df['FILTER_PASS_mutect2'].fillna(-1).astype(int)
    combined_df['FILTER_PASS_vardict'] = combined_df['FILTER_PASS_vardict'].fillna(-1).astype(int)
    combined_df['FILTER_PASS_varscan'] = combined_df['FILTER_PASS_varscan'].fillna(-1).astype(int)
    combined_df['y_true'] = combined_df['y'].astype(int) 
    dfs[dataset] = combined_df

Read freebayes from real1
Read mutect2 from real1
Read varscan from real1
Read vardict from real1
Read freebayes from syn1
Read mutect2 from syn1
Read varscan from syn1
Read vardict from syn1
Read freebayes from syn2
Read mutect2 from syn2
Read varscan from syn2
Read vardict from syn2
Read freebayes from syn3
Read mutect2 from syn3
Read varscan from syn3
Read vardict from syn3
Read freebayes from syn4
Read mutect2 from syn4
Read varscan from syn4
Read vardict from syn4
Read freebayes from syn5
Read mutect2 from syn5
Read varscan from syn5
Read vardict from syn5


### 2.2 Save to pickle

In [21]:
with open('data.pkl', 'wb') as fp:
    pickle.dump(dfs, fp, protocol=pickle.HIGHEST_PROTOCOL)

### 2.3 Load from pickle

In [22]:
p_data = None
with open('data.pkl', 'rb') as fp:
    p_data = pickle.load(fp)

In [25]:
p_data['real1'].head()

Unnamed: 0,CHROM,POS,FILTER_PASS_freebayes,FILTER_PASS_mutect2,FILTER_PASS_vardict,FILTER_PASS_varscan,START_POS,y,y_true
0,1,10177.0,0,-1,-1,0,,False,0
1,1,10583.0,0,-1,0,0,,False,0
2,1,12783.0,0,-1,0,0,,False,0
3,1,13116.0,0,-1,0,-1,,False,0
4,1,13118.0,0,-1,0,-1,,False,0


In [24]:
p_data.keys()

dict_keys(['real1', 'syn1', 'syn2', 'syn3', 'syn4', 'syn5'])

## 3. Classification 

### 3.1 Config

In [53]:
split_ratio = 0.75
seed = 42
features = ['FILTER_PASS_freebayes', 'FILTER_PASS_mutect2', 'FILTER_PASS_vardict', 'FILTER_PASS_varscan']
target = ['y_true']
df = p_data['real1']

### 3.2 Data preparation

In [50]:
def split_data(df):
    X_train, X_test, y_train, y_test = train_test_split(p_data['real1'][features], p_data['real1'][target], train_size = split_ratio, stratify = p_data['real1'][target])
    print(X_train.shape)
    print(X_test.shape)
    print(y_train.shape)
    print(y_test.shape)
    print(y_train.value_counts().to_dict())
    print(y_test.value_counts().to_dict())
    return(X_train, X_test, y_train, y_test)

In [51]:
X_train, X_test, y_train, y_test = split_data(df)

(4342672, 4)
(1447558, 4)
(4342672, 1)
(1447558, 1)
{(0,): 4341684, (1,): 988}
{(0,): 1447228, (1,): 330}


### 3.3 Classifier

In [55]:
clf = RandomForestClassifier(random_state=seed)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

  clf.fit(X_train, y_train)


In [59]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1447228
           1       0.83      0.80      0.81       330

    accuracy                           1.00   1447558
   macro avg       0.91      0.90      0.91   1447558
weighted avg       1.00      1.00      1.00   1447558



In [63]:
results = {}
for file in p_data.keys():
    df = p_data[file]
    clf = RandomForestClassifier(random_state=seed)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    results[file] = {}
    results[file]['Precision'] = precision_score(y_test, y_pred)
    results[file]['Recall'] = recall_score(y_test, y_pred)
    results[file]['F1 Score'] = f1_score(y_test, y_pred)
print(results)

  clf.fit(X_train, y_train)
  clf.fit(X_train, y_train)
  clf.fit(X_train, y_train)
  clf.fit(X_train, y_train)
  clf.fit(X_train, y_train)
  clf.fit(X_train, y_train)


{'real1': {'Precision': 0.8275862068965517, 'Recall': 0.8, 'F1 Score': 0.8135593220338982}, 'syn1': {'Precision': 0.8275862068965517, 'Recall': 0.8, 'F1 Score': 0.8135593220338982}, 'syn2': {'Precision': 0.8275862068965517, 'Recall': 0.8, 'F1 Score': 0.8135593220338982}, 'syn3': {'Precision': 0.8275862068965517, 'Recall': 0.8, 'F1 Score': 0.8135593220338982}, 'syn4': {'Precision': 0.8275862068965517, 'Recall': 0.8, 'F1 Score': 0.8135593220338982}, 'syn5': {'Precision': 0.8275862068965517, 'Recall': 0.8, 'F1 Score': 0.8135593220338982}}


In [70]:
results

{'real1': {'Precision': 0.8275862068965517,
  'Recall': 0.8,
  'F1 Score': 0.8135593220338982},
 'syn1': {'Precision': 0.8275862068965517,
  'Recall': 0.8,
  'F1 Score': 0.8135593220338982},
 'syn2': {'Precision': 0.8275862068965517,
  'Recall': 0.8,
  'F1 Score': 0.8135593220338982},
 'syn3': {'Precision': 0.8275862068965517,
  'Recall': 0.8,
  'F1 Score': 0.8135593220338982},
 'syn4': {'Precision': 0.8275862068965517,
  'Recall': 0.8,
  'F1 Score': 0.8135593220338982},
 'syn5': {'Precision': 0.8275862068965517,
  'Recall': 0.8,
  'F1 Score': 0.8135593220338982}}