# Genetic Variant Classifications
---

In [261]:
import pandas as pd
import numpy as np
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy import stats
from imblearn.ensemble import EasyEnsemble
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import Imputer
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
%matplotlib inline

## Import the dataset

In [229]:
df = pd.read_csv('./data/clinvar_conflicting.csv')
df.shape

  interactivity=interactivity, compiler=compiler, result=result)


(65188, 46)

In [230]:
df.head()

Unnamed: 0,CHROM,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDISDBINCL,CLNDN,...,SIFT,PolyPhen,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,LoFtool,CADD_PHRED,CADD_RAW,BLOSUM62
0,1,955563,G,C,0.0,0.0,0.0,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,,,,,,,0.421,11.39,1.133255,-2.0
1,1,955597,G,T,0.0,0.42418,0.2826,MedGen:CN169374,,not_specified,...,,,,,,,0.421,8.15,0.599088,
2,1,955619,G,C,0.0,0.03475,0.0088,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,,,,,,,0.421,3.288,0.069819,1.0
3,1,957640,C,T,0.0318,0.02016,0.0328,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,,,,,,,0.421,12.56,1.356499,
4,1,976059,C,T,0.0,0.00022,0.001,MedGen:CN169374,,not_specified,...,,,,,,,0.421,17.74,2.234711,


### Imbalanced classes

In [231]:
df['CLASS'].value_counts()

0    48754
1    16434
Name: CLASS, dtype: int64

### Data preperation

In [232]:
X = df.loc[:, ~df.columns.isin(['CLASS'])]
Y = df['CLASS']

In [233]:
# Converting to categorical
convert_cat = []
to_drop = []
unique = None

categorical = X.select_dtypes(include=['object'])
for i in categorical:
    column = categorical[i]
    print(i)
    unique = column.nunique()
    print(unique)
    
    # drop cols with too many unique values     
    if unique <= 100:
        convert_cat.append(i)
    else:
        to_drop.append(i)

CHROM
25
REF
866
ALT
458
CLNDISDB
9234
CLNDISDBINCL
48
CLNDN
9260
CLNDNINCL
54
CLNHGVS
65188
CLNSIGINCL
68
CLNVC
7
CLNVI
26289
MC
89
Allele
374
Consequence
48
IMPACT
4
SYMBOL
2328
Feature_type
2
Feature
2369
BIOTYPE
2
EXON
3264
INTRON
1929
cDNA_position
13970
CDS_position
13663
Protein_position
7339
Amino_acids
1262
Codons
2220
BAM_EDIT
2
SIFT
4
PolyPhen
4
MOTIF_NAME
2
HIGH_INF_POS
1


In [234]:
print(f'convert_cat\n------\n{convert_cat} \n')
print(f'to_drop\n------\n{to_drop}')

convert_cat
------
['CHROM', 'CLNDISDBINCL', 'CLNDNINCL', 'CLNSIGINCL', 'CLNVC', 'MC', 'Consequence', 'IMPACT', 'Feature_type', 'BIOTYPE', 'BAM_EDIT', 'SIFT', 'PolyPhen', 'MOTIF_NAME', 'HIGH_INF_POS'] 

to_drop
------
['REF', 'ALT', 'CLNDISDB', 'CLNDN', 'CLNHGVS', 'CLNVI', 'Allele', 'SYMBOL', 'Feature', 'EXON', 'INTRON', 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons']


In [235]:
# Deleting duplicate rows
X = X.loc[:,~X.columns.duplicated()]

# Get dummies - conver to categroical
# dummy_na=True
X = pd.get_dummies(data=X, columns=convert_cat)
# .fillna(np.nan)

# Drop cols with too many unique values
X = X.loc[:, ~X.columns.isin(to_drop)]

### Finding all NaN rows

In [236]:
nans = lambda X: X[X.isnull().any(axis=1)]
len(nans(X))

65188

Every row in the dataset has a null attribute.  Imputing data wiil be a necessity here.

### Imputing data

In [237]:
# For numerical data, impute using mean values
numerical_rows = X.select_dtypes(include=['float64', 'int64'])
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp = imp.fit(numerical_rows)

# Impute our data
X[numerical_rows.columns] = imp.transform(numerical_rows)

In [163]:
# For categorical data, impute using mode values
# categorical_rows = X.select_dtypes(include=['uint8'])
# imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
# imp = imp.fit(categorical_rows)

# Impute our data
# X[categorical_rows.columns] = imp.transform(categorical_rows)

In [164]:
# categorical_rows.columns

In [238]:
nans = lambda X: X[X.isnull().any(axis=1)]
len(nans(X))

0

Now we have zero rows with NaN values.

In [239]:
X

Unnamed: 0,POS,AF_ESP,AF_EXAC,AF_TGP,ORIGIN,SSR,DISTANCE,STRAND,MOTIF_POS,MOTIF_SCORE_CHANGE,...,SIFT_deleterious_low_confidence,SIFT_tolerated,SIFT_tolerated_low_confidence,PolyPhen_benign,PolyPhen_possibly_damaging,PolyPhen_probably_damaging,PolyPhen_unknown,MOTIF_NAME_Egr1:MA0341.1,MOTIF_NAME_FOXA1:MA0546.1,HIGH_INF_POS_N
0,955563.0,0.0000,0.00000,0.0000,1.000000,2.298077,825.731481,1.000000,1.0,-0.080,...,0,0,0,0,0,0,0,0,0,0
1,955597.0,0.0000,0.42418,0.2826,1.000000,2.298077,825.731481,1.000000,1.0,-0.080,...,0,0,0,0,0,0,0,0,0,0
2,955619.0,0.0000,0.03475,0.0088,1.000000,2.298077,825.731481,1.000000,1.0,-0.080,...,0,0,0,0,0,0,0,0,0,0
3,957640.0,0.0318,0.02016,0.0328,1.000000,2.298077,825.731481,1.000000,1.0,-0.080,...,0,0,0,0,0,0,0,0,0,0
4,976059.0,0.0000,0.00022,0.0010,1.000000,2.298077,825.731481,1.000000,1.0,-0.080,...,0,0,0,0,0,0,0,0,0,0
5,976554.0,0.0000,0.01494,0.0256,1.000000,2.298077,825.731481,1.000000,1.0,-0.080,...,0,0,0,0,0,0,0,0,0,0
6,976563.0,0.0000,0.00135,0.0098,1.000000,2.298077,825.731481,1.000000,1.0,-0.080,...,0,0,0,0,0,0,0,0,0,0
7,976598.0,0.0000,0.00626,0.0056,1.000000,2.298077,825.731481,1.000000,1.0,-0.080,...,0,0,0,0,0,0,0,0,0,0
8,976629.0,0.0000,0.01004,0.0411,1.000000,2.298077,825.731481,1.000000,1.0,-0.080,...,0,0,0,0,0,0,0,0,0,0
9,976963.0,0.0141,0.00461,0.0126,1.000000,2.298077,825.731481,1.000000,1.0,-0.080,...,0,0,0,0,0,0,0,0,0,0


In [241]:
cols_with_nan = pd.isnull(X).sum() > 0
print(cols_with_nan)
# print(list(cols_with_nan))

POS                                                      False
AF_ESP                                                   False
AF_EXAC                                                  False
AF_TGP                                                   False
ORIGIN                                                   False
SSR                                                      False
DISTANCE                                                 False
STRAND                                                   False
MOTIF_POS                                                False
MOTIF_SCORE_CHANGE                                       False
LoFtool                                                  False
CADD_PHRED                                               False
CADD_RAW                                                 False
BLOSUM62                                                 False
CHROM_1                                                  False
CHROM_2                                                

In [None]:
# X.loc[X['CLNDISDBINCL_nan'] == 1, ["v1_A", "v1_B"]] = np.nan
# del df1["v1_nan"]

## Train / Test splits

In [243]:
# Train splits
X_train, X_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.40, random_state=42, stratify=Y)

In [244]:
# Dev and Test splits
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)

## Random Forest

In [242]:
rfc = ensemble.RandomForestClassifier(n_estimators=200, class_weight='balanced', n_jobs=-1)

In [245]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

## Evaluation

In [246]:
# Train accuracy
print(f'Train Score: {rfc.score(X_train, y_train)}')
# Dev accuracy
print(f'Dev Score: {rfc.score(X_dev, y_dev)}')

Train Score: 1.0
Dev Score: 0.7659917165209388


In [247]:
y_pred_train = rfc.predict_proba(X_train)[:,1]
auc_train = roc_auc_score(y_train, y_pred_train)
print('AUC train', auc_train)

AUC train 1.0


In [248]:
y_pred_dev = rfc.predict_proba(X_dev)[:,1]
auc_dev = roc_auc_score(y_dev, y_pred_dev)
print('AUC dev', auc_dev)

AUC dev 0.7783134705833297


In [249]:
dev_pred = rfc.predict(X_dev)
dev_table = pd.crosstab(y_dev, dev_pred, margins=True)
print(pd.crosstab(y_dev, dev_pred))

col_0     0    1
CLASS           
0      9101  650
1      2401  886


## Gradient Boosting

In [256]:
# We'll make 500 iterations, use 2-deep trees, and set our loss function.
params = {'n_estimators': 700,
          'max_depth': 3,
          'loss': 'deviance'}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_dev = clf.predict(X_dev)

# Accuracy tables.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_dev = pd.crosstab(y_dev, predict_dev, margins=True)

train_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

dev_tI_errors = table_dev.loc[0.0,1.0]/table_dev.loc['All','All']
dev_tII_errors = table_dev.loc[1.0,0.0]/table_dev.loc['All','All']

print((
    'Training set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}\n\n'
    'Test set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}'
).format(train_tI_errors, train_tII_errors, dev_tI_errors, dev_tII_errors))

Training set accuracy:
Percent Type I errors: 0.03850480670893843
Percent Type II errors: 0.17061259971364287

Test set accuracy:
Percent Type I errors: 0.04578923147722043
Percent Type II errors: 0.186071483356343


In [257]:
clf.score(X_train, y_train)

0.7908825935774187

In [258]:
clf.score(X_dev, y_dev)

0.7681392851664366

In [265]:
# ee = EasyEnsemble(random_state=0)
# X_resampled, y_resampled = ee.fit_sample(X_train, y_train)
# print(X_resampled.shape)

# rfc_2 = ensemble.RandomForestClassifier(n_estimators=300, class_weight='balanced', max_features='log2', n_jobs=-1)

# for x, y in zip(X_resampled, y_resampled):
#     rfc_2.fit(x, y)

(10, 19720, 374)


In [266]:
# # Train accuracy
# print(f'Train Score: {rfc_2.score(X_train, y_train)}')
# # Dev accuracy
# print(f'Dev Score: {rfc_2.score(X_dev, y_dev)}')

Train Score: 0.8299243199018204
Dev Score: 0.6772511121337629
