# Genetic Variant Classifications
----

In [80]:
import pandas as pd
import numpy as np
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy import stats
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import Imputer
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
%matplotlib inline

In [200]:
# clinvar_conflicting.csv
df = pd.read_csv('./data/clinvar_conflicting.csv')
df.shape

  interactivity=interactivity, compiler=compiler, result=result)


(65188, 46)

In [201]:
df.head()

Unnamed: 0,CHROM,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDISDBINCL,CLNDN,...,SIFT,PolyPhen,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,LoFtool,CADD_PHRED,CADD_RAW,BLOSUM62
0,1,955563,G,C,0.0,0.0,0.0,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,,,,,,,0.421,11.39,1.133255,-2.0
1,1,955597,G,T,0.0,0.42418,0.2826,MedGen:CN169374,,not_specified,...,,,,,,,0.421,8.15,0.599088,
2,1,955619,G,C,0.0,0.03475,0.0088,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,,,,,,,0.421,3.288,0.069819,1.0
3,1,957640,C,T,0.0318,0.02016,0.0328,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,,,,,,,0.421,12.56,1.356499,
4,1,976059,C,T,0.0,0.00022,0.001,MedGen:CN169374,,not_specified,...,,,,,,,0.421,17.74,2.234711,


In [202]:
df['CLASS'].value_counts()

0    48754
1    16434
Name: CLASS, dtype: int64

In [203]:
X = df.loc[:, ~df.columns.isin(['CLASS'])]
Y = df['CLASS']

In [120]:
# Train splits
# X_train, X_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.40, random_state=42, stratify=Y)

In [121]:
# Dev and Test splits
# X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)

In [122]:
# rfc = ensemble.RandomForestClassifier(n_estimators=100, class_weight='balanced')

In [123]:
# rfc.fit(X_train, y_train)

### Convert to categorical

In [223]:
convert_cat = []
to_drop = []
unique = None

categorical = X.select_dtypes(include=['object'])
for i in categorical:
    column = categorical[i]
    print(i)
    unique = column.nunique()
    print(unique)
    
    if unique <= 100:
        convert_cat.append(i)
    else:
        to_drop.append(i)

In [224]:
convert_cat

[]

In [225]:
to_drop

[]

In [220]:
# Get dummies - conver to categroical
X = pd.get_dummies(data=X, columns=convert_cat)

In [221]:
X.shape

(65188, 374)

In [207]:
# Remove origional columns
X = X.loc[:, ~X.columns.isin(to_drop)]

### Train / Test splits

In [385]:
# Train splits
X_train, X_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.40, random_state=42, stratify=Y)

In [386]:
# Dev and Test splits
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)

## Random Forrest

In [387]:
rfc = ensemble.RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=-1)

In [388]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [389]:
# Train accuracy
print(f'Train Score: {rfc.score(X_train, y_train)}')
# Dev accuracy
print(f'Dev Score: {rfc.score(X_dev, y_dev)}')

Train Score: 0.9999232971977909
Dev Score: 0.7646111366774045


### Findaing all nans

In [45]:
# nans = lambda X: X[X.isnull().any(axis=1)]

In [115]:
# nans(X)

In [383]:
cols_with_nan = pd.isnull(X).sum() > 0

In [384]:
cols_with_nan

POS                                                      False
AF_ESP                                                   False
AF_EXAC                                                  False
AF_TGP                                                   False
ORIGIN                                                   False
SSR                                                      False
DISTANCE                                                 False
STRAND                                                   False
MOTIF_POS                                                False
MOTIF_SCORE_CHANGE                                       False
LoFtool                                                  False
CADD_PHRED                                               False
CADD_RAW                                                 False
BLOSUM62                                                 False
CHROM_1                                                  False
CHROM_2                                                

In [232]:
# X['CLNDISDBINCL'].value_counts()
# X['CLNDISDBINCL']

# This finds strings
# X['ORIGIN'].dtype == np.dtype('O')

In [327]:
# X['POS']
# X[X['ORIGIN'].isnull()]

In [74]:
# Create our imputer to replace missing values with the mean e.g.
# imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
# imp = imp.fit(X_train)

# Impute our data, then train
# X_train_imp = imp.transform(X_train)

In [None]:
# Find all cols than I am going to drop >= 100
# Drop them
# Run get_dummies
# Impute - Loop through all cols in X, if numerical, use mean, use mode if categorical

In [137]:
# X['CHROM_1'].value_counts()
# X['CHROM_1'].dtype == np.dtype('O')

In [144]:
# Create our imputer to replace missing values with the mean e.g.
# imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
# imp = imp.fit(X[['ORIGIN']])

# Impute our data, then train
# my_imp = imp.transform(X[['ORIGIN']])

In [150]:
# my_imp[65161]

array([1.27286887])

In [148]:
# list(X)

In [227]:
# X['Consequence_3_prime_UTR_variant']

In [379]:
# len(list(X))
# for col in list(X):
# for index, col in enumerate(list(X)):
#     if col == 'POS':
#         print(X[col].dtype == np.dtype('O'))

#     if X[col].dtype == np.dtype('U'):
#         print(col)

#     print(X[col].dtype == np.unit8)
    

# if X[col].dtype == np.float64 -> imputer(mean), if X[col].dtype == np.float64

# def my_function(d):
#   print("Hello ", list(d))

# HERE
d = X.select_dtypes(include=['float64', 'int64'])
# .apply(my_function)
# df.select_dtypes(exclude=['string','object']).apply(your_other_function)

# IMPUTER

In [380]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp = imp.fit(d)

# Impute our data, then train
X[d.columns] = imp.transform(d)

In [367]:
X['MOTIF_SCORE_CHANGE'].value_counts

<bound method IndexOpsMixin.value_counts of 0       -0.080
1       -0.080
2       -0.080
3       -0.080
4       -0.080
5       -0.080
6       -0.080
7       -0.080
8       -0.080
9       -0.080
10      -0.080
11      -0.080
12      -0.080
13      -0.080
14      -0.080
15      -0.080
16      -0.080
17      -0.080
18      -0.080
19      -0.080
20      -0.080
21      -0.080
22      -0.080
23      -0.080
24      -0.080
25      -0.080
26      -0.080
27      -0.080
28      -0.080
29      -0.080
         ...  
65158   -0.080
65159   -0.080
65160   -0.080
65161   -0.080
65162   -0.080
65163   -0.080
65164   -0.080
65165   -0.080
65166   -0.080
65167   -0.080
65168   -0.080
65169   -0.080
65170   -0.080
65171   -0.080
65172   -0.063
65173   -0.080
65174   -0.080
65175   -0.080
65176   -0.080
65177   -0.080
65178   -0.097
65179   -0.080
65180   -0.080
65181   -0.080
65182   -0.080
65183   -0.080
65184   -0.080
65185   -0.080
65186   -0.080
65187   -0.080
Name: MOTIF_SCORE_CHANGE, Length: 65188, 

In [381]:
dd = X.select_dtypes(include=['uint8'])
# X.dtypes

In [382]:
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp = imp.fit(dd)

# Impute our data, then train
X[dd.columns] = imp.transform(dd)

In [322]:
ll = 'AF_ESP'
X[ll].dtype == np.dtype('f')

False

In [326]:
print(np.dtype('c'))

|S1


# Duplicates

In [294]:
X = X.loc[:,~X.columns.duplicated()]

In [303]:
# X['CHROM_16']

In [312]:
X['SIFT_deleterious_low_confidence']

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       0
23       0
24       0
25       0
26       0
27       0
28       0
29       0
        ..
65158    0
65159    0
65160    0
65161    0
65162    0
65163    0
65164    0
65165    0
65166    0
65167    0
65168    0
65169    0
65170    0
65171    0
65172    0
65173    0
65174    0
65175    0
65176    0
65177    0
65178    0
65179    0
65180    0
65181    0
65182    0
65183    0
65184    0
65185    0
65186    0
65187    0
Name: SIFT_deleterious_low_confidence, Length: 65188, dtype: uint8

In [336]:
X

Unnamed: 0,POS,AF_ESP,AF_EXAC,AF_TGP,ORIGIN,SSR,DISTANCE,STRAND,MOTIF_POS,MOTIF_SCORE_CHANGE,...,SIFT_deleterious_low_confidence,SIFT_tolerated,SIFT_tolerated_low_confidence,PolyPhen_benign,PolyPhen_possibly_damaging,PolyPhen_probably_damaging,PolyPhen_unknown,MOTIF_NAME_Egr1:MA0341.1,MOTIF_NAME_FOXA1:MA0546.1,HIGH_INF_POS_N
0,955563,0.0000,0.00000,0.0000,1.0,,,1.0,,,...,0,0,0,0,0,0,0,0,0,0
1,955597,0.0000,0.42418,0.2826,1.0,,,1.0,,,...,0,0,0,0,0,0,0,0,0,0
2,955619,0.0000,0.03475,0.0088,1.0,,,1.0,,,...,0,0,0,0,0,0,0,0,0,0
3,957640,0.0318,0.02016,0.0328,1.0,,,1.0,,,...,0,0,0,0,0,0,0,0,0,0
4,976059,0.0000,0.00022,0.0010,1.0,,,1.0,,,...,0,0,0,0,0,0,0,0,0,0
5,976554,0.0000,0.01494,0.0256,1.0,,,1.0,,,...,0,0,0,0,0,0,0,0,0,0
6,976563,0.0000,0.00135,0.0098,1.0,,,1.0,,,...,0,0,0,0,0,0,0,0,0,0
7,976598,0.0000,0.00626,0.0056,1.0,,,1.0,,,...,0,0,0,0,0,0,0,0,0,0
8,976629,0.0000,0.01004,0.0411,1.0,,,1.0,,,...,0,0,0,0,0,0,0,0,0,0
9,976963,0.0141,0.00461,0.0126,1.0,,,1.0,,,...,0,0,0,0,0,0,0,0,0,0
