# Genetic Variant Classifications
---

In [56]:
import pandas as pd
import numpy as np
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy import stats
from imblearn.ensemble import EasyEnsemble
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVR
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
%matplotlib inline

## Import the dataset

In [258]:
df = pd.read_csv('./data/clinvar_conflicting.csv')
df.shape

  interactivity=interactivity, compiler=compiler, result=result)


(65188, 46)

In [259]:
df.head()

Unnamed: 0,CHROM,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDISDBINCL,CLNDN,...,SIFT,PolyPhen,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,LoFtool,CADD_PHRED,CADD_RAW,BLOSUM62
0,1,955563,G,C,0.0,0.0,0.0,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,,,,,,,0.421,11.39,1.133255,-2.0
1,1,955597,G,T,0.0,0.42418,0.2826,MedGen:CN169374,,not_specified,...,,,,,,,0.421,8.15,0.599088,
2,1,955619,G,C,0.0,0.03475,0.0088,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,,,,,,,0.421,3.288,0.069819,1.0
3,1,957640,C,T,0.0318,0.02016,0.0328,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,,,,,,,0.421,12.56,1.356499,
4,1,976059,C,T,0.0,0.00022,0.001,MedGen:CN169374,,not_specified,...,,,,,,,0.421,17.74,2.234711,


### Slightly Imbalanced classes

In [260]:
df['CLASS'].value_counts()

0    48754
1    16434
Name: CLASS, dtype: int64

### Data preperation

In [261]:
X = df.loc[:, ~df.columns.isin(['CLASS'])]

Y = df['CLASS']

In [262]:
# Converting to categorical
convert_cat = []
to_drop = []
unique = None

categorical = X.select_dtypes(include=['object'])
for i in categorical:
    column = categorical[i]
    print(i)
    unique = column.nunique()
    print(unique)
    
    # drop cols with too many unique values     
    if unique <= 200:
        convert_cat.append(i)
    else:
        to_drop.append(i)

CHROM
25
REF
866
ALT
458
CLNDISDB
9234
CLNDISDBINCL
48
CLNDN
9260
CLNDNINCL
54
CLNHGVS
65188
CLNSIGINCL
68
CLNVC
7
CLNVI
26289
MC
89
Allele
374
Consequence
48
IMPACT
4
SYMBOL
2328
Feature_type
2
Feature
2369
BIOTYPE
2
EXON
3264
INTRON
1929
cDNA_position
13970
CDS_position
13663
Protein_position
7339
Amino_acids
1262
Codons
2220
BAM_EDIT
2
SIFT
4
PolyPhen
4
MOTIF_NAME
2
HIGH_INF_POS
1


In [263]:
print(f'convert_cat\n------\n{convert_cat} \n')
print(f'to_drop\n------\n{to_drop}')

convert_cat
------
['CHROM', 'CLNDISDBINCL', 'CLNDNINCL', 'CLNSIGINCL', 'CLNVC', 'MC', 'Consequence', 'IMPACT', 'Feature_type', 'BIOTYPE', 'BAM_EDIT', 'SIFT', 'PolyPhen', 'MOTIF_NAME', 'HIGH_INF_POS'] 

to_drop
------
['REF', 'ALT', 'CLNDISDB', 'CLNDN', 'CLNHGVS', 'CLNVI', 'Allele', 'SYMBOL', 'Feature', 'EXON', 'INTRON', 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons']


### Drop all features with more than 90% NaN's

In [264]:
to_drop_nans = []

for c in convert_cat:
    if X[c].isnull().sum() / X.shape[0] > 0.85:
        print(f'{c}: {X[c].isnull().sum() / X.shape[0]}')
        to_drop_nans.append(c)

CLNDISDBINCL: 0.9988341412529913
CLNDNINCL: 0.9988341412529913
CLNSIGINCL: 0.9988341412529913
MOTIF_NAME: 0.9999693195066577
HIGH_INF_POS: 0.9999693195066577


In [265]:
X = X.loc[:, ~X.columns.isin(to_drop_nans)]

for item in to_drop_nans:
    convert_cat.remove(item)

In [266]:
convert_cat

['CHROM',
 'CLNVC',
 'MC',
 'Consequence',
 'IMPACT',
 'Feature_type',
 'BIOTYPE',
 'BAM_EDIT',
 'SIFT',
 'PolyPhen']

# HERE
-----

In [10]:
# X = X.loc[:,~X.columns.duplicated()]

In [50]:
X = pd.get_dummies(data=X, columns=['SYMBOL']) 

In [52]:
X.loc[df['SYMBOL'].isnull(), X.columns.str.startswith("SYMBOL_")] = np.nan

In [51]:
X.head()

Unnamed: 0,CHROM,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDISDBINCL,CLNDN,...,SYMBOL_ZMPSTE24,SYMBOL_ZMYND11,SYMBOL_ZNF276,SYMBOL_ZNF335,SYMBOL_ZNF41,SYMBOL_ZNF423,SYMBOL_ZNF469,SYMBOL_ZNF674,SYMBOL_ZNF81,SYMBOL_ZSWIM6
0,1,955563,G,C,0.0,0.0,0.0,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,0,0,0,0,0,0,0,0,0,0
1,1,955597,G,T,0.0,0.42418,0.2826,MedGen:CN169374,,not_specified,...,0,0,0,0,0,0,0,0,0,0
2,1,955619,G,C,0.0,0.03475,0.0088,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,0,0,0,0,0,0,0,0,0,0
3,1,957640,C,T,0.0318,0.02016,0.0328,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,0,0,0,0,0,0,0,0,0,0
4,1,976059,C,T,0.0,0.00022,0.001,MedGen:CN169374,,not_specified,...,0,0,0,0,0,0,0,0,0,0


In [33]:
impute_cat = X.loc[:, X.columns.str.startswith("SYMBOL_")]

In [None]:
impute_cat

In [69]:
# # For categorical data, impute using mode values
# categorical_rows = impute_cat
# imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
# imp = imp.fit(categorical_rows)

# # Impute the data
# X[categorical_rows.columns] = imp.transform(categorical_rows)

# HERE
----

### Convert features to categorical - get_dummies

In [267]:
# Deleting duplicate rows
X = X.loc[:,~X.columns.duplicated()]

# Drop cols with too many unique values
X = X.loc[:, ~X.columns.isin(to_drop)]

# Get dummies - conver to categroical
# dummy_na=True
# X = pd.get_dummies(data=X, columns=convert_cat)

# CLNDISDBINCL
X = pd.get_dummies(data=X, columns=convert_cat) 

# X

# X.loc[X['CHROM'].isnull(), X.columns.str.startswith("CHROM_")] = np.nan
# .fillna(np.nan)

In [268]:
X.head()

Unnamed: 0,POS,AF_ESP,AF_EXAC,AF_TGP,ORIGIN,SSR,DISTANCE,STRAND,MOTIF_POS,MOTIF_SCORE_CHANGE,...,BAM_EDIT_FAILED,BAM_EDIT_OK,SIFT_deleterious,SIFT_deleterious_low_confidence,SIFT_tolerated,SIFT_tolerated_low_confidence,PolyPhen_benign,PolyPhen_possibly_damaging,PolyPhen_probably_damaging,PolyPhen_unknown
0,955563,0.0,0.0,0.0,1.0,,,1.0,,,...,0,0,0,0,0,0,0,0,0,0
1,955597,0.0,0.42418,0.2826,1.0,,,1.0,,,...,0,0,0,0,0,0,0,0,0,0
2,955619,0.0,0.03475,0.0088,1.0,,,1.0,,,...,0,0,0,0,0,0,0,0,0,0
3,957640,0.0318,0.02016,0.0328,1.0,,,1.0,,,...,0,0,0,0,0,0,0,0,0,0
4,976059,0.0,0.00022,0.001,1.0,,,1.0,,,...,0,0,0,0,0,0,0,0,0,0


In [270]:
# cols = df.loc[:, df.columns.str.startswith('MC')]

# Return all NaN values
for feature in convert_cat:
    if df[feature].isnull().sum() > 0:
        X.loc[df[feature].isnull(), X.columns.str.startswith(feature + "_")] = np.nan

In [122]:
# X.loc[df['CLNDISDBINCL'].isnull(), X.columns.str.startswith("CLNDISDBINCL_")] = np.nan

In [150]:
# lol = X.loc[:, X.columns.str.startswith("INTRON_")]

In [213]:
# X.loc[:, X.columns.str.startswith("CHROM_")]

In [None]:
# lol.columns

In [159]:
# X['CLNDISDBINCL'].isnull().sum()

65112

In [144]:
X

Unnamed: 0,CHROM,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDISDBINCL,CLNDN,...,INTRON_96/105,INTRON_96/115,INTRON_96/182,INTRON_96/362,INTRON_97/105,INTRON_97/362,INTRON_98/105,INTRON_98/362,INTRON_99/115,INTRON_99/362
0,1,955563,G,C,0.0000,0.00000,0.0000,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,0,0,0,0,0,0,0,0,0,0
1,1,955597,G,T,0.0000,0.42418,0.2826,MedGen:CN169374,,not_specified,...,0,0,0,0,0,0,0,0,0,0
2,1,955619,G,C,0.0000,0.03475,0.0088,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,0,0,0,0,0,0,0,0,0,0
3,1,957640,C,T,0.0318,0.02016,0.0328,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,0,0,0,0,0,0,0,0,0,0
4,1,976059,C,T,0.0000,0.00022,0.0010,MedGen:CN169374,,not_specified,...,0,0,0,0,0,0,0,0,0,0
5,1,976554,C,G,0.0000,0.01494,0.0256,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,0,0,0,0,0,0,0,0,0,0
6,1,976563,C,T,0.0000,0.00135,0.0098,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,0,0,0,0,0,0,0,0,0,0
7,1,976598,C,T,0.0000,0.00626,0.0056,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,0,0,0,0,0,0,0,0,0,0
8,1,976629,C,T,0.0000,0.01004,0.0411,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,0,0,0,0,0,0,0,0,0,0
9,1,976963,A,G,0.0141,0.00461,0.0126,"MedGen:C3808739,OMIM:615120|MedGen:CN169374",,"Myasthenic_syndrome,_congenital,_8|not_specified",...,0,0,0,0,0,0,0,0,0,0


In [214]:
cols_with_nan = pd.isnull(X).sum() > 0
print(cols_with_nan)

POS                                                      False
AF_ESP                                                   False
AF_EXAC                                                  False
AF_TGP                                                   False
ORIGIN                                                    True
SSR                                                       True
CLASS                                                    False
DISTANCE                                                  True
STRAND                                                    True
MOTIF_POS                                                 True
MOTIF_SCORE_CHANGE                                        True
LoFtool                                                   True
CADD_PHRED                                                True
CADD_RAW                                                  True
BLOSUM62                                                  True
CHROM_1                                                

In [31]:
X.dtypes

POS                                        int64
REF                                       object
ALT                                       object
AF_ESP                                   float64
AF_EXAC                                  float64
AF_TGP                                   float64
CLNDISDB                                  object
CLNDISDBINCL                              object
CLNDN                                     object
CLNDNINCL                                 object
CLNSIGINCL                                object
CLNVC                                     object
MC                                        object
ORIGIN                                   float64
SSR                                      float64
CLASS                                      int64
Allele                                    object
Consequence                               object
IMPACT                                    object
SYMBOL                                    object
Feature_type        

### Finding all NaN rows

In [271]:
nans = lambda X: X[X.isnull().any(axis=1)]
len(nans(X))

65188

Every row in the dataset has a null attribute.  Imputing data wiil be a necessity here.

### Imputing data

In [275]:
# numerical_features = pd.DataFrame(X.loc[:, ~X.columns.isin(X.loc[:, X.columns.str.startswith('MC' + "_")])]).append(X.loc[:, ~X.columns.isin(X.loc[:, X.columns.str.startswith('Consequence' + "_")])], ignore_index=True)
cat_feat = []

# cat_feat.append(X.loc[:, X.columns.str.startswith('MC' + "_")])
# cat_feat.append(X.loc[:, X.columns.str.startswith('CHROM' + "_")])

for feature in convert_cat:
#     print(feature)
    for col in X.loc[:, X.columns.str.startswith(feature + "_")].columns:
        cat_feat.append(col)
#     cat_feat.append(X.loc[:, X.columns.str.startswith(feature + "_")])
#     print(X.loc[:, ~X.columns.isin(cat_feat)])
#         numerical_features.append(X.loc[:, ~X.columns.isin(cat_feat)], ignore_index=True)
    
# lol = X.loc[:, X.columns.str.startswith("INTRON_")]
# test_rows = df.loc[:, ~df.columns.isin(lol)]

In [276]:
# X[cat_feat]
numerical_data = X.loc[:, ~X.columns.isin(X[cat_feat])]

In [278]:
numerical_data.columns

Index(['POS', 'AF_ESP', 'AF_EXAC', 'AF_TGP', 'ORIGIN', 'SSR', 'DISTANCE',
       'STRAND', 'MOTIF_POS', 'MOTIF_SCORE_CHANGE', 'LoFtool', 'CADD_PHRED',
       'CADD_RAW', 'BLOSUM62'],
      dtype='object')

# PLOT DISTROS TO SHOW WHY I USED MEDIAN

In [None]:
# PLOT VARIABLE DISTRIBUTIONS - if normal then use mean, otherwise use MEDIAN

In [281]:
# For numerical data, impute using mean values
# numerical_rows = X.select_dtypes(include=['float64', 'int64'])
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
imp = imp.fit(numerical_data)

# Impute our data
X[numerical_data.columns] = imp.transform(numerical_data)

In [273]:
for feature in convert_cat:
    if df[feature].isnull().sum() > 0:
        dummy_features = X.loc[:, X.columns.str.startswith(feature + "_")]
        
        # For categorical data, impute using mode values
        imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
        imp = imp.fit(dummy_features)

        # Impute the data
        X[dummy_features.columns] = imp.transform(dummy_features)

# # For categorical data, impute using mode values
# categorical_rows = X.select_dtypes(include=['uint8'])
# imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
# imp = imp.fit(categorical_rows)

# # Impute the data
# X[categorical_rows.columns] = imp.transform(categorical_rows)

In [274]:
# Ranodm dummy variable with NaN values
X['MC_SO:0001574|splice_acceptor_variant'].isnull().sum()

0

In [None]:
# AFTER IMPUTING - Ranodm dummy variable with NaN values
X['MC_SO:0001574|splice_acceptor_variant'].isnull().sum()

In [282]:
nans = lambda X: X[X.isnull().any(axis=1)]
len(nans(X))

0

Now we have zero rows with NaN values.

### VarianceThreshold

In [283]:
# Removes all low-variance features
def variance_threshold_selector(data, threshold=0.95):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

vt_to_drop = variance_threshold_selector(X)

vt_to_drop.head()

Unnamed: 0,POS,ORIGIN,DISTANCE,STRAND,CADD_PHRED,CADD_RAW,BLOSUM62
0,955563.0,1.0,469.0,1.0,11.39,1.133255,-2.0
1,955597.0,1.0,469.0,1.0,8.15,0.599088,-1.0
2,955619.0,1.0,469.0,1.0,3.288,0.069819,1.0
3,957640.0,1.0,469.0,1.0,12.56,1.356499,-1.0
4,976059.0,1.0,469.0,1.0,17.74,2.234711,-1.0


In [284]:
X = X.loc[:, ~X.columns.isin(vt_to_drop)]

### Display correlation Matrix to identify features that need to be dropped

In [41]:
correlation_matrix = X.corr()
display(correlation_matrix)

Unnamed: 0,POS,AF_ESP,AF_EXAC,AF_TGP,ORIGIN,SSR,CLASS,DISTANCE,STRAND,MOTIF_POS,MOTIF_SCORE_CHANGE,LoFtool,CADD_PHRED,CADD_RAW,BLOSUM62
POS,1.0,0.007003,0.007721,0.004225,0.010502,0.209515,0.012061,0.197942,-0.122896,,-1.0,0.257226,-0.005409,-0.009753,0.022451
AF_ESP,0.007003,1.0,0.85187,0.807741,-0.010724,-0.082302,-0.126538,0.295624,-0.011551,,,0.027343,-0.164169,-0.148118,0.039367
AF_EXAC,0.007721,0.85187,1.0,0.805634,-0.010286,0.098809,-0.121213,0.29184,-0.013358,,,0.027831,-0.154891,-0.140803,0.038996
AF_TGP,0.004225,0.807741,0.805634,1.0,-0.011208,-0.12211,-0.130077,0.040676,-0.014002,,,0.029447,-0.167057,-0.150989,0.043167
ORIGIN,0.010502,-0.010724,-0.010286,-0.011208,1.0,0.009383,0.00292,,-0.009731,,,-0.023043,0.051741,0.057404,-0.00092
SSR,0.209515,-0.082302,0.098809,-0.12211,0.009383,1.0,0.091792,,0.121555,,,0.180075,0.08985,0.070648,0.016879
CLASS,0.012061,-0.126538,-0.121213,-0.130077,0.00292,0.091792,1.0,0.036865,-0.031369,,,4.5e-05,-0.037713,-0.053485,0.001593
DISTANCE,0.197942,0.295624,0.29184,0.040676,,,0.036865,1.0,0.067786,,,0.037794,-0.060623,-0.030079,
STRAND,-0.122896,-0.011551,-0.013358,-0.014002,-0.009731,0.121555,-0.031369,0.067786,1.0,,-1.0,-0.16017,0.014463,0.009622,0.015452
MOTIF_POS,,,,,,,,,,,,,,,


In [42]:
# Correlated features to be dropped
upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
print(to_drop)

['CADD_PHRED', 'CADD_RAW']


In [31]:
cols = list(X.columns)

for col in to_drop: 
    cols.remove(col)

len(cols)

X = X[cols]

## Train / Test splits

In [32]:
# Train splits
X_train, X_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.40, random_state=42, stratify=Y)

In [33]:
# Dev and Test splits
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)

## Random Forest

In [34]:
rfc = ensemble.RandomForestClassifier(n_estimators=200, class_weight='balanced', n_jobs=-1)

In [35]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

## Evaluation

In [36]:
# Train accuracy
print(f'Train Score: {rfc.score(X_train, y_train)}')
# Dev accuracy
print(f'Dev Score: {rfc.score(X_dev, y_dev)}')

Train Score: 0.9514471262016773
Dev Score: 0.7472771897530296


In [37]:
y_pred_train = rfc.predict_proba(X_train)[:,1]
auc_train = roc_auc_score(y_train, y_pred_train)
print('AUC train', auc_train)

AUC train 0.9849853126320102


In [38]:
y_pred_dev = rfc.predict_proba(X_dev)[:,1]
auc_dev = roc_auc_score(y_dev, y_pred_dev)
print('AUC dev', auc_dev)

AUC dev 0.7483362498341343


In [39]:
dev_pred = rfc.predict(X_dev)
dev_table = pd.crosstab(y_dev, dev_pred, margins=True)
print(pd.crosstab(y_dev, dev_pred))

col_0     0     1
CLASS            
0      8485  1266
1      2029  1258


In [40]:
rf_acc = cross_val_score(rfc, X_train, y_train, cv=5)

In [44]:
print(f'Cross Val: {rf_acc}')
print(f'Average: {np.mean(rf_acc)}')
# Lower bound STD
print('Lower bound STD: ', np.round(rf_acc.mean() - 2 * rf_acc.std(), 2))
# Uppder bound STD
print('Upper bound STD: ', np.round(rf_acc.mean() + 2 * rf_acc.std(), 2))

Cross Val: [0.74306532 0.75265244 0.75031961 0.74558936 0.74405523]
Average: 0.7471363917730673
Lower bound STD:  0.74
Upper bound STD:  0.75


## Feature selection

In [1]:
# estimator = SVR(kernel="linear")
# selector = RFE(estimator, step=1)
# selector = selector.fit(X_train, y_train)

## Gradient Boosting

In [332]:
# We'll make 500 iterations, use 2-deep trees, and set our loss function.
params = {'n_estimators': 700,
          'max_depth': 3,
          'loss': 'deviance'}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

predict_train = clf.predict(X_train)
predict_dev = clf.predict(X_dev)

# Accuracy tables.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_dev = pd.crosstab(y_dev, predict_dev, margins=True)

train_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

dev_tI_errors = table_dev.loc[0.0,1.0]/table_dev.loc['All','All']
dev_tII_errors = table_dev.loc[1.0,0.0]/table_dev.loc['All','All']

print((
    'Training set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}\n\n'
    'Test set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}'
).format(train_tI_errors, train_tII_errors, dev_tI_errors, dev_tII_errors))

Training set accuracy:
Percent Type I errors: 0.041905297606872574
Percent Type II errors: 0.1760840662712211

Test set accuracy:
Percent Type I errors: 0.04701641356036202
Percent Type II errors: 0.18768215984046632


In [333]:
clf.score(X_train, y_train)

0.7820106361219064

In [334]:
clf.score(X_dev, y_dev)

0.7653014265991717

## Logistic Regression

In [48]:
lr = LogisticRegression(C=1e5)

# Fit the model.
fit = lr.fit(X_train, y_train)

pred_y_sklearn = lr.predict(X_dev)

p_sklearn = np.where(pred_y_sklearn < .5, 0, 1)

print('\n Accuracy by admission status')
print(pd.crosstab(y_dev, p_sklearn))

print('\n Percentage accuracy Train')
print(lr.score(X_train, y_train))

print('\n Percentage accuracy')
print(lr.score(X_dev, y_dev))


 Accuracy by admission status
col_0     0   1
CLASS          
0      9714  37
1      3253  34

 Percentage accuracy Train
0.7492329719779096

 Percentage accuracy
0.7476606841540113


## Imbalanced

In [335]:
ee = EasyEnsemble(random_state=0)
X_resampled, y_resampled = ee.fit_sample(X_train, y_train)
print(X_resampled.shape)

rfc_2 = ensemble.RandomForestClassifier(n_estimators=200, class_weight='balanced', max_features='log2', n_jobs=-1)

for x, y in zip(X_resampled, y_resampled):
    rfc_2.fit(x, y)

(10, 19720, 261)


In [336]:
# Train accuracy
print(f'Train Score: {rfc_2.score(X_train, y_train)}')
# Dev accuracy
print(f'Dev Score: {rfc_2.score(X_dev, y_dev)}')

Train Score: 0.8107997545510329
Dev Score: 0.6818530449455438


In [21]:
# X.dtypes

In [20]:
# X[numerical_rows.columns]

In [46]:
# list(X.columns)