# Naïve Bayes

In [1]:
import numpy as np
import pandas as pd
import sklearn.neural_network
import sklearn.model_selection
import sklearn.naive_bayes
from sklearn import preprocessing
import scipy
import pprint
from mixed_naive_bayes import MixedNB

## Load data

In [2]:
X = pd.read_csv("inputs_cleaned.tsv", sep="\t", index_col=0)
y = pd.read_csv("targets_cleaned.tsv", sep="\t", index_col=0)
y

Unnamed: 0,"Recurrence status (1, yes; 0, no)","Survial status (1, dead; 0, alive)",histologic_grade,histologic_type,success_last_follow-up,tumor_stage
C3L-00004,0.0,0.0,g3,clear cell renal cell carcinoma,unknown,t3
C3L-00010,0.0,0.0,g3,clear cell renal cell carcinoma,complete remission,t1
C3L-00011,0.0,1.0,g4,clear cell renal cell carcinoma,patient deceased,t3
C3L-00026,0.0,0.0,g3,clear cell renal cell carcinoma,complete remission,t1
C3L-00079,0.0,1.0,g3,clear cell renal cell carcinoma,patient deceased,t3
...,...,...,...,...,...,...
C3N-02582,0.0,1.0,g3,adenocarcinoma,patient deceased,t2
C3N-02586,0.0,1.0,g2,adenocarcinoma,patient deceased,t2
C3N-02587,0.0,0.0,g2,adenocarcinoma,complete remission,t1
C3N-02588,0.0,0.0,g3,adenocarcinoma,complete remission,t2


## Inputs: Cast booleans to ints and one-hot encode categorical features

In [3]:
# Only categorical column is cancer type. We'll one-hot encode it.
cancer_type_one_hot = pd.get_dummies(X["cancer_type"])
X = X.drop(columns="cancer_type")
X = cancer_type_one_hot.join(X)
# Now we just need to cast bools to ints
X.loc[:, X.columns[X.columns.str.startswith("above_reg_line_")]] = X.\
loc[:, X.columns[X.columns.str.startswith("above_reg_line_")]].astype(int)

In [27]:
# split up data into nominal and continuous
X_nominal = X.select_dtypes(include=['int'])
X_continuous = X.select_dtypes(include=['float64'])

# also identify columns of categorical data for mixed naive bayes
categorical_cols = X.columns[X.dtypes == np.uint8]
categorical_idxs = [X.columns.get_loc(c) for c in categorical_cols]

[0, 1, 2, 3, 4]

## Targets: One-hot encode and split into a map of tables so we can do one target at a time

In [12]:
ys = {}
encoders = {}

for col in y.columns:
    data = y[col]
    le = preprocessing.LabelEncoder()
    ys[col] = le.fit_transform(data)
    #print(ys[col].shape)
    encoders[col] = le

# Train Models
### Gaussian Naïve Bayes (with continous data)

In [20]:
def naive_gaussian(X, ys):
    
    results = {}
    for target, y in ys.items():
        nbg = sklearn.naive_bayes.GaussianNB()
        results[target] = sklearn.model_selection.cross_validate(nbg, X, y, cv=10)
        
    return results

In [7]:
gaussian_results = naive_gaussian(X_continuous, ys)
print(gaussian_results)

{'Recurrence status (1, yes; 0, no)': {'fit_time': array([0.00289893, 0.00186014, 0.0016849 , 0.00160575, 0.00194502,
       0.00152397, 0.00150013, 0.00145292, 0.00110722, 0.0011301 ]), 'score_time': array([0.00155115, 0.00135612, 0.00109792, 0.00117612, 0.00141501,
       0.00099611, 0.00107908, 0.00092506, 0.00080991, 0.00078988]), 'test_score': array([0.18181818, 0.18181818, 0.78787879, 0.18181818, 0.15151515,
       0.27272727, 0.1875    , 0.25      , 0.1875    , 0.1875    ])}, 'Survial status (1, dead; 0, alive)': {'fit_time': array([0.00121403, 0.00110221, 0.00117183, 0.00078392, 0.00101185,
       0.00095272, 0.00095296, 0.000911  , 0.00128508, 0.0010469 ]), 'score_time': array([0.00083613, 0.00078893, 0.00075531, 0.00065207, 0.0007031 ,
       0.00069022, 0.00068903, 0.00096703, 0.00088286, 0.00064993]), 'test_score': array([0.27272727, 0.3030303 , 0.60606061, 0.21212121, 0.39393939,
       0.78787879, 0.59375   , 0.4375    , 0.40625   , 0.46875   ])}, 'histologic_grade': {'fi



### Multinomial Naïve Bayes (with nominal data)

In [8]:
def naive_multinomial(X, ys):
    
    results = {}
    for target, y in ys.items():
        nbmn = sklearn.naive_bayes.MultinomialNB()
        results[target] = sklearn.model_selection.cross_validate(nbmn, X, y, cv=10)
        
    return results

In [9]:
MNresults = naive_multinomial(X_nominal, ys)
print(MNresults)

{'Recurrence status (1, yes; 0, no)': {'fit_time': array([0.00579286, 0.00190163, 0.00182605, 0.00152612, 0.00145316,
       0.00145483, 0.00144172, 0.00120401, 0.00122619, 0.00120282]), 'score_time': array([0.00154805, 0.00112724, 0.00118566, 0.00094724, 0.00095201,
       0.00093293, 0.00088024, 0.00078893, 0.00081086, 0.00077701]), 'test_score': array([0.78787879, 0.81818182, 0.81818182, 0.81818182, 0.81818182,
       0.78787879, 0.8125    , 0.8125    , 0.8125    , 0.8125    ])}, 'Survial status (1, dead; 0, alive)': {'fit_time': array([0.00122905, 0.00110483, 0.00106192, 0.00106907, 0.00108409,
       0.00103021, 0.00099897, 0.00096703, 0.00096583, 0.00097394]), 'score_time': array([0.00079107, 0.0007    , 0.000705  , 0.0006988 , 0.00070095,
       0.00064373, 0.00064111, 0.00063992, 0.00063992, 0.00064301]), 'test_score': array([0.81818182, 0.81818182, 0.81818182, 0.81818182, 0.81818182,
       0.81818182, 0.84375   , 0.84375   , 0.84375   , 0.8125    ])}, 'histologic_grade': {'fi



### Mixed Naïve Bayes (with both nominal and continous data)

In [29]:
def naive_mixed(X, ys, cat_features):

    results = {}
    for target, y in ys.items():
        nbmixed = MixedNB(categorical_features=cat_features)
        results[target] = sklearn.model_selection.cross_validate(nbmixed, X, y, cv=10)
        
    return results

In [30]:
mixed_results = naive_mixed(X, ys, cat_features=categorical_idxs)

[2 2 2 2 2]
[2 2 2 2 2]
[2 1 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 1 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 1 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 1 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 1 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]
[2 2 2 2 2]


Traceback (most recent call last):
  File "/Users/corbinday/.pyenv/versions/3.9.5/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/corbinday/.pyenv/versions/3.9.5/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 418, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "/Users/corbinday/.pyenv/versions/3.9.5/lib/python3.9/site-packages/mixed_naive_bayes/mixed_naive_bayes.py", line 330, in score
    y_predicted = np.array(self.predict(X))
  File "/Users/corbinday/.pyenv/versions/3.9.5/lib/python3.9/site-packages/mixed_naive_bayes/mixed_naive_bayes.py", line 295, in predict
    probs = self.predict_proba(X, verbose)
  File "/Users/corbinday/.pyenv/versions/3.9.5/lib/python3.9/site-packages/mixed_naive_bayes/mixed_naive_bayes.py", line 259, in predict_proba
    probas = [categorical_posterior[:, X[:, i][:, np.newaxis]]
  File "/Users/corbinday/.pyenv/vers