# Naïve Bayes

In [19]:
import numpy as np
import pandas as pd
import sklearn.neural_network
import sklearn.model_selection
import sklearn.naive_bayes
from sklearn import preprocessing
import scipy
import pprint
from mixed_naive_bayes import MixedNB

## Load data

In [15]:
X = pd.read_csv("inputs_cleaned.tsv", sep="\t", index_col=0)
y = pd.read_csv("targets_cleaned.tsv", sep="\t", index_col=0)
y

Unnamed: 0,"Recurrence status (1, yes; 0, no)","Survial status (1, dead; 0, alive)",histologic_grade,histologic_type,measure_of_success_of_outcome_at_last_available_follow-up,pathologic_staging_primary_tumor
C3L-00004,0.0,0.0,g3,clear cell renal cell carcinoma,unknown,t3
C3L-00010,0.0,0.0,g3,clear cell renal cell carcinoma,complete remission,t1
C3L-00011,0.0,1.0,g4,clear cell renal cell carcinoma,patient deceased,t3
C3L-00026,0.0,0.0,g3,clear cell renal cell carcinoma,complete remission,t1
C3L-00079,0.0,1.0,g3,clear cell renal cell carcinoma,patient deceased,t3
...,...,...,...,...,...,...
C3N-02582,0.0,1.0,g3,adenocarcinoma,patient deceased,t2
C3N-02586,0.0,1.0,g2,adenocarcinoma,patient deceased,t2
C3N-02587,0.0,0.0,g2,adenocarcinoma,complete remission,t1
C3N-02588,0.0,0.0,g3,adenocarcinoma,complete remission,t2


## Inputs: Cast booleans to ints and one-hot encode categorical features

In [8]:
# Only categorical column is cancer type. We'll one-hot encode it.
cancer_type_one_hot = pd.get_dummies(X["cancer_type"])
X = X.drop(columns="cancer_type")
X = cancer_type_one_hot.join(X)
# Now we just need to cast bools to ints
X.loc[:, X.columns[X.columns.str.startswith("above_reg_line_")]] = X.\
loc[:, X.columns[X.columns.str.startswith("above_reg_line_")]].astype(int)

In [9]:
# split up data into nominal and continuous
X_nominal = X.select_dtypes(include=['int'])
X_continuous = X.select_dtypes(include=['float64'])

# also identify columns of categorical data for mixed naive bayes
categorical = X.columns[X.dtypes == np.uint8]

## Targets: One-hot encode and split into a map of tables so we can do one target at a time

In [24]:
ys = {}
encoders = {}

for col in y.columns:
    data = y[col]
    le = preprocessing.LabelEncoder()
    ys[col] = le.fit_transform(data)
    #print(ys[col].shape)
    encoders[col] = le

(326,)
(326,)
(326,)
(326,)
(326,)
(326,)


## Gaussian Naive Bayes
For continous data 

In [32]:
def naive_gaussian(X, ys):
    
    results = {}
    for target, y in ys.items():
        nbg = sklearn.naive_bayes.GaussianNB()
        #results[target] = 
        results[target] = sklearn.model_selection.cross_validate(nbg, X, y, cv=10)
        
    return results

In [34]:
gaussian_results = naive_gaussian(X_continuous, ys)
print(gaussian_results)

{'Recurrence status (1, yes; 0, no)': {'fit_time': array([0.00265503, 0.0018909 , 0.00170088, 0.00140309, 0.00130105,
       0.00156474, 0.00133705, 0.00123334, 0.00116777, 0.00100899]), 'score_time': array([0.00117397, 0.00157905, 0.00103593, 0.00098491, 0.00086904,
       0.00088239, 0.00088501, 0.00085282, 0.00073433, 0.00072908]), 'test_score': array([0.18181818, 0.18181818, 0.78787879, 0.18181818, 0.15151515,
       0.27272727, 0.1875    , 0.25      , 0.1875    , 0.1875    ])}, 'Survial status (1, dead; 0, alive)': {'fit_time': array([0.00105619, 0.0010128 , 0.00103998, 0.00096369, 0.00100183,
       0.00091314, 0.00089598, 0.00089121, 0.00086784, 0.00081205]), 'score_time': array([0.00073981, 0.00074029, 0.00067091, 0.00066924, 0.00066495,
       0.00065184, 0.00064278, 0.00064421, 0.00058317, 0.0005908 ]), 'test_score': array([0.27272727, 0.3030303 , 0.60606061, 0.21212121, 0.39393939,
       0.78787879, 0.59375   , 0.4375    , 0.40625   , 0.46875   ])}, 'histologic_grade': {'fi

