# Naïve Bayes

In [3]:
import altair as alt
import numpy as np
import pandas as pd
import sklearn.neural_network
import sklearn.model_selection
import sklearn.naive_bayes
from sklearn import preprocessing
import scipy
import pprint
from mixed_naive_bayes import MixedNB

## Load data

In [5]:
cancer_types = [
    "ccrcc",
    "endometrial",
    "hnscc",
    "lscc",
    "luad",
]

In [6]:
inputs = {}
targets = {}
multiclass = {}
binary = {}

for cancer in cancer_types:
    inputs[cancer] = pd.read_csv(f'clean_data/{cancer}_inputs.tsv', sep="\t", index_col=0)
    targets[cancer] = pd.read_csv(f'clean_data/{cancer}_targets.tsv', sep="\t", index_col=0)
    multiclass[cancer] = targets[cancer].drop(columns = ['recurrence_status', 'survival_status'])
    binary[cancer] = targets[cancer].drop(columns = ['histologic_grade', 'histologic_type', 'success_last_follow-up', 'tumor_stage'])

## Targets: Encode and split into a map of tables so we can do one target at a time

In [7]:
target_cols = {}
for cancer in cancer_types:
    encoders = {}

    # multiclass
    ys_multi = {}
    for col in multiclass[cancer].columns:
        data = multiclass[cancer][col]
        le = preprocessing.LabelEncoder()
        ys_multi[col] = le.fit_transform(data)
        #print(col, np.unique(ys_multi[col]))
        encoders[col] = le

    # binary
    ys_binary = {}
    for col in binary[cancer].columns:
        data = binary[cancer][col]
        le = preprocessing.LabelEncoder()
        ys_binary[col] = le.fit_transform(data)
        #print(col, np.unique(ys_binary[col]))
        encoders[col] = le

    ys_all = ys_multi | ys_binary
    
    target_cols[cancer] = {'ys_multi':ys_multi, 'ys_binary': ys_binary, 'ys_all': ys_all,}
    target_cols[cancer]['ys_multi'] = ys_multi
    target_cols[cancer]['ys_binary'] = ys_binary
    target_cols[cancer]['ys_all'] = ys_all
    target_cols[cancer]['encoders'] = encoders

KeyError: 'ccrcc'

# Train Models on Naïve Bayes

In [118]:
def naive_bayes(X, ys, model, avg='samples'):
    
    results = {}
    for target, y in ys.items():
        results[target] = sklearn.model_selection.cross_validate(
            model,
            X,
            y,
            cv=10,
            scoring={
                "accuracy": "accuracy",
                "precision": sklearn.metrics.make_scorer(
                    sklearn.metrics.precision_score,
                    average=avg,
                    zero_division=0
                ),
                "recall": sklearn.metrics.make_scorer(
                    sklearn.metrics.recall_score,
                    average=avg,
                    zero_division=0
                ),
            },
            n_jobs=-1,
            error_score="raise",
        )

    print(results)

    scores = pd.DataFrame(results).\
    T[["test_accuracy", "test_precision", "test_recall"]].\
    applymap(np.mean).T
    
    scores.index.name = "metric"
    chart_df = scores.reset_index().melt(
        id_vars="metric",
        var_name="target",
        value_name="score",
    )
    
    chart = alt.Chart(chart_df).mark_bar().encode(
        x="target",
        y=alt.Y(
            "score",
            scale=alt.Scale(
                domain=[0, 1]
            )
        ),
        color="target",
        column="metric"
    )
    
    scores.columns.name = "target"
    scores.index.name = None
    scores = scores.T.sort_index()
        
    return scores, chart

### Gaussian Naïve Bayes (with continous data)

In [108]:
nbg = sklearn.naive_bayes.GaussianNB()

In [129]:
multi_gaussian_scores, multi_gaussian_chart = naive_bayes(X_continuous, ys_all, nbg, avg='micro')
multi_gaussian_scores.to_latex()



{'histologic_grade': {'fit_time': array([0.0018189 , 0.001333  , 0.00128508, 0.00138116, 0.00129104,
       0.00181985, 0.00148296, 0.00356913, 0.00127387, 0.00128198]), 'score_time': array([0.00475788, 0.00139284, 0.00135112, 0.00132298, 0.0029788 ,
       0.001791  , 0.00143099, 0.00213981, 0.00127578, 0.00127482]), 'test_accuracy': array([0.03030303, 0.09090909, 0.48484848, 0.3030303 , 0.60606061,
       0.51515152, 0.4375    , 0.3125    , 0.25      , 0.46875   ]), 'test_precision': array([0.03030303, 0.09090909, 0.48484848, 0.3030303 , 0.60606061,
       0.51515152, 0.4375    , 0.3125    , 0.25      , 0.46875   ]), 'test_recall': array([0.03030303, 0.09090909, 0.48484848, 0.3030303 , 0.60606061,
       0.51515152, 0.4375    , 0.3125    , 0.25      , 0.46875   ])}, 'histologic_type': {'fit_time': array([0.00152302, 0.00147104, 0.00145411, 0.00137615, 0.00137401,
       0.00222516, 0.00149703, 0.00178599, 0.00150323, 0.00139713]), 'score_time': array([0.00148296, 0.00148511, 0.001455



'\\begin{tabular}{lrrr}\n\\toprule\n{} &  test\\_accuracy &  test\\_precision &  test\\_recall \\\\\ntarget                 &                &                 &              \\\\\n\\midrule\nhistologic\\_grade       &       0.349905 &        0.349905 &     0.349905 \\\\\nhistologic\\_type        &       0.477557 &        0.477557 &     0.477557 \\\\\nrecurrence\\_status      &       0.257008 &        0.257008 &     0.257008 \\\\\nsuccess\\_last\\_follow-up &       0.143466 &        0.143466 &     0.143466 \\\\\nsurvival\\_status        &       0.448201 &        0.448201 &     0.448201 \\\\\ntumor\\_stage            &       0.399716 &        0.399716 &     0.399716 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [122]:
multi_gaussian_chart

In [120]:
binary_gaussian_scores, binary_gaussian_chart = naive_bayes(X_continuous, ys_binary, nbg, 'micro')
binary_gaussian_scores

{'recurrence_status': {'fit_time': array([0.00757217, 0.00820398, 0.00404406, 0.01669407, 0.01579213,
       0.01568794, 0.00184584, 0.001652  , 0.00125694, 0.00115609]), 'score_time': array([0.00220394, 0.00240278, 0.00570965, 0.00160384, 0.00562572,
       0.01322007, 0.00267816, 0.00329494, 0.00130606, 0.00133586]), 'test_accuracy': array([0.18181818, 0.18181818, 0.78787879, 0.18181818, 0.15151515,
       0.27272727, 0.1875    , 0.25      , 0.1875    , 0.1875    ]), 'test_precision': array([0.18181818, 0.18181818, 0.78787879, 0.18181818, 0.15151515,
       0.27272727, 0.1875    , 0.25      , 0.1875    , 0.1875    ]), 'test_recall': array([0.18181818, 0.18181818, 0.78787879, 0.18181818, 0.15151515,
       0.27272727, 0.1875    , 0.25      , 0.1875    , 0.1875    ])}, 'survival_status': {'fit_time': array([0.00735283, 0.00126481, 0.00180101, 0.00115395, 0.00204611,
       0.00156498, 0.00483727, 0.00133014, 0.00166607, 0.00120282]), 'score_time': array([0.00184321, 0.00132203, 0.00116

Unnamed: 0_level_0,test_accuracy,test_precision,test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
recurrence_status,0.257008,0.257008,0.257008
survival_status,0.448201,0.448201,0.448201


In [None]:
binary_gaussian_chart

### Multinomial Naïve Bayes (with nominal data)

In [None]:
def naive_multinomial(X, ys):
    
    results = {}
    for target, y in ys.items():
        nbmn = sklearn.naive_bayes.MultinomialNB()
        results[target] = sklearn.model_selection.cross_validate(nbmn, X, y, cv=10)
        
    return results

In [127]:
nbmn = sklearn.naive_bayes.MultinomialNB()
mn_scores, mn_chart = naive_bayes(X_nominal, ys_all, nbmn, avg='micro')

{'histologic_grade': {'fit_time': array([0.0046401 , 0.00653505, 0.00584579, 0.01194715, 0.01501179,
       0.00903988, 0.01195717, 0.01613617, 0.00230217, 0.00654793]), 'score_time': array([0.00213099, 0.00191808, 0.00196815, 0.00391388, 0.00368881,
       0.0068481 , 0.00177503, 0.00198483, 0.00128698, 0.00165892]), 'test_accuracy': array([0.33333333, 0.27272727, 0.45454545, 0.48484848, 0.51515152,
       0.54545455, 0.53125   , 0.4375    , 0.40625   , 0.53125   ]), 'test_precision': array([0.33333333, 0.27272727, 0.45454545, 0.48484848, 0.51515152,
       0.54545455, 0.53125   , 0.4375    , 0.40625   , 0.53125   ]), 'test_recall': array([0.33333333, 0.27272727, 0.45454545, 0.48484848, 0.51515152,
       0.54545455, 0.53125   , 0.4375    , 0.40625   , 0.53125   ])}, 'histologic_type': {'fit_time': array([0.00529385, 0.00434303, 0.00269818, 0.00167918, 0.00311804,
       0.00252175, 0.00299001, 0.00149202, 0.00274324, 0.00164008]), 'score_time': array([0.004493  , 0.00137591, 0.001494



In [128]:
mn_chart

### Mixed Naïve Bayes (with both nominal and continous data)

In [123]:
def naive_mixed(X, ys, cat_features):

    results = {}
    for target, y in ys.items():
        nbmixed = MixedNB(categorical_features=cat_features)
        nbmixed.fit(X[:int(.75 * len(X))], y[:int(.75 * len(X))])
        score = nbmixed.score(X[int(.75 * len(X)):], y[int(.75 * len(X)):])
        #results[target] = score
        results[target] = sklearn.model_selection.cross_validate(nbmixed, X, y)
        
    return results

In [None]:
mixed_results = naive_mixed(X, ys_multi, cat_features=categorical_idxs)