<a href="https://colab.research.google.com/github/andytorrestb/AIG/blob/master/naive_bayes_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so that files stored
# there (e.g. the dataset CSV) can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [None]:
%%time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score as acc_score
from sklearn.metrics import roc_auc_score as roc_score

from sklearn.preprocessing import OrdinalEncoder

CPU times: user 60 µs, sys: 1 µs, total: 61 µs
Wall time: 67 µs


The following block contains functions for preparing the data.

In [None]:
%%time

# Give every column a readable snake_case label.
def replace_labels(df):
  """Overwrite the column labels of ``df`` in place and return ``df``.

  The names are assigned by position from a fixed list of fifteen labels;
  the final column becomes the target column '<=50K'.
  """
  new_names = [
    'age', 'workclass', 'final_weight', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    '<=50K',
  ]
  df.columns = new_names
  return df

# Split features according to data type.
def split_column_types(df):
  """Partition the column names of ``df`` into numeric and categorical lists.

  A column is numeric when pandas reports a numeric dtype for it. This covers
  every integer and float width, not only 'int64', so float or int32 columns
  are no longer mistaken for categorical data. Every other column is treated
  as categorical.

  Args:
    df: DataFrame whose columns should be classified.

  Returns:
    (num_labels, cat_labels): two lists of column names, each in the original
    column order.
  """
  num_labels = []
  cat_labels = []

  for feature in df.columns:
    if pd.api.types.is_numeric_dtype(df[feature].dtype):
      num_labels.append(feature)
    else:
      cat_labels.append(feature)

  return num_labels, cat_labels

# Returns a list containing the names of the columns that hold
# categorical data.
def make_cat_labels(df):
  """List the non-numeric columns of ``df`` in their original order.

  Any column with a numeric dtype (integer or float of any width) is skipped.
  Previously only the exact dtype 'int64' was skipped, so a float column was
  returned as categorical and later broke the ``.str`` accessor in strip_df.

  Args:
    df: DataFrame whose columns should be inspected.

  Returns:
    List of column names whose dtype is not numeric.
  """
  return [
    feature for feature in df.columns
    if not pd.api.types.is_numeric_dtype(df[feature].dtype)
  ]

# Trim surrounding whitespace from the listed text columns.
def strip_df(df, cat_labels):
  """Strip leading/trailing whitespace in each column named in ``cat_labels``.

  The columns are overwritten on ``df`` itself, and the same ``df`` object is
  returned. Columns not listed are left untouched.
  """
  for column in cat_labels:
    trimmed = df[column].str.strip()
    df[column] = trimmed
  return df

# Helper function for remove_null_records
def check_for_null(df, cat_labels):
  """Return the names from ``cat_labels`` whose column contains '?'.

  Membership is tested against the distinct values reported by
  ``value_counts()`` for each column; the result keeps the order of
  ``cat_labels``.
  """
  return [
    column for column in cat_labels
    if '?' in df[column].value_counts().index.tolist()
  ]

# Helper function for remove_null_records
def insert_null_values(df, cat_labels):
  """Replace the '?' placeholder with NaN in the affected categorical columns.

  Only the columns that check_for_null() reports as containing '?' are
  rewritten. The columns are updated on ``df`` itself.

  Args:
    df: DataFrame to clean.
    cat_labels: Names of the categorical columns to inspect.

  Returns:
    The same ``df`` object, with '?' entries turned into NaN.
  """
  for feature in check_for_null(df, cat_labels):
    # Assign the result back to the column. Calling
    # replace(..., inplace=True) on df[feature] operates on a temporary
    # selection and is not guaranteed to reach df (it does not under pandas
    # Copy-on-Write). np.nan is used because the np.NaN alias was removed in
    # NumPy 2.0.
    df[feature] = df[feature].replace('?', np.nan)
  return df

# Helper function for process_cat_data
def remove_null_records(df, cat_labels):
  """Mark '?' entries as missing, then drop every row that has a missing value.

  The marking step is delegated to insert_null_values(); the frame produced
  by ``dropna()`` is returned.
  """
  marked = insert_null_values(df, cat_labels)
  return marked.dropna()

# Cleans categorical data: strips surrounding spaces, then removes
# records with null values.
def process_cat_data(df):
  """Run the categorical clean-up steps on ``df`` and return the result.

  The categorical column names come from make_cat_labels(); those columns are
  whitespace-stripped by strip_df() and the rows with missing values are then
  removed by remove_null_records().
  """
  labels = make_cat_labels(df)
  stripped = strip_df(df, labels)
  return remove_null_records(stripped, labels)

# Changes data type of target vector from string to integer
def convert_target_vector(df):
  """Recode the '<=50K' target column: '<=50K' becomes 1 and '>50K' becomes 0.

  The column is reassigned on ``df`` itself. Values other than the two labels
  are left as they are.

  Args:
    df: DataFrame containing a '<=50K' column of string labels.

  Returns:
    The same ``df`` object with the recoded target column.
  """
  # Assign the result back instead of calling replace(..., inplace=True) on
  # df['<=50K']: that form works on a temporary selection and may leave df
  # unchanged (it does under pandas Copy-on-Write).
  recoded = df['<=50K'].replace({'<=50K': 1, '>50K': 0})
  # infer_objects() turns an object column that now holds only integers into
  # a real integer column, independent of replace()'s own downcasting rules.
  df['<=50K'] = recoded.infer_objects()

  return df

CPU times: user 20 µs, sys: 0 ns, total: 20 µs
Wall time: 23.8 µs


The following block contains functions for producing distributions for model metrics. 

In [59]:
%%time

def progress_bar(trial, no_trials):
  """Print one step of a '#' progress bar for trial number ``trial``.

  The last trial (``no_trials - 1``) always prints '#' followed by a newline.
  Any earlier trial prints '#' without a newline when it is an exact multiple
  of ``no_trials / 20``, and prints nothing otherwise.
  """
  if trial == no_trials - 1:
    print('#')
    return

  step = no_trials / 20
  if trial % step == 0:
    print('#', end = '')

def basic_dist(df, test_pct, no_trials):
  """Collect F1 scores of a Gaussian naive Bayes model over repeated splits.

  Args:
    df: DataFrame with the feature columns plus the '<=50K' target column.
    test_pct: Share of the rows held out for testing in every trial. It is
      passed to train_test_split as ``test_size``; it used to be ignored in
      favour of a hard-coded 0.2.
    no_trials: Number of random stratified train/test splits to evaluate.

  Returns:
    List with one F1 score per trial.
  """
  # Shuffle the rows once and renumber the index.
  df = df.sample(frac = 1).reset_index(drop = True)

  # The target and the encoded features are the same for every trial, so they
  # are built once. OrdinalEncoder is applied to every feature column.
  y = df['<=50K']
  X = OrdinalEncoder().fit_transform(df.drop(columns = '<=50K'))

  # Instantiate the model
  gnb = GaussianNB()

  f1_dist = []

  for x in range(no_trials):
    # Split into training/testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_pct, stratify = y)

    # Fit the model and predict on the held-out rows
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)

    # Add f1 score to list
    f1_dist.append(f1_score(y_test, y_pred))

    # Print small progress bar.
    progress_bar(x, no_trials)

  return f1_dist

# Helper function for type_error()
def make_test_dict():
  """Build the reference 'model' dictionary that type_error() compares against.

  The values are placeholders (an empty DataFrame, a fresh GaussianNB, 0.5 and
  100) stored under the keys 'df', 'gnb', 'test_pct' and 'no_trials'.
  """
  return {
    'df': pd.DataFrame(),
    'gnb': GaussianNB(),
    'test_pct': 0.5,
    'no_trials': 100,
  }

# Simple sanity check.
def type_error(model):
  """Report whether ``model`` is unusable as a model_dist() configuration.

  ``model`` is compared with the template from make_test_dict(). It must be a
  non-empty dict, contain every template key, and hold a value of exactly the
  template's type under each of those keys.

  Args:
    model: Object to validate.

  Returns:
    1 when any check fails, None when ``model`` passes.
  """
  test = make_test_dict()

  if type(model) != type(test):
    return 1

  if len(model) == 0:
    return 1

  for feature in test.keys():
    # A missing key means the model is malformed; report it as invalid rather
    # than letting the lookup below raise KeyError.
    if feature not in model:
      return 1
    if type(test[feature]) != type(model[feature]):
      return 1

  return None
  
def model_dist(model):
  """Evaluate the estimator described by ``model`` over repeated random splits.

  Args:
    model: dict with the keys
      'df'        - DataFrame with the features plus the '<=50K' target,
      'gnb'       - the estimator to fit (fitted in place on every trial),
      'test_pct'  - share of rows held out for testing in each trial,
      'no_trials' - number of stratified train/test splits to run.

  Returns:
    dict with the keys 'f1', 'acc' and 'roc', each a list holding one score
    per trial; None (after printing a message) when type_error() rejects
    ``model``.
  """
  if type_error(model):
    print('type error: check definition of "model"')
    return

  # list to store distributions of different metrics.
  f1_dist = []
  acc_dist = []
  roc_dist = []

  # Take everything from the dict that was passed in. The estimator used to
  # come from a global named `gnb` and the split size was fixed at 0.2, so
  # model['gnb'] and model['test_pct'] were never read.
  gnb = model['gnb']
  test_pct = model['test_pct']
  no_trials = model['no_trials']

  # Shuffle the rows into a local frame; the caller's dict is not modified.
  df = model['df'].sample(frac = 1).reset_index(drop = True)

  # The target and the encoded features are the same for every trial, so they
  # are built once. OrdinalEncoder is applied to every feature column.
  y = df['<=50K']
  X = OrdinalEncoder().fit_transform(df.drop(columns = '<=50K'))

  for x in range(no_trials):
    # Split into training/testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_pct, stratify = y)

    # Fit the model and predict on the held-out rows
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)

    # Update individual metrics. Note that roc_score is given the predicted
    # labels (y_pred), not class probabilities.
    f1_dist.append(f1_score(y_test, y_pred))
    acc_dist.append(acc_score(y_test, y_pred))
    roc_dist.append(roc_score(y_test, y_pred))

    # Print small progress bar.
    progress_bar(x, no_trials)

  print('Success')
  return {'f1': f1_dist, 'acc': acc_dist, 'roc': roc_dist}
  

def metric_dist(df, test_pct, no_trials):
  """Collect F1, accuracy and ROC-AUC of a Gaussian naive Bayes model over repeated splits.

  Args:
    df: DataFrame with the feature columns plus the '<=50K' target column.
    test_pct: Share of the rows held out for testing in every trial. It is
      passed to train_test_split as ``test_size``; it used to be ignored in
      favour of a hard-coded 0.2.
    no_trials: Number of random stratified train/test splits to evaluate.

  Returns:
    dict with the keys 'f1', 'acc' and 'roc', each a list holding one score
    per trial.
  """
  # list to store distributions of different metrics.
  f1_dist = []
  acc_dist = []
  roc_dist = []

  # Shuffle the rows once and renumber the index.
  df = df.sample(frac = 1).reset_index(drop = True)

  # The target and the encoded features are the same for every trial, so they
  # are built once. OrdinalEncoder is applied to every feature column.
  y = df['<=50K']
  X = OrdinalEncoder().fit_transform(df.drop(columns = '<=50K'))

  # Instantiate the model
  gnb = GaussianNB()

  for x in range(no_trials):
    # Split into training/testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_pct, stratify = y)

    # Fit the model and predict on the held-out rows
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)

    # Update individual metrics. Note that roc_score is given the predicted
    # labels (y_pred), not class probabilities.
    f1_dist.append(f1_score(y_test, y_pred))
    acc_dist.append(acc_score(y_test, y_pred))
    roc_dist.append(roc_score(y_test, y_pred))

    # Print small progress bar.
    progress_bar(x, no_trials)

  metrics = {'f1': f1_dist, 'acc': acc_dist, 'roc': roc_dist}
  return metrics

# TODO: write this, packaging the set of parameters into a dictionary.
# It can probably replace basic_dist().
# def tuned_dist(df, test_pct, no_trials, gnb):


CPU times: user 14 µs, sys: 0 ns, total: 14 µs
Wall time: 18.6 µs


In [60]:
%%time

# Load the raw CSV from the mounted Drive and give its columns readable names.
path = '/content/drive/My Drive/adult.csv'
train = replace_labels(pd.read_csv(path))

# Clean the text columns (strip whitespace, drop rows with missing values),
# then recode the target column: '<=50K' -> 1, '>50K' -> 0.
train = process_cat_data(train)
train = convert_target_vector(train)

# Earlier experiments with the other distribution helpers, left disabled:
# sns.histplot(x = basic_dist(train, 0.1, 10))
# metrics = metric_dist(train, 0.1, 10)

# Settings that get bundled into the `model` dictionary below.
gnb = GaussianNB()
test_pct = 0.5
no_trials = 100

# Keys and value types must match the template built by make_test_dict(),
# otherwise model_dist() reports a type error and returns nothing.
model = {'df': train, 'gnb': gnb, 'test_pct': test_pct, 'no_trials': no_trials}

# Run the repeated-split evaluation; on success a dict of metric lists comes back.
metrics = model_dist(model)

# Confirm what kind of object was returned.
print(type(metrics))

#####################
Success
<class 'dict'>
CPU times: user 21.4 s, sys: 126 ms, total: 21.5 s
Wall time: 21.6 s


Basic model at different test/train splits. 

In [None]:
# Echo the type of `metrics`, the value returned by model_dist() in the cell above.
type(metrics)

dict

Run hyperparameter search

In [None]:
# Scratch check: an empty {} literal is a dict.
t = {}
type(t)

dict