# Naive Bayes Income Classifier
We are going to use data from the US Census to predict income.

In [None]:
%matplotlib inline

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Show where the Spark UI can be reached: the port is the third ':'-separated
# piece of sc.uiWebUrl; the host part must be substituted by the reader.
print('Spark UI running on http://YOURIPADDRESS:', sc.uiWebUrl.split(':')[2], sep='')

## Step 1 : Load Data

In [None]:
# Time the CSV load. The file has a header row and column types are inferred.
t1 = time.perf_counter()

dataset = (
    spark.read.format("csv")
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load("/data/census-income/income-cleaned.csv")
)

t2 = time.perf_counter()

# Note: dataset.count() is evaluated here, after t2, so it is not part of the
# reported elapsed time.
print("read {:,} records in {:,.2f} ms".format(dataset.count(), (t2-t1)*1000))

# Inspect the inferred schema and the first rows.
dataset.printSchema()
dataset.show()

## Step 2 : Basic Analytics
- 0 is income <= 50k
- 1 is income > 50k

In [None]:
# Number of records per value of the target column 'income-over-50k'.
(
    dataset
    .groupBy('income-over-50k')
    .count()
    .show()
)

## Step 3 : Create Feature Vector

### 3.1 Index all categorical columns

In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

## TODO : add numerical columns.
##   examine the data and add numerical columns
##   Hint : start with 'age', 'education-num', 'hours-per-week'
numeric_columns = ['age', '???', '???' ]

categorical_columns = ['employment', 'education', 'marital-status', 'occupation', 'race', 'sex', 'native-country']
## Derive the indexed column names from categorical_columns so the two lists
## cannot drift apart (each StringIndexer below writes to '<column>_index').
categorical_index = [column + "_index" for column in categorical_columns]
input_cols = numeric_columns + categorical_index

## One StringIndexer per categorical column.
## The indexers are deliberately NOT fitted here: Pipeline.fit() fits every stage,
## so the fitting work happens inside the timed section below. Pre-fitting them
## would leave the timer measuring almost nothing, because fit() on fitted models
## is a no-op and transform() is lazy.
## handleInvalid="keep" puts invalid/unseen labels into an extra bucket instead of failing.
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index", handleInvalid="keep")
            for column in categorical_columns]
pipeline = Pipeline(stages=indexers)

t1 = time.perf_counter()
dataset_indexed = pipeline.fit(dataset).transform(dataset)
t2 = time.perf_counter()
dataset_indexed.show()



print("indexed {:,} records in {:,.2f} ms".format(dataset.count(), (t2-t1)*1000))

## Save as CSV for easy viewing in Excel
dataset_indexed.write.\
        option('header', 'true').\
        mode('overwrite').\
        csv('out-indexed')
print("Saved indexed vector to 'out-indexed' directory")

### 3.2 - Create feature Vector

In [None]:
from pyspark.ml.feature import VectorAssembler

## TODO : set inputCols = input_cols
## VectorAssembler combines the given input columns into one vector column, "features".
assembler = VectorAssembler(inputCols=???, outputCol="features")
featureVector = assembler.transform(dataset_indexed)
## Copy the target column 'income-over-50k' into a column named "label";
## the evaluation cells further down read the "label" column.
featureVector = featureVector.withColumn("label",featureVector["income-over-50k"])
## Preview only the two columns used for training.
featureVector.select(['features', 'label']).show()

## Step 4 : Train / Test set

In [None]:
# 80% of the rows go to training and 20% to testing; the fixed seed keeps the
# split repeatable between runs.
train, test = featureVector.randomSplit(weights=[0.8, 0.2], seed=1234)

print("training set count : ", train.count())
print("testing set count : ", test.count())

## Step 5 : Create Naive Bayes Model

In [None]:
from pyspark.ml.classification import NaiveBayes

## TODO : Create a NB model with these parameters
##     smoothing = 1.0
##     modelType = 'multinomial'
nb = NaiveBayes(smoothing=???, modelType='???')

# train the model
# Only the fit() call sits between the two timestamps; train.count() in the
# print below is evaluated after t2 and is not part of the reported time.
t1 = time.perf_counter()
model = nb.fit(train)
t2 = time.perf_counter()

print("trained on {:,} records  in {:,.2f} ms".\
      format(train.count(), (t2-t1)*1000))

## Step 6: Run Test Data
Let's call `.transform` on our model to make predictions on our test data. The output will be in the "prediction" column, while the correct label will be in the "label" column.

We will be able to evaluate our results by comparing the "prediction" column against the "label" column.

In [None]:
# Score the test set with the trained model.
predictions = model.transform(test)

# Display a sample of (label, prediction) pairs, drawing a 0.5 fraction from
# each predicted class (0 and 1).
(
    predictions
    .select(['label', 'prediction'])
    .sampleBy(col='prediction', fractions={0: 0.5, 1: 0.5})
    .show()
)


# How many test rows were assigned to each predicted class.
(
    predictions
    .groupBy('prediction')
    .count()
    .show()
)


## Step 7 : Evaluate Model

### 7.1 Test Accuracy

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator that compares the "prediction" column with the "label" column.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

### 7.2 Confusion Matrix

In [None]:
# Confusion matrix: one row per actual label, one column per predicted class
# (0 and 1); label/prediction combinations with no rows are shown as 0.
(
    predictions
    .groupBy('label')
    .pivot('prediction', [0,1])
    .count()
    .na.fill(0)
    .orderBy('label')
    .show()
)

## Step 8 : Discuss Model Accuracy
Discuss how to improve model accuracy. Here are some points to consider.

- can you add any more input variables?
- why is the model bad at predicting the >50k income category? Check the original data for skew.