# Feature Engineering: Presidential Contributions

Let's clean up the presidential contributions dataset by feature engineering.

In [None]:
%matplotlib inline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import isnan, when, count, col, split, trim, countDistinct, abs 
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import IntegerType

import pyspark.sql.functions

# The port is the third ':'-separated piece of the UI URL reported by the SparkContext.
ui_port = sc.uiWebUrl.split(':')[2]
print('Spark UI running on http://YOURIPADDRESS:' + ui_port)

## Step 1: Load the data

In [None]:
# Load presidential contrib data (CSV with a header row; column types are inferred).
contribs_path = "/data/presidential_election_contribs/2016/2016-medium-10k.csv"
dataset = spark.read.csv(contribs_path, header=True, inferSchema=True)


In [None]:

# All model inputs, in the order they are selected into the cleaned dataset.
feature_columns = ['CAND_NM', 'LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'LAT', 'LNG', 'CONTBR_EMPLOYER', "CONTBR_OCCUPATION"]
# Numeric features.
numeric_columns = ['LAT', 'LNG']
# Categorical features.
categorical_columns = ['CAND_NM', 'LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'CONTBR_EMPLOYER', "CONTBR_OCCUPATION"]
# Derived from categorical_columns so each index name sits at the same position
# as its source column and the two lists can never drift apart.
categorical_index = [name + '_index' for name in categorical_columns]
# Target column; kept as a list so it can be concatenated with feature_columns.
prediction_column = ['CONTB_RECEIPT_AMT']

## Step 2: Check for contribution amounts less than zero

As a cleanup, let us check for contribution amounts less than zero, and fix if necessary.

In [None]:
# Are there any rows whose contribution receipt amount is below zero?
negative_contribs = dataset.where('CONTB_RECEIPT_AMT < 0')
negative_contribs.show()

In [None]:
# Overwrite the amount column with its absolute value so no negative amounts remain.
abs_amount = pyspark.sql.functions.abs(col('CONTB_RECEIPT_AMT'))
dataset = dataset.withColumn('CONTB_RECEIPT_AMT', abs_amount)

## Step 3: Split up name into first name and last name

In [None]:
# CONTBR_NM is split on ','; the first piece becomes LASTNAME and the second FIRSTNAME,
# each with surrounding whitespace trimmed.
name_parts = split(col('CONTBR_NM'), ',')
dataset = (
    dataset
    .withColumn('LASTNAME', trim(name_parts.getItem(0)))
    .withColumn('FIRSTNAME', trim(name_parts.getItem(1)))
)


## Step 4: Join to zip code table

The zip code is not a useful feature as it is: it looks like a number, but it is not really a numeric column. What we can do is turn it into numeric features (LAT/LNG). One way to do this is to join it to a zip code table to look up the latitude and longitude, which are meaningful numeric features.

**=> TODO: Join the main table to the zip code table to get the LAT, LNG fields instead of zip.**

In [None]:
# Build a 5-digit zip key: substr(0,5) keeps the leading five characters of CONTBR_ZIP,
# which are then cast to an integer.
# NOTE(review): the integer cast drops leading zeros (e.g. "02134" -> 2134); presumably the zip
# column in zipcodes.csv is also inferred as an integer so the keys still match — confirm.
dataset = dataset.withColumn('ZIP5DIG', dataset['CONTBR_ZIP'].substr(0,5).cast(IntegerType()))
#Load Zipcode data
zipcodes = spark.read.csv('/data/zipcodes/zipcodes.csv.gz', header=True, inferSchema=True)
# Exercise placeholder: `???` is left for the reader and is not valid Python until replaced.
# The joined result must expose the LAT and LNG columns that later cells select.
joined = ??? #TODO  JOIN Datasets together

In [None]:
# Preview the joined table (first 20 rows, long values truncated).
joined.show(n=20, truncate=True)

## Step 5: Look at some breakdowns by different variables

**=> TODO: Find breakdown/counts by candidate name.**

In [None]:
# What is the breakdown by candidate name?

# Exercise placeholder: replace `???` with an aggregation that counts rows per CAND_NM value.
# The cell is not valid Python until the placeholder is filled in.
joined.??? # Get breakdown by candidate name

**=> TODO: Find breakdown/counts by occupation.**

Hint: once you do a count(), it will create a new field called "count". You can call
sort() on this field if you want to.

Example: joined.groupBy(...).count().filter("`count` ...").sort('count', ...)

In [None]:
# What is the breakdown by occupation?
# Exercise placeholder: replace `???` with an aggregation that counts rows per
# CONTBR_OCCUPATION value. The cell is not valid Python until the placeholder is filled in.
joined.??? # TODO: Get breakdown

In [None]:
# Count the distinct values of every categorical feature (one output column per feature).
distinct_counts = [countDistinct(col(name)).alias(name) for name in categorical_columns]
joined.agg(*distinct_counts).show()


In [None]:
# Look for NAs
# Blank CSV fields are read as null rather than NaN, so nulls must be counted too.
# isnan() is only meaningful for float/double columns, so it is applied to those alone.
float_columns = {name for name, dtype in joined.dtypes if dtype in ('float', 'double')}
missing_counts = [
    count(when(col(c).isNull() | isnan(col(c)), c)).alias(c) if c in float_columns
    else count(when(col(c).isNull(), c)).alias(c)
    for c in joined.columns
]
# One row of output: for each column, the number of rows with a missing value.
joined.select(missing_counts).show()

## Step 6: Replace NAs with Unknown

We do not want NAs, so let's just replace missing values with Unknown.

**=> TODO: Replace NAs with Unknown.**



In [None]:
# Keep only the target column plus the feature columns; the `???` exercise placeholder is where
# missing values are to be replaced with 'Unknown'. Not valid Python until it is filled in.
# NOTE(review): filling with a string value presumably only affects string columns, so null
# LAT/LNG values would remain — confirm whether those rows need separate handling.
donations = joined.select(prediction_column + feature_columns).???
donations.show()

## Step 7: Write out final Results

In [None]:
# Convert the cleaned data to a pandas DataFrame and write it out as a CSV file with a header row.
donations_pdf = donations.toPandas()
donations_pdf.to_csv("election-clean.csv", header=True, index=False)