# Feature Engineering: Presidential Contributions

Let's clean up the presidential contributions dataset and engineer features from it.

In [None]:
%matplotlib inline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import isnan, when, count, col, split, trim, countDistinct, abs 
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import IntegerType

import pyspark.sql.functions



In [None]:
# Load the gzipped zipcode CSV. The first row is treated as the header and
# column types are inferred from the data.
zipcodes = spark.read.csv(
    '/data/zipcodes/zipcodes.csv.gz',
    inferSchema=True,
    header=True,
)

In [None]:
# Load the 2016 presidential contributions CSV (10k-row sample file). The first
# row is treated as the header and column types are inferred from the data.
dataset = spark.read.csv(
    "/data/presidential_election_contribs/2016/2016-medium-10k.csv",
    inferSchema=True,
    header=True,
)


In [None]:
# Preview the zipcode table (20 rows, long values truncated -- the defaults).
zipcodes.show(n=20, truncate=True)

In [None]:

# --- Column groups for predicting the contribution amount -------------------
# Target column (kept as a one-element list so it can be concatenated with
# the feature lists).
prediction_column = ['CONTB_RECEIPT_AMT']
feature_columns = ['CAND_NM', 'LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'LAT', 'LNG', 'CONTBR_EMPLOYER', 'CONTBR_OCCUPATION']
numeric_columns = ['LAT', 'LNG']
categorical_columns = ['CAND_NM', 'LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'CONTBR_EMPLOYER', 'CONTBR_OCCUPATION']
# Derived from categorical_columns so that every "<column>_index" name sits at
# the same position as its source column. The hand-written list had
# FIRSTNAME/LASTNAME swapped, which would mis-pair the two lists if zipped.
categorical_index = [name + '_index' for name in categorical_columns]

# --- Column groups for predicting the candidate (CAND_NM is the target) -----
prediction_column_donation = ['CAND_NM']
feature_columns_donation = ['LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'LAT', 'LNG', 'CONTBR_EMPLOYER', 'CONTBR_OCCUPATION']
categorical_columns_donation = ['LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'CONTBR_EMPLOYER', 'CONTBR_OCCUPATION']
# Same derivation as above, keeping index names aligned with their columns.
categorical_index_donation = [name + '_index' for name in categorical_columns_donation]

In [None]:
# Are there any contributions with a negative receipt amount? Show them.
dataset.where('CONTB_RECEIPT_AMT < 0').show()

In [None]:
# Derive name, zip and amount features from the raw contribution columns.
# CONTBR_NM is split on the comma: piece 0 becomes LASTNAME and piece 1
# becomes FIRSTNAME (presumably the raw format is "LAST, FIRST" -- confirm).
split_col = split(dataset['CONTBR_NM'], ',')
dataset = (
    dataset
    .withColumn('LASTNAME', trim(split_col.getItem(0)))
    .withColumn('FIRSTNAME', trim(split_col.getItem(1)))
    # Leading five characters of the zip, cast to an integer for the join key.
    # NOTE(review): zips with a leading zero may lose it if CONTBR_ZIP was
    # inferred as a number -- verify against the loaded schema.
    .withColumn('ZIP5DIG', dataset['CONTBR_ZIP'].substr(0, 5).cast(IntegerType()))
    # Replace the amount with its absolute value so negatives become positive.
    .withColumn('CONTB_RECEIPT_AMT', pyspark.sql.functions.abs(dataset['CONTB_RECEIPT_AMT']))
)

In [None]:
# Attach the zipcode columns to each contribution whose 5-digit zip matches a
# ZIP row; contributions without a match are dropped (inner join).
joined = dataset.join(
    zipcodes,
    on=dataset['ZIP5DIG'] == zipcodes['ZIP'],
    how='inner',
)

In [None]:
# Preview the joined table (20 rows, long values truncated -- the defaults).
joined.show(n=20, truncate=True)

In [None]:
# How many contributions did each candidate receive? (show up to 40 rows)
joined.groupby('CAND_NM').count().show(n=40)

In [None]:
# How many contributions per occupation? Keep only occupations with at least
# 10 contributions, most frequent first.
(
    joined
    .groupBy('CONTBR_OCCUPATION')
    .count()
    .where("`count` >= 10")
    .orderBy('count', ascending=False)
    .show()
)

In [None]:
# Cardinality check: number of distinct values in each categorical feature,
# reported as a single row with one column per feature.
joined.groupBy().agg(
    *[countDistinct(col(name)).alias(name) for name in categorical_columns]
).show()


In [None]:
# Look for missing values: one output row, one count per column.
# isnan() only flags floating-point NaN, so on its own it never reports nulls
# (empty CSV fields, or a name with no second piece after the split). Nulls are
# counted for every column; NaN is checked as well only for float/double
# columns, where it can actually occur.
joined.select([
    count(
        when(
            (col(name).isNull() | isnan(name))
            if dtype in ('float', 'double')
            else col(name).isNull(),
            name,
        )
    ).alias(name)
    for name, dtype in joined.dtypes
]).show()

In [None]:
# Keep only the target and feature columns, and replace nulls in string
# columns with the placeholder 'Unknown'.
donations = joined.select(*prediction_column, *feature_columns).fillna('Unknown')
donations.show(n=20, truncate=True)

In [None]:
# Collect the cleaned data to the driver as a pandas DataFrame and write it to
# a local CSV file with a header row and no index column.
donations.toPandas().to_csv(
    "election-clean.csv",
    index=False,
    header=True,
)