# Random Forests: Presidential Contributions

Let's look at a random forest model for the presidential contributions dataset.

We are going to try to predict two variables:

1. Amount of contribution (regression)
2. Candidate receiving the contribution (classification)

In [None]:
%matplotlib inline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer, IndexToString
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import isnan, when, count, col, split, trim, countDistinct, abs 
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import IntegerType

import pyspark.sql.functions

In [None]:
# Load the cleaned 2016 contributions CSV. The first row is used as the header
# and Spark infers each column's type from the data.
dataset = spark.read.csv(
    "/data/presidential_election_contribs/2016/2016-medium-clean.csv",
    inferSchema=True,
    header=True,
)

In [None]:
# Column holding the classification target (the candidate's name).
prediction_column = ['CAND_NM']

# Inputs that are already numeric and go into the feature vector as-is.
numeric_columns = ['LAT', 'LNG']

# String-valued inputs; each one is run through a StringIndexer before use.
categorical_columns = [
    'LASTNAME',
    'FIRSTNAME',
    'CONTBR_ST',
    'CONTBR_EMPLOYER',
    "CONTBR_OCCUPATION",
]

# Every raw input column, used when displaying predictions next to their inputs.
feature_columns = [
    'LASTNAME',
    'FIRSTNAME',
    'CONTBR_ST',
    'LAT',
    'LNG',
    'CONTBR_EMPLOYER',
    "CONTBR_OCCUPATION",
]

# Names of the StringIndexer output columns ("<column>_index").
# NOTE: FIRSTNAME precedes LASTNAME here, unlike in categorical_columns; this
# list fixes the order of the categorical entries in the feature vector.
categorical_index = [
    'FIRSTNAME_index',
    'LASTNAME_index',
    'CONTBR_ST_index',
    'CONTBR_EMPLOYER_index',
    "CONTBR_OCCUPATION_index",
]

## Classification

Let's try classifying which candidate each contribution was donated to.


In [None]:
# Number of contributions per candidate; print up to 40 rows.
candidate_counts = dataset.groupBy('CAND_NM').count()
candidate_counts.show(40)

In [None]:
# Fit one StringIndexer per categorical column, writing to "<column>_index".
# handleInvalid="keep" places invalid values in an extra bucket instead of failing.
indexers = [
    StringIndexer(
        inputCol=name,
        outputCol=name + "_index",
        handleInvalid="keep",
    ).fit(dataset)
    for name in categorical_columns
]

# Chain the fitted indexers and append all the index columns to the dataset.
pipeline = Pipeline(stages=indexers)
df_r2 = pipeline.fit(dataset).transform(dataset)

In [None]:
# Combine the numeric columns and the string-indexed categorical columns into a
# single "features" vector. `categorical_index` is the list defined in the
# configuration cell (the previously referenced `categorical_index_donation`
# is not defined anywhere in this notebook and raised a NameError).
assembler2 = VectorAssembler(inputCols=numeric_columns + categorical_index, outputCol="features")

# Rows with a null in any column are dropped before assembling.
fv2 = assembler2.transform(df_r2.na.drop())

# The candidate name is the classification target.
fv2 = fv2.withColumn("label", fv2.CAND_NM)


# Split the data into training and test sets (30% held out for testing).
# A fixed seed makes the split repeatable across kernel restarts.
(trainingData2, testData2) = fv2.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(fv2)


# Automatically identify categorical features and index them.
# maxCategories is NOT set here, so the VectorIndexer default (20) applies:
# features with > 20 distinct values are treated as continuous.
featureIndexer2 =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(fv2)


# Train a RandomForest model with 20 trees. maxBins=10000 raises the number of
# bins available when discretizing features; the fixed seed makes training
# repeatable from run to run.
rf2 = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=20, maxBins=10000,
                             seed=42)

# Chain indexer and forest in a Pipeline
pipeline2 = Pipeline(stages=[labelIndexer, featureIndexer2, rf2])

In [None]:
# Train model on the training split. The two indexer stages were already
# fitted on the full dataset, so fitting the pipeline only applies them and
# trains the random forest.


model2 = pipeline2.fit(trainingData2)

# Make predictions on the held-out test split.
predictions2 = model2.transform(testData2)

In [None]:
# Show the raw input columns next to the class probabilities and predicted label index.
predictions2.select(*feature_columns, 'probability', 'prediction').show()

In [None]:
# Peek at a few scored rows.
predictions2.select("prediction", "label", "features").show(5)

# Accuracy on the test split: compare the indexed true label with the
# predicted index, then report the complement as the test error.
evaluator2 = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
accuracy = evaluator2.evaluate(predictions2)
print("Test Error = %g" % (1.0 - accuracy))

# The random forest is the third stage of the fitted pipeline
# (after the label indexer and the feature indexer).
rfModel2 = model2.stages[2]
print(rfModel2)  # summary only

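### Decoding the Label

The `prediction` column holds the label index assigned by the `StringIndexer`, which by default numbers labels from most to least frequent:
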
0. Hillary Clinton
1. Bernie Sanders
2. Donald Trump
3. Ted Cruz

### Confusion Matrix


In [None]:
# Confusion matrix: one row per true candidate, one column per predicted label index.
# The number of classes comes from the fitted label indexer instead of a
# hard-coded 22, so the matrix stays complete if the set of candidates changes.
num_classes = len(labelIndexer.labels)
(
    predictions2
    .groupBy('indexedLabel', 'label')
    .pivot('prediction', list(range(num_classes)))
    .count()
    .na.fill(0)
    # Order rows by label index so they line up with the prediction columns
    # and correct predictions fall on the diagonal.
    .orderBy('indexedLabel')
    # show() prints only 20 rows by default; request one row per class and
    # do not truncate long candidate names.
    .show(num_classes, truncate=False)
)

In [None]:
# Per-feature importance vector of the trained forest.
# NOTE(review): entries presumably follow the input order of the assembled
# "features" column (numeric columns first, then the indexed columns) — confirm.
rfModel2.featureImportances

In [None]:
# Feature names in the order they were given to the VectorAssembler, so the
# i-th name lines up with the i-th entry of the feature-importance vector.
# `categorical_index` replaces `categorical_columns_donation`, which is not
# defined in this notebook and raised a NameError.
print(numeric_columns + categorical_index)

## Most Important Fields
1. Employer
2. Occupation
3. LastName
4. State

The other fields are not significant.