# Random Forests: Presidential Contributions

Let's look at random forest models for the presidential contributions dataset.

We are going to try to predict two variables:

1. Amount of contribution (regression)
2. Candidate of Contribution (classification).

In [None]:
# Create the Spark session through the project's init_spark helper.
import os
import sys

# Make the parent of the current working directory importable before
# importing init_spark (presumably the helper lives there -- TODO confirm).
top_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
already_importable = top_dir in sys.path
if not already_importable:
    sys.path.append(top_dir)

from init_spark import init_spark

spark = init_spark()
# Leave the session as the last expression so the notebook displays it.
spark

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import isnan, when, count, col, split, trim, countDistinct, abs 
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import IntegerType

import pyspark.sql.functions

In [None]:
# Load the 2016 contributions CSV: the first row is treated as the header
# and Spark infers the column types from the data.
dataset = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/data/presidential_election_contribs/2016/2016-medium-clean.csv")
)


In [None]:
# Preview the loaded data; these are show()'s defaults written out explicitly
# (first 20 rows, long cell values truncated).
dataset.show(n=20, truncate=True)

In [None]:

# Column roles used by the regression model below.
# feature_columns: every raw input column considered as a predictor.
feature_columns = ['CAND_NM', 'LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'LAT', 'LNG', 'CONTBR_EMPLOYER', "CONTBR_OCCUPATION"]
# numeric_columns are used as-is; categorical_columns are string-indexed first.
numeric_columns = ['LAT', 'LNG']
categorical_columns = ['CAND_NM', 'LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'CONTBR_EMPLOYER', "CONTBR_OCCUPATION"]
# Derive the indexed names from categorical_columns instead of retyping them,
# so both lists always share the same order. The previous hand-written list had
# FIRSTNAME and LASTNAME swapped, which made the feature-vector order disagree
# with numeric_columns + categorical_columns when reading feature importances.
categorical_index = [column + "_index" for column in categorical_columns]
# Target column for the regression (kept as a one-element list).
prediction_column = ['CONTB_RECEIPT_AMT']


In [None]:
# Fit one StringIndexer per categorical column; each one writes its result to
# "<column>_index". handleInvalid="keep" is passed so invalid values are kept
# rather than raising (see the StringIndexer docs for the exact index used).
indexers = []
for column in categorical_columns:
    indexer = StringIndexer(
        inputCol=column,
        outputCol=column + "_index",
        handleInvalid="keep",
    )
    indexers.append(indexer.fit(dataset))

# Run all the fitted indexers over the dataset in one pipeline.
pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(dataset).transform(dataset)

In [None]:
# Pack the numeric columns followed by the indexed categorical columns into a
# single "features" vector column.
assembler = VectorAssembler(
    inputCols=numeric_columns + categorical_index,
    outputCol="features",
)
# Rows with a null in any column are dropped before assembling.
rows_without_nulls = df_r.na.drop()
fv = assembler.transform(rows_without_nulls)

In [None]:
# Automatically identify categorical features, and index them.
# maxCategories was never actually passed here, so Spark's default of 20 has
# always applied; it is now written out explicitly. Features with more than 20
# distinct values are treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=20,
).fit(fv)


In [None]:

# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split, and therefore the metrics reported below,
# from changing on every run of the notebook.
(trainingData, testData) = fv.randomSplit([0.7, 0.3], seed=42)


In [None]:

# Configure the RandomForest regressor.
# labelCol names the target explicitly, so training no longer silently depends
# on a column called "label" being added elsewhere. seed fixes the forest's
# randomness so results are repeatable.
# NOTE(review): maxBins=12000 is far above Spark's default of 32 -- confirm it
# is still needed given the VectorIndexer's category limit.
rf = RandomForestRegressor(
    featuresCol="indexedFeatures",
    labelCol=prediction_column[0],
    maxBins=12000,
    seed=42,
)

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])


In [None]:
# Fit the pipeline on the training split, then score the held-out split.

# Expose the contribution amount under the column name "label" for training.
trainingData = trainingData.withColumn("label", trainingData["CONTB_RECEIPT_AMT"])

# Fitting runs every pipeline stage (the feature indexer, then the forest).
model = pipeline.fit(trainingData)

# Apply the fitted pipeline to the test rows; adds a "prediction" column.
predictions = model.transform(testData)


In [None]:

# Display actual vs. predicted contribution amounts for up to 100 test rows.
actual_vs_predicted = predictions.select('CONTB_RECEIPT_AMT', 'prediction')
actual_vs_predicted.show(100)


In [None]:

# Score the test-set predictions against the true contribution amounts using
# root mean squared error.
evaluator = RegressionEvaluator(
    labelCol="CONTB_RECEIPT_AMT",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# The fitted forest is the second pipeline stage (index 1), after the
# feature indexer.
rfModel = model.stages[1]
print(rfModel)  # summary only


In [None]:
# Importance scores from the fitted forest, one entry per position in the
# feature vector; left as the last expression so the notebook displays it.
feature_importances = rfModel.featureImportances
feature_importances

In [None]:
# Print the feature names in the exact order the VectorAssembler packed them,
# so each position lines up with the matching featureImportances index.
# (Printing numeric_columns + categorical_columns is not guaranteed to match
# the order that was actually assembled.)
print(assembler.getInputCols())

We see the following variables in order of importance:
1. CONTBR_ST
2. LASTNAME
3. FIRSTNAME
4. CONTBR_EMPLOYER
5. CONTBR_OCCUPATION

LAT, LNG, and CAND_NM had virtually no impact.

In [None]:

# Select (prediction, true label) and compute R squared on the test data.
evaluator = RegressionEvaluator(
    labelCol="CONTB_RECEIPT_AMT", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
# The metric here is r2, so label it as such (the message previously said RMSE).
print("R squared (R2) on test data = %g" % r2)



A negative R squared means our model fits the test data worse than a baseline that always predicts the mean contribution amount.