# Random Forests: Presidential Contributions

Let's look at a random forests models for the presidential dataset.

We are going to try to predict two variables:

1. Amount of contribution (regression)
2. Candidate of Contribution (classification).

In [7]:
%matplotlib inline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import isnan, when, count, col, split, trim, countDistinct, abs 
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import IntegerType

import pyspark.sql.functions



In [2]:
dataset = spark.read.csv("/data/presidential_election_contribs/2016/2016-medium-clean.csv", header=True, inferSchema=True)


In [3]:
dataset.show()

+-----------------+--------------------+------------+------------+---------+---------+-----------+--------------------+--------------------+
|CONTB_RECEIPT_AMT|             CAND_NM|    LASTNAME|   FIRSTNAME|CONTBR_ST|      LAT|        LNG|     CONTBR_EMPLOYER|   CONTBR_OCCUPATION|
+-----------------+--------------------+------------+------------+---------+---------+-----------+--------------------+--------------------+
|              5.0|Clinton, Hillary ...|      RIGNEY|     FARRELL|       CA|33.147294|-117.322181|       SELF-EMPLOYED|          CONTRACTOR|
|            100.0|    Sanders, Bernard|      ARNOLD|         IRA|       CA| 38.34642|-122.694127|                NONE|        NOT EMPLOYED|
|             24.0|Cruz, Rafael Edwa...|    VANDOREN|HELEN E. MS.|       MD|39.002745| -76.931721|             RETIRED|             RETIRED|
|            100.0|Clinton, Hillary ...|    RICHARDS|        MARC|       CA| 34.07041|-118.350411|       SELF-EMPLOYED|                 ART|
|            

In [5]:

feature_columns = ['CAND_NM', 'LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'LAT', 'LNG', 'CONTBR_EMPLOYER', "CONTBR_OCCUPATION"]
numeric_columns = ['LAT', 'LNG']
categorical_columns = ['CAND_NM', 'LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'CONTBR_EMPLOYER', "CONTBR_OCCUPATION"]
categorical_index = ['CAND_NM_index', 'FIRSTNAME_index', 'LASTNAME_index', 'CONTBR_ST_index', 'CONTBR_EMPLOYER_index', 
                     "CONTBR_OCCUPATION_index"]
prediction_column = ['CONTB_RECEIPT_AMT']


In [11]:
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index", handleInvalid="keep").fit(dataset) for column in categorical_columns ]
pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(dataset).transform(dataset)

In [12]:
assembler = VectorAssembler(inputCols=numeric_columns + categorical_index, outputCol="features")
fv = assembler.transform(df_r.na.drop())

In [14]:
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(fv)


In [15]:

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = fv.randomSplit([0.7, 0.3])


In [16]:

# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="indexedFeatures", maxBins=12000)

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])


In [17]:
# Train model.  This also runs the indexers.


trainingData = trainingData.withColumn("label",trainingData.CONTB_RECEIPT_AMT)


model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)


In [None]:

# Select example rows to display.
#predictions.select("prediction", "indexedLabel", "features").show(5)

predictions.select('CONTB_RECEIPT_AMT', 'prediction').show(100)


In [18]:

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="CONTB_RECEIPT_AMT", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only


Root Mean Squared Error (RMSE) on test data = 3625.69
RandomForestRegressionModel (uid=RandomForestRegressor_4c94ae6bc47a138b2a88) with 20 trees


In [19]:
rfModel.featureImportances

SparseVector(8, {0: 0.0031, 1: 0.0011, 2: 0.0094, 3: 0.193, 4: 0.5185, 5: 0.0155, 6: 0.2121, 7: 0.0472})

In [20]:
print(numeric_columns + categorical_columns)

['LAT', 'LNG', 'CAND_NM', 'LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'CONTBR_EMPLOYER', 'CONTBR_OCCUPATION']


We see the following Variables in Order of Importnace
1. CONTBR_ST
2. LASTNAME
3. FIRSTNAME
4. CONTBR_EMPLOYER
5. CONTBR_OCCUPATION

LAT, LONG, and CAND_NM had virtually no impact.

In [None]:

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="CONTB_RECEIPT_AMT", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % r2)



Negative R squared means our data fit worse than the null hypothesis.