# Random Forests: Presidential Contributions

Let's look at a random forests models for the presidential dataset.

We are going to try to predict two variables:

1. Amount of contribution (regression)
2. Candidate of Contribution (classification).

In [28]:
%matplotlib inline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import isnan, when, count, col, split, trim, countDistinct, abs 
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import IntegerType

import pyspark.sql.functions



In [11]:
dataset = spark.read.csv("/data/presidential_election_contribs/2016/2016-medium-10k.csv", header=True, inferSchema=True)


In [12]:
zipcodes = spark.read.csv('/data/zipcodes/zipcodes.csv.gz', header=True, inferSchema=True)

In [7]:
zipcodes.show()

+---+---------+----------+
|ZIP|      LAT|       LNG|
+---+---------+----------+
|601|18.180555|-66.749961|
|602|18.361945|-67.175597|
|603|18.455183|-67.119887|
|606|18.158345|-66.932911|
|610|18.295366|-67.125135|
|612|18.402253|-66.711397|
|616|18.420412|-66.671979|
|617|18.445147|-66.559696|
|622|17.991245|-67.153993|
|623|18.083361|-67.153897|
|624|18.064919|-66.716683|
|627|  18.4126|-66.863926|
|631|18.190607|-66.832041|
|637|18.076713|-66.947389|
|638|18.295913|-66.515588|
|641|18.263085|-66.712985|
|646| 18.43315|-66.285875|
|647|17.963613|-66.947127|
|650|18.349416|-66.578079|
|652|18.448452|-66.594127|
+---+---------+----------+
only showing top 20 rows



In [18]:

feature_columns = ['CAND_NM', 'LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'LAT', 'LNG', 'CONTBR_EMPLOYER', "CONTBR_OCCUPATION"]
numeric_columns = ['LAT', 'LNG']
categorical_columns = ['CAND_NM', 'LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'CONTBR_EMPLOYER', "CONTBR_OCCUPATION"]
categorical_index = ['CAND_NM_index', 'FIRSTNAME_index', 'LASTNAME_index', 'CONTBR_ST_index', 'CONTBR_EMPLOYER_index', 
                     "CONTBR_OCCUPATION_index"]
prediction_column = ['CONTB_RECEIPT_AMT']
prediction_column_donation = ['CAND_NM']
feature_columns_donation = ['LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'LAT', 'LNG', 'CONTBR_EMPLOYER', "CONTBR_OCCUPATION"]
categorical_columns_donation = ['LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'CONTBR_EMPLOYER', "CONTBR_OCCUPATION"]
categorical_index_donation = ['FIRSTNAME_index', 'LASTNAME_index', 'CONTBR_ST_index', 'CONTBR_EMPLOYER_index', 
                     "CONTBR_OCCUPATION_index"]

In [13]:
split_col = split(dataset['CONTBR_NM'], ',')
dataset = dataset.withColumn('LASTNAME', trim(split_col.getItem(0)))
dataset = dataset.withColumn('FIRSTNAME', trim(split_col.getItem(1)))
dataset = dataset.withColumn('ZIP5DIG', dataset['CONTBR_ZIP'].substr(0,5).cast(IntegerType()))
dataset = dataset.withColumn('CONTB_RECEIPT_AMT', pyspark.sql.functions.abs(dataset['CONTB_RECEIPT_AMT']))

In [14]:
joined = dataset.join(zipcodes, dataset.ZIP5DIG == zipcodes.ZIP)

In [15]:
joined.show()

+---------+---------+--------------------+--------------------+----------------+---------+----------+--------------------+--------------------+-----------------+----------------+------------+-------+--------------------+-------+--------+-------------+-----------+------------+------------+-------+-----+---------+-----------+
|  CMTE_ID|  CAND_ID|             CAND_NM|           CONTBR_NM|     CONTBR_CITY|CONTBR_ST|CONTBR_ZIP|     CONTBR_EMPLOYER|   CONTBR_OCCUPATION|CONTB_RECEIPT_AMT|CONTB_RECEIPT_DT|RECEIPT_DESC|MEMO_CD|           MEMO_TEXT|FORM_TP|FILE_NUM|      TRAN_ID|ELECTION_TP|    LASTNAME|   FIRSTNAME|ZIP5DIG|  ZIP|      LAT|        LNG|
+---------+---------+--------------------+--------------------+----------------+---------+----------+--------------------+--------------------+-----------------+----------------+------------+-------+--------------------+-------+--------+-------------+-----------+------------+------------+-------+-----+---------+-----------+
|C00575795|P00003392|C

In [None]:
joined.groupBy('CAND_NM').count().show(40)

In [None]:
joined.groupBy('CONTBR_OCCUPATION').count().filter("`count` >= 10").sort('count', ascending=False).show()

In [None]:
joined.agg(*(countDistinct(col(c)).alias(c) for c in categorical_columns)).show()


In [None]:
donations = joined.select(prediction_column + feature_columns).na.fill('Unknown')
donations.show()

In [None]:
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index", handleInvalid="keep").fit(donations) for column in categorical_columns ]


pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(donations).transform(donations)

df_r.show()

In [None]:
df_r.select([count(when(isnan(c), c)).alias(c) for c in df_r.columns]).show()

In [None]:
df_r.filter('CONTB_RECEIPT_AMT < 0').show()

In [None]:
assembler = VectorAssembler(inputCols=numeric_columns + categorical_index, outputCol="features")
fv = assembler.transform(df_r.na.drop())

In [None]:
fv.show()

In [None]:
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(fv)


In [None]:

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = fv.randomSplit([0.7, 0.3])


In [None]:

# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="indexedFeatures", maxBins=12000)

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])


In [None]:
# Train model.  This also runs the indexers.


trainingData = trainingData.withColumn("label",trainingData.CONTB_RECEIPT_AMT)


model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)


In [None]:

# Select example rows to display.
#predictions.select("prediction", "indexedLabel", "features").show(5)

predictions.select('CONTB_RECEIPT_AMT', 'prediction').show(100)


In [None]:

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="CONTB_RECEIPT_AMT", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only


In [None]:
rfModel.featureImportances

In [None]:
print(numeric_columns + categorical_columns)

We see the following Variables in Order of Importnace
1. CONTBR_ST
2. LASTNAME
3. FIRSTNAME
4. CONTBR_EMPLOYER
5. CONTBR_OCCUPATION

LAT, LONG, and CAND_NM had virtually no impact.

In [None]:

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="CONTB_RECEIPT_AMT", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % r2)



Negative R squared means our data fit worse than the null hypothesis.

## Classification

Let's try classifying the candidate donated to


In [43]:
joined.groupBy('CAND_NM').count().show(40)

+--------------------+-----+
|             CAND_NM|count|
+--------------------+-----+
|        Rubio, Marco|  128|
|      Fiorina, Carly|   35|
|Christie, Christo...|    7|
|         Stein, Jill|   11|
|    Sanders, Bernard| 2506|
|      McMullin, Evan|    6|
|      Huckabee, Mike|    8|
|       Walker, Scott|    5|
|  Graham, Lindsey O.|    5|
|     Kasich, John R.|   42|
|O'Malley, Martin ...|    9|
|Santorum, Richard J.|    3|
| Carson, Benjamin S.|  343|
|Webb, James Henry...|    1|
|    Lessig, Lawrence|    3|
|          Paul, Rand|   42|
|   Pataki, George E.|    1|
|       Johnson, Gary|   11|
|Clinton, Hillary ...| 4355|
|Perry, James R. (...|    2|
|    Trump, Donald J.|  935|
|Cruz, Rafael Edwa...|  722|
|           Bush, Jeb|   41|
+--------------------+-----+



In [38]:
donations2 = joined.select(prediction_column + feature_columns).na.fill('Unknown')
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index", handleInvalid="keep").fit(donations2) for column in categorical_columns ]


pipeline = Pipeline(stages=indexers)
df_r2 = pipeline.fit(donations2).transform(donations2)
assembler2 = VectorAssembler(inputCols=numeric_columns + categorical_index_donation, outputCol="features")
fv2 = assembler2.transform(df_r2.na.drop())

fv2 = fv2.withColumn("label",fv2.CAND_NM)


# Split the data into training and test sets (30% held out for testing)
(trainingData2, testData2) = fv2.randomSplit([0.7, 0.3])


# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(fv2)


# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer2 =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(fv2)


# Train a RandomForest model.
rf2 = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=100, maxBins=10000)

# Chain indexer and forest in a Pipeline
pipeline2 = Pipeline(stages=[labelIndexer, featureIndexer2, rf2])


In [39]:
# Train model.  This also runs the indexers.


model2 = pipeline2.fit(trainingData2)

# Make predictions.
predictions2 = model2.transform(testData2)


In [41]:
predictions2.show()

+-----------------+--------------------+------------+----------+---------+---------+-----------+--------------------+--------------------+-------------+--------------+---------------+---------------+---------------------+-----------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+----------+
|CONTB_RECEIPT_AMT|             CAND_NM|    LASTNAME| FIRSTNAME|CONTBR_ST|      LAT|        LNG|     CONTBR_EMPLOYER|   CONTBR_OCCUPATION|CAND_NM_index|LASTNAME_index|FIRSTNAME_index|CONTBR_ST_index|CONTBR_EMPLOYER_index|CONTBR_OCCUPATION_index|            features|               label|indexedLabel|     indexedFeatures|       rawPrediction|         probability|prediction|
+-----------------+--------------------+------------+----------+---------+---------+-----------+--------------------+--------------------+-------------+--------------+---------------+---------------+---------------------+-----------------------+-----

In [42]:

# Select example rows to display.
predictions2.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator2 = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator2.evaluate(predictions2)
print("Test Error = %g" % (1.0 - accuracy))

rfModel2 = model2.stages[2]
print(rfModel2)  # summary only

+----------+--------------------+--------------------+
|prediction|               label|            features|
+----------+--------------------+--------------------+
|       1.0|Clinton, Hillary ...|[29.75415,-95.409...|
|       0.0|Clinton, Hillary ...|[47.63714,-122.32...|
|       0.0|Clinton, Hillary ...|[31.051688,-97.49...|
|       0.0|Clinton, Hillary ...|[25.753332,-80.27...|
|       0.0|Clinton, Hillary ...|[37.415727,-122.1...|
+----------+--------------------+--------------------+
only showing top 5 rows

Test Error = 0.490339
RandomForestClassificationModel (uid=RandomForestClassifier_4b768ea09ecb9f0481be) with 100 trees


In [46]:
predictions2.groupBy('label').pivot('prediction', range(0,22)).count().na.fill(0).orderBy('label').show()

+--------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|               label|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19| 20| 21|
+--------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|           Bush, Jeb|  4|  0|  8|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
| Carson, Benjamin S.| 11|  7| 66| 14|  2|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
|Christie, Christo...|  1|  0|  3|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
|Clinton, Hillary ...|791| 95|367| 43|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
|Cruz, Rafael Edwa...| 26| 27|115| 56|  4|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
|      Fiorina, Carly|  1|  1|  9|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
|

In [52]:
rfModel2.featureImportances

SparseVector(7, {0: 0.0155, 1: 0.016, 2: 0.06, 3: 0.0084, 4: 0.0341, 5: 0.5551, 6: 0.3109})

In [54]:
print(numeric_columns + categorical_columns_donation)

['LAT', 'LNG', 'LASTNAME', 'FIRSTNAME', 'CONTBR_ST', 'CONTBR_EMPLOYER', 'CONTBR_OCCUPATION']


## Most important Fields
1. Employer
2. Occupation
3. LastName
4. State

Other fields not significant