## Data Transformation from JSON to Dataframe

### All Imports

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, explode, col, arrays_zip
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
import pyspark.sql.functions as F
from pyspark.sql.functions import sum,avg,max

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("data_cleaning") \
    .getOrCreate()

json_data = spark.read.option("multiline","true").json("gs://akshayparatelivesinjerseycity/test_data/")
exploded_results = json_data.select(explode(F.col("results")).alias("exploded_results"))
all_keys = []
keys = exploded_results.select(F.col("exploded_results.*")).columns
keys = ["exploded_results."+str(i) for i in keys]
all_keys.extend(keys)
patient_keys = exploded_results.select(F.col("exploded_results.patient.*")).columns
patient_keys = ["exploded_results.patient."+str(i) for i in patient_keys]
all_keys.extend(patient_keys)
updated_data = exploded_results.select(all_keys)
updated_data = updated_data.drop(F.col("patient"))
all_keys = updated_data.columns
updated_data = updated_data.select(all_keys)\
            .withColumn("explode_drug",F.explode(F.col("drug")))
drug_keys = updated_data.select(F.col("explode_drug.*")).columns
drug_keys = ["explode_drug."+i for i in drug_keys]
all_keys.extend(drug_keys)
updated_data = updated_data.select(all_keys)
updated_data = updated_data.drop(*['authoritynumb','duplicate','reportduplicate','patientagegroup','patientweight','summary'])
updated_data = updated_data.where(F.col("drugindication") != 'NULL')
updated_data = updated_data.where(F.col("drugindication") != "Product used for unknown indication")
all_keys = updated_data.columns
updated_data = updated_data.select(all_keys)\
            .withColumn("explode_reaction",F.explode(F.col("reaction")))
all_keys = updated_data.columns
reaction_keys = updated_data.select(F.col("explode_reaction.*")).columns
reaction_keys = ["explode_reaction."+i for i in reaction_keys]
all_keys.extend(reaction_keys)
updated_data = updated_data.select(all_keys)
updated_data = updated_data.where(F.col("reactionmeddrapt") != 'NULL')
updated_data = updated_data.select(["seriousnessdeath","seriousnesslifethreatening","seriousnesshospitalization","seriousnessdisabling","seriousnesscongenitalanomali","seriousnessother","patientonsetage","reactionmeddrapt","reactionoutcome","drugindication","activesubstance.activesubstancename","medicinalproduct"])
updated_data = updated_data.dropna()
data = updated_data.where(F.col("reactionmeddrapt") != 'Off label use')\
                            .where(F.col("drugindication") != 'Off label use')
for c in ["activesubstancename","medicinalproduct","drugindication","reactionmeddrapt"]:
    grouped_data = data.groupby(F.col(c)).count()
    grouped_data = grouped_data.groupBy(c).agg(F.avg("count").alias("avg_count"))
    overall_avg_df = grouped_data.agg(F.avg("avg_count").alias("overall_avg"))
    filtered_data = grouped_data.crossJoin(overall_avg_df).filter(F.col("avg_count") > F.col("overall_avg"))
    data = data.join(filtered_data.select(c),on=c, how="inner")
data.write.csv("gs://akshayparatelivesinjerseycity/test.csv", header=True, mode="overwrite")

## Data Encoding

In [35]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.functions import vector_to_array

In [36]:
# spark2 = SparkSession.builder \
#     .master("local") \
#     .config("spark.driver.memory", "12g") \
#     .config("spark.executor.memory", "12g") \
#     .appName("data_encoding") \
#     .getOrCreate()

In [37]:
# data = spark.read.csv("./transformed_data.csv",header=True)
# data = spark2.sql("SELECT * FROM transformed_data")

In [38]:
data.printSchema()

root
 |-- reactionmeddrapt: string (nullable = true)
 |-- drugindication: string (nullable = true)
 |-- medicinalproduct: string (nullable = true)
 |-- activesubstancename: string (nullable = true)
 |-- seriousnessdeath: string (nullable = true)
 |-- seriousnesslifethreatening: string (nullable = true)
 |-- seriousnesshospitalization: string (nullable = true)
 |-- seriousnessdisabling: string (nullable = true)
 |-- seriousnesscongenitalanomali: string (nullable = true)
 |-- seriousnessother: string (nullable = true)
 |-- patientonsetage: string (nullable = true)
 |-- reactionoutcome: string (nullable = true)
 |-- route: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- brand_name: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- generic_name: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [39]:
indexer_list = []
for c in ["activesubstancename","medicinalproduct","drugindication"]:
    indexer = StringIndexer(inputCol=c, outputCol="op_{}".format(c))
    indexer_list.append(indexer)

In [40]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=indexer_list)
pipeline_model = pipeline.fit(data)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "C:\Users\Akshay\anaconda3\envs\tf\lib\site-packages\pyspark\errors\exceptions\captured.py", line 179, in deco
    return f(*a, **kw)
  File "C:\Users\Akshay\anaconda3\envs\tf\lib\site-packages\py4j\protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: <exception str() failed>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Akshay\anaconda3\envs\tf\lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "C:\Users\Akshay\anaconda3\envs\tf\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  

Py4JError: py4j does not exist in the JVM

In [None]:
data = pipeline_model.transform(data)

## Random Forest Predicting Reaction

In [45]:
# spark = SparkSession.builder \
#     .master("local") \
#     .config("spark.driver.memory", "12g") \
#     .config("spark.executor.memory", "12g") \
#     .appName("random_forest") \
#     .getOrCreate()

In [46]:
# data = spark.read.csv("./encoded.csv",header=True)

In [37]:
data.printSchema()

root
 |-- reactionmeddrapt: string (nullable = true)
 |-- drugindication: string (nullable = true)
 |-- medicinalproduct: string (nullable = true)
 |-- activesubstancename: string (nullable = true)
 |-- seriousnessdeath: string (nullable = true)
 |-- seriousnesslifethreatening: string (nullable = true)
 |-- seriousnesshospitalization: string (nullable = true)
 |-- seriousnessdisabling: string (nullable = true)
 |-- seriousnesscongenitalanomali: string (nullable = true)
 |-- seriousnessother: string (nullable = true)
 |-- patientonsetage: string (nullable = true)
 |-- reactionoutcome: string (nullable = true)
 |-- route: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- brand_name: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- generic_name: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- op_activesubstancename: double (nullable = false)
 |-- op_medicinalproduct: double (nullable = false)
 |-- op_drugi

In [45]:
from pyspark.sql.types import IntegerType
data = data.withColumn("op_medicinalproduct",F.col("op_medicinalproduct").cast(IntegerType()))\
    .withColumn("op_activesubstancename",F.col("op_activesubstancename").cast(IntegerType()))\
    .withColumn("op_drugindication",F.col("op_drugindication").cast(IntegerType()))\
    .withColumn("seriousnessdeath",F.col("seriousnessdeath").cast(IntegerType()))\
    .withColumn("seriousnesslifethreatening",F.col("seriousnesslifethreatening").cast(IntegerType()))\
    .withColumn("seriousnesshospitalization",F.col("seriousnesshospitalization").cast(IntegerType()))\
    .withColumn("seriousnessdisabling",F.col("seriousnessdisabling").cast(IntegerType()))\
    .withColumn("seriousnesscongenitalanomali",F.col("seriousnesscongenitalanomali").cast(IntegerType()))\
    .withColumn("seriousnessother",F.col("seriousnessother").cast(IntegerType()))\
    .withColumn("patientonsetage",F.col("patientonsetage").cast(IntegerType()))

In [48]:
# data = data.drop(*["route","brand_name","generic_name","reactionoutcome","drugindication","medicinalproduct","activesubstancename"])

In [49]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
assembler = VectorAssembler(
    inputCols=["op_activesubstancename", "op_medicinalproduct", "op_drugindication","seriousnessdeath","seriousnesslifethreatening","seriousnesshospitalization","seriousnessdisabling","seriousnesscongenitalanomali","seriousnessother","patientonsetage"],
    outputCol="features"
)

data = assembler.transform(data)

In [50]:
# data.select(["op_activesubstancename", "op_medicinalproduct", "op_drugindication","features"]).show()

In [51]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
# indexer_list.append(labelIndexer)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.



In [52]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [53]:
# Train a RandomForest model.


In [54]:
# Convert indexed labels back to original labels.


In [55]:
labelIndexer = StringIndexer(inputCol="reactionmeddrapt", outputCol="indexedLabel").fit(data)

featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)

labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])


In [56]:
# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

In [57]:
# Select example rows to display.
predictions.select("predictedLabel", "reactionmeddrapt", "features").show(5)

+----------------+--------------------+--------------------+
|  predictedLabel|    reactionmeddrapt|            features|
+----------------+--------------------+--------------------+
|       Diarrhoea|Abdominal discomfort|[22.0,21.0,61.0,2...|
|       Diarrhoea|Abdominal discomfort|[15.0,14.0,23.0,2...|
|Drug ineffective|Abdominal discomfort|[28.0,28.0,3.0,2....|
|Drug ineffective|Abdominal discomfort|[28.0,28.0,3.0,2....|
|Drug ineffective|Abdominal discomfort|[29.0,29.0,2.0,2....|
+----------------+--------------------+--------------------+
only showing top 5 rows



In [58]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.893301


In [59]:
rfModel = model.stages[2]
print(rfModel)  # summary only

RandomForestClassificationModel: uid=RandomForestClassifier_b99fe1917272, numTrees=10, numClasses=376, numFeatures=10


## Random Forest Predicting reactionoutcome 

In [None]:
data.printSchema()

In [None]:
from pyspark.sql.types import IntegerType
data = data.withColumn("op_medicinalproduct",F.col("op_medicinalproduct").cast(IntegerType()))\
    .withColumn("op_activesubstancename",F.col("op_activesubstancename").cast(IntegerType()))\
    .withColumn("op_drugindication",F.col("op_drugindication").cast(IntegerType()))\
    .withColumn("seriousnessdeath",F.col("seriousnessdeath").cast(IntegerType()))\
    .withColumn("seriousnesslifethreatening",F.col("seriousnesslifethreatening").cast(IntegerType()))\
    .withColumn("seriousnesshospitalization",F.col("seriousnesshospitalization").cast(IntegerType()))\
    .withColumn("seriousnessdisabling",F.col("seriousnessdisabling").cast(IntegerType()))\
    .withColumn("seriousnesscongenitalanomali",F.col("seriousnesscongenitalanomali").cast(IntegerType()))\
    .withColumn("seriousnessother",F.col("seriousnessother").cast(IntegerType()))\
    .withColumn("patientonsetage",F.col("patientonsetage").cast(IntegerType()))\
    .withColumn("reactionoutcome",F.col("reactionoutcome").cast(IntegerType()))

In [None]:
# data = data.drop(*["route","brand_name","generic_name","reactionoutcome","drugindication","medicinalproduct","activesubstancename"])

In [None]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
assembler = VectorAssembler(
    inputCols=["op_activesubstancename", "op_medicinalproduct", "op_drugindication","seriousnessdeath","seriousnesslifethreatening","seriousnesshospitalization","seriousnessdisabling","seriousnesscongenitalanomali","seriousnessother","patientonsetage"],
    outputCol="features"
)

data = assembler.transform(data)

In [None]:
# data.select(["op_activesubstancename", "op_medicinalproduct", "op_drugindication","features"]).show()

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
# labelIndexer = StringIndexer(inputCol="reactionmeddrapt", outputCol="indexedLabel").fit(data)
# indexer_list.append(labelIndexer)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)


In [None]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [None]:
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="reactionoutcome", featuresCol="indexedFeatures", numTrees=10)

In [None]:
# trainingData.count()

In [None]:
# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])
# pipeline = Pipeline(stages=indexer_list)

In [None]:
# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

In [None]:
# Select example rows to display.
predictions.select("prediction", "reactionoutcome", "features").show(5)

In [None]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="reactionoutcome", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

## Gradient Boosted Tree Classifier

In [51]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, IndexToString
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [43]:
from pyspark.sql.types import IntegerType
data = data.withColumn("op_medicinalproduct",F.col("op_medicinalproduct").cast(IntegerType()))\
    .withColumn("op_activesubstancename",F.col("op_activesubstancename").cast(IntegerType()))\
    .withColumn("op_drugindication",F.col("op_drugindication").cast(IntegerType()))\
    .withColumn("seriousnessdeath",F.col("seriousnessdeath").cast(IntegerType()))\
    .withColumn("seriousnesslifethreatening",F.col("seriousnesslifethreatening").cast(IntegerType()))\
    .withColumn("seriousnesshospitalization",F.col("seriousnesshospitalization").cast(IntegerType()))\
    .withColumn("seriousnessdisabling",F.col("seriousnessdisabling").cast(IntegerType()))\
    .withColumn("seriousnesscongenitalanomali",F.col("seriousnesscongenitalanomali").cast(IntegerType()))\
    .withColumn("seriousnessother",F.col("seriousnessother").cast(IntegerType()))\
    .withColumn("patientonsetage",F.col("patientonsetage").cast(IntegerType()))

In [44]:
# data = data.drop(*["route","brand_name","generic_name","reactionoutcome","drugindication","medicinalproduct","activesubstancename"])

In [45]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
assembler = VectorAssembler(
    inputCols=["op_activesubstancename", "op_medicinalproduct", "op_drugindication","seriousnessdeath","seriousnesslifethreatening","seriousnesshospitalization","seriousnessdisabling","seriousnesscongenitalanomali","seriousnessother","patientonsetage"],
    outputCol="features"
)

data = assembler.transform(data)

In [46]:
# data.select(["op_activesubstancename", "op_medicinalproduct", "op_drugindication","features"]).show()

In [47]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="reactionmeddrapt", outputCol="indexedLabel").fit(data)
# indexer_list.append(labelIndexer)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)


In [48]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [49]:
# Train a RandomForest model.
# rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)

In [52]:
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

In [54]:
# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt, labelConverter])
# pipeline = Pipeline(stages=indexer_list)

In [55]:
# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

Py4JJavaError: An error occurred while calling o829.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 507.0 failed 1 times, most recent failure: Lost task 0.0 in stage 507.0 (TID 299) (Akshay executor driver): java.lang.RuntimeException: Labels MUST be in {0, 1}, but got 61.0
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage213.project_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage213.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$SliceIterator.hasNext(Iterator.scala:268)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at scala.collection.AbstractIterator.to(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$take$2(RDD.scala:1492)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.rdd.RDD.$anonfun$take$1(RDD.scala:1492)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.take(RDD.scala:1465)
	at org.apache.spark.ml.tree.impl.DecisionTreeMetadata$.buildMetadata(DecisionTreeMetadata.scala:119)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.boost(GradientBoostedTrees.scala:333)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.run(GradientBoostedTrees.scala:61)
	at org.apache.spark.ml.classification.GBTClassifier.$anonfun$train$1(GBTClassifier.scala:201)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:170)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:58)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:114)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:78)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.RuntimeException: Labels MUST be in {0, 1}, but got 61.0
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage213.project_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage213.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$SliceIterator.hasNext(Iterator.scala:268)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at scala.collection.AbstractIterator.to(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$take$2(RDD.scala:1492)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


In [None]:
# Select example rows to display.
predictions.select("predictedLabel", "reactionmeddrapt", "features").show(5)

In [None]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

gbtModel = model.stages[2]
print(gbtModel)  # summary only