## Data Transformation from JSON to Dataframe

### All Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, explode, col, arrays_zip
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
import pyspark.sql.functions as F
from pyspark.sql.functions import sum,avg,max
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
import findspark
findspark.init()

### Spark Session

In [2]:
spark = SparkSession.builder \
    .master("local[*]") \
    .config("spark.driver.memory", "12g") \
    .config("spark.executor.memory", "12g") \
    .appName("data_cleaning") \
    .getOrCreate()

### Read File

In [3]:
json_data = spark.read.option("multiline","true").json(["./Data/JSON/drug-event-0033-of-0034.json"])
# json_data = spark.read.option("multiline","true").json("./Data/JSON")

In [4]:
json_data.printSchema()

root
 |-- meta: struct (nullable = true)
 |    |-- disclaimer: string (nullable = true)
 |    |-- last_updated: string (nullable = true)
 |    |-- license: string (nullable = true)
 |    |-- results: struct (nullable = true)
 |    |    |-- limit: long (nullable = true)
 |    |    |-- skip: long (nullable = true)
 |    |    |-- total: long (nullable = true)
 |    |-- terms: string (nullable = true)
 |-- results: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- authoritynumb: string (nullable = true)
 |    |    |-- companynumb: string (nullable = true)
 |    |    |-- duplicate: string (nullable = true)
 |    |    |-- fulfillexpeditecriteria: string (nullable = true)
 |    |    |-- occurcountry: string (nullable = true)
 |    |    |-- patient: struct (nullable = true)
 |    |    |    |-- drug: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- actiondrug: string (nullable = true)
 |    

In [6]:
exploded_results = json_data.select(explode(F.col("results")).alias("exploded_results"))

In [9]:
all_keys = []

### Converting Nested JSON Data into Columns

In [10]:
# temp_data.withColumn("keys", F.json_object_keys(temp_data.exploded_array)).show()
keys = exploded_results.select(F.col("exploded_results.*")).columns
keys = ["exploded_results."+str(i) for i in keys]
all_keys.extend(keys)

In [11]:
patient_keys = exploded_results.select(F.col("exploded_results.patient.*")).columns
patient_keys = ["exploded_results.patient."+str(i) for i in patient_keys]
all_keys.extend(patient_keys)
updated_data = exploded_results.select(all_keys)
updated_data = updated_data.drop(F.col("patient"))
all_keys = updated_data.columns

In [13]:
updated_data = updated_data.select(all_keys)\
            .withColumn("explode_drug",F.explode(F.col("drug")))
drug_keys = updated_data.select(F.col("explode_drug.*")).columns
drug_keys = ["explode_drug."+i for i in drug_keys]
all_keys.extend(drug_keys)

In [14]:
updated_data = updated_data.select(all_keys)

In [15]:
updated_data = updated_data.drop(*['authoritynumb','duplicate','reportduplicate','patientagegroup','patientweight','summary'])

In [16]:
updated_data = updated_data.where(F.col("drugindication") != 'NULL')

In [17]:
updated_data = updated_data.where(F.col("drugindication") != "Product used for unknown indication")

In [20]:
all_keys = updated_data.columns

In [21]:
all_keys = updated_data.columns
updated_data = updated_data.select(all_keys)\
            .withColumn("explode_reaction",F.explode(F.col("reaction")))
all_keys = updated_data.columns
reaction_keys = updated_data.select(F.col("explode_reaction.*")).columns
reaction_keys = ["explode_reaction."+i for i in reaction_keys]
all_keys.extend(reaction_keys)

In [22]:
updated_data = updated_data.select(all_keys)

In [23]:
updated_data = updated_data.where(F.col("reactionmeddrapt") != 'NULL')

In [24]:
updated_data = updated_data.select(["seriousnessdeath","seriousnesslifethreatening","seriousnesshospitalization","seriousnessdisabling","seriousnesscongenitalanomali","seriousnessother","patientonsetage","reactionmeddrapt","reactionoutcome","drugindication","activesubstance.activesubstancename","medicinalproduct","openfda.route","openfda.brand_name","openfda.generic_name"])

In [26]:
updated_data = updated_data.dropna()

In [27]:
data = updated_data.where(F.col("reactionmeddrapt") != 'Off label use')\
                            .where(F.col("drugindication") != 'Off label use')

In [28]:
for c in ["activesubstancename","medicinalproduct","drugindication","reactionmeddrapt"]:
    grouped_data = data.groupby(F.col(c)).count()
    filtered_data = grouped_data.orderBy("count",ascending=False).limit(10)
    data = data.join(filtered_data.select(c),on=c, how="inner")

## Data Encoding

In [35]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.functions import vector_to_array

In [38]:
data.printSchema()

root
 |-- reactionmeddrapt: string (nullable = true)
 |-- drugindication: string (nullable = true)
 |-- medicinalproduct: string (nullable = true)
 |-- activesubstancename: string (nullable = true)
 |-- seriousnessdeath: string (nullable = true)
 |-- seriousnesslifethreatening: string (nullable = true)
 |-- seriousnesshospitalization: string (nullable = true)
 |-- seriousnessdisabling: string (nullable = true)
 |-- seriousnesscongenitalanomali: string (nullable = true)
 |-- seriousnessother: string (nullable = true)
 |-- patientonsetage: string (nullable = true)
 |-- reactionoutcome: string (nullable = true)
 |-- route: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- brand_name: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- generic_name: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [39]:
indexer_list = []
for c in ["activesubstancename","medicinalproduct","drugindication"]:
    indexer = StringIndexer(inputCol=c, outputCol="op_{}".format(c))
    indexer_list.append(indexer)

In [40]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=indexer_list)
pipeline_model = pipeline.fit(data)

In [41]:
data = pipeline_model.transform(data)

## Random Forest Predicting reactionoutcome 

In [None]:
data.printSchema()

In [42]:
from pyspark.sql.types import IntegerType
data = data.withColumn("op_medicinalproduct",F.col("op_medicinalproduct").cast(IntegerType()))\
    .withColumn("op_activesubstancename",F.col("op_activesubstancename").cast(IntegerType()))\
    .withColumn("op_drugindication",F.col("op_drugindication").cast(IntegerType()))\
    .withColumn("seriousnessdeath",F.col("seriousnessdeath").cast(IntegerType()))\
    .withColumn("seriousnesslifethreatening",F.col("seriousnesslifethreatening").cast(IntegerType()))\
    .withColumn("seriousnesshospitalization",F.col("seriousnesshospitalization").cast(IntegerType()))\
    .withColumn("seriousnessdisabling",F.col("seriousnessdisabling").cast(IntegerType()))\
    .withColumn("seriousnesscongenitalanomali",F.col("seriousnesscongenitalanomali").cast(IntegerType()))\
    .withColumn("seriousnessother",F.col("seriousnessother").cast(IntegerType()))\
    .withColumn("patientonsetage",F.col("patientonsetage").cast(IntegerType()))\
    .withColumn("reactionoutcome",F.col("reactionoutcome").cast(IntegerType()))

In [43]:
# data = data.drop(*["route","brand_name","generic_name","reactionoutcome","drugindication","medicinalproduct","activesubstancename"])

In [44]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
assembler = VectorAssembler(
    inputCols=["op_activesubstancename", "op_medicinalproduct", "op_drugindication","seriousnessdeath","seriousnesslifethreatening","seriousnesshospitalization","seriousnessdisabling","seriousnesscongenitalanomali","seriousnessother","patientonsetage"],
    outputCol="features"
)

data = assembler.transform(data)

In [45]:
# data.select(["op_activesubstancename", "op_medicinalproduct", "op_drugindication","features"]).show()

In [46]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

labelIndexer = StringIndexer(inputCol="reactionoutcome", outputCol="label").fit(data)
featureIndexer =\
    VectorIndexer(inputCol="feature", outputCol="features", maxCategories=4).fit(data)


In [47]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [48]:
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100)

In [49]:
# trainingData.count()

In [50]:
# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])
# pipeline = Pipeline(stages=indexer_list)

In [51]:
# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

In [52]:
# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

+----------+---------------+--------------------+
|prediction|reactionoutcome|            features|
+----------+---------------+--------------------+
|       5.0|              5|[0.0,0.0,7.0,1.0,...|
|       5.0|              5|[0.0,0.0,2.0,1.0,...|
|       5.0|              5|[0.0,0.0,2.0,1.0,...|
|       5.0|              5|[0.0,0.0,2.0,1.0,...|
|       5.0|              5|[0.0,0.0,2.0,1.0,...|
+----------+---------------+--------------------+
only showing top 5 rows



In [53]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.071775


## Random Forest for predicting Predicting Reaction

In [38]:
from pyspark.sql.types import IntegerType
data = data.withColumn("op_medicinalproduct",F.col("op_medicinalproduct").cast(IntegerType()))\
    .withColumn("op_activesubstancename",F.col("op_activesubstancename").cast(IntegerType()))\
    .withColumn("op_drugindication",F.col("op_drugindication").cast(IntegerType()))\
    .withColumn("seriousnessdeath",F.col("seriousnessdeath").cast(IntegerType()))\
    .withColumn("seriousnesslifethreatening",F.col("seriousnesslifethreatening").cast(IntegerType()))\
    .withColumn("seriousnesshospitalization",F.col("seriousnesshospitalization").cast(IntegerType()))\
    .withColumn("seriousnessdisabling",F.col("seriousnessdisabling").cast(IntegerType()))\
    .withColumn("seriousnesscongenitalanomali",F.col("seriousnesscongenitalanomali").cast(IntegerType()))\
    .withColumn("seriousnessother",F.col("seriousnessother").cast(IntegerType()))\
    .withColumn("patientonsetage",F.col("patientonsetage").cast(IntegerType()))

In [39]:
# data = data.drop(*["route","brand_name","generic_name","reactionoutcome","drugindication","medicinalproduct","activesubstancename"])

In [40]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
assembler = VectorAssembler(
    inputCols=["op_activesubstancename", "op_medicinalproduct", "op_drugindication","seriousnessdeath","seriousnesslifethreatening","seriousnesshospitalization","seriousnessdisabling","seriousnesscongenitalanomali","seriousnessother","patientonsetage"],
    outputCol="features"
)

data = assembler.transform(data)

In [41]:
# data.select(["op_activesubstancename", "op_medicinalproduct", "op_drugindication","features"]).show()

In [66]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="reactionmeddrapt", outputCol="indexedLabel").fit(data)
# indexer_list.append(labelIndexer)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
# featureIndexer =\
#     VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)


In [67]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [68]:
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=100)

In [69]:
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

In [70]:
# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])
# pipeline = Pipeline(stages=indexer_list)

In [71]:
# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

In [72]:
# Select example rows to display.
predictions.select("predictedLabel", "reactionmeddrapt", "features").show(5)

+--------------------+----------------+--------------------+
|      predictedLabel|reactionmeddrapt|            features|
+--------------------+----------------+--------------------+
|Systemic lupus er...|        Alopecia|[0.0,0.0,2.0,0.0,...|
|Systemic lupus er...|        Alopecia|[0.0,0.0,2.0,0.0,...|
|Systemic lupus er...|        Alopecia|[0.0,0.0,5.0,0.0,...|
|Systemic lupus er...|        Alopecia|[0.0,0.0,5.0,0.0,...|
|           Synovitis|        Alopecia|[6.0,5.0,3.0,0.0,...|
+--------------------+----------------+--------------------+
only showing top 5 rows



In [73]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.892457


In [74]:
rfModel = model.stages[2]
print(rfModel)  # summary only

RandomForestClassificationModel: uid=RandomForestClassifier_c1adb11429d7, numTrees=100, numClasses=10, numFeatures=10


## Naive Bayes for predciting reaction outcome

In [42]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import StringIndexer, VectorIndexer, IndexToString
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [43]:
from pyspark.sql.types import IntegerType
data = data.withColumn("op_medicinalproduct",F.col("op_medicinalproduct").cast(IntegerType()))\
    .withColumn("op_activesubstancename",F.col("op_activesubstancename").cast(IntegerType()))\
    .withColumn("op_drugindication",F.col("op_drugindication").cast(IntegerType()))\
    .withColumn("seriousnessdeath",F.col("seriousnessdeath").cast(IntegerType()))\
    .withColumn("seriousnesslifethreatening",F.col("seriousnesslifethreatening").cast(IntegerType()))\
    .withColumn("seriousnesshospitalization",F.col("seriousnesshospitalization").cast(IntegerType()))\
    .withColumn("seriousnessdisabling",F.col("seriousnessdisabling").cast(IntegerType()))\
    .withColumn("seriousnesscongenitalanomali",F.col("seriousnesscongenitalanomali").cast(IntegerType()))\
    .withColumn("seriousnessother",F.col("seriousnessother").cast(IntegerType()))\
    .withColumn("patientonsetage",F.col("patientonsetage").cast(IntegerType()))

In [44]:
# data = data.drop(*["route","brand_name","generic_name","reactionoutcome","drugindication","medicinalproduct","activesubstancename"])

In [45]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
assembler = VectorAssembler(
    inputCols=["op_activesubstancename", "op_medicinalproduct", "op_drugindication","seriousnessdeath","seriousnesslifethreatening","seriousnesshospitalization","seriousnessdisabling","seriousnesscongenitalanomali","seriousnessother","patientonsetage"],
    outputCol="feature"
)

data = assembler.transform(data)

In [46]:
# data.select(["op_activesubstancename", "op_medicinalproduct", "op_drugindication","features"]).show()

In [48]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="reactionoutcome", outputCol="label").fit(data)
# indexer_list.append(labelIndexer)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="feature", outputCol="features", maxCategories=4).fit(data)


In [49]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [50]:
nb = NaiveBayes(featuresCol="features", labelCol="label", modelType="multinomial")

In [51]:
# model = nb.fit(data)

In [52]:
# predictions = model.transform(data)

In [53]:
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

In [54]:
# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, nb, labelConverter])
# pipeline = Pipeline(stages=indexer_list)

In [55]:
# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

In [56]:
# Select example rows to display.
predictions.select("predictedLabel", "reactionoutcome", "features").show(5)

+--------------+---------------+--------------------+
|predictedLabel|reactionoutcome|            features|
+--------------+---------------+--------------------+
|             2|              5|[0.0,0.0,7.0,0.0,...|
|             5|              5|[0.0,0.0,2.0,0.0,...|
|             5|              5|[0.0,0.0,2.0,0.0,...|
|             5|              5|[0.0,0.0,2.0,0.0,...|
|             5|              5|[0.0,0.0,2.0,0.0,...|
+--------------+---------------+--------------------+
only showing top 5 rows



In [57]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))


Test Error = 0.2


## MLP for prediction reaction outcome

In [42]:
from pyspark.sql.types import IntegerType
data = data.withColumn("op_medicinalproduct",F.col("op_medicinalproduct").cast(IntegerType()))\
    .withColumn("op_activesubstancename",F.col("op_activesubstancename").cast(IntegerType()))\
    .withColumn("op_drugindication",F.col("op_drugindication").cast(IntegerType()))\
    .withColumn("seriousnessdeath",F.col("seriousnessdeath").cast(IntegerType()))\
    .withColumn("seriousnesslifethreatening",F.col("seriousnesslifethreatening").cast(IntegerType()))\
    .withColumn("seriousnesshospitalization",F.col("seriousnesshospitalization").cast(IntegerType()))\
    .withColumn("seriousnessdisabling",F.col("seriousnessdisabling").cast(IntegerType()))\
    .withColumn("seriousnesscongenitalanomali",F.col("seriousnesscongenitalanomali").cast(IntegerType()))\
    .withColumn("seriousnessother",F.col("seriousnessother").cast(IntegerType()))\
    .withColumn("patientonsetage",F.col("patientonsetage").cast(IntegerType()))

In [43]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
assembler = VectorAssembler(
    inputCols=["op_activesubstancename", "op_medicinalproduct", "op_drugindication","seriousnessdeath","seriousnesslifethreatening","seriousnesshospitalization","seriousnessdisabling","seriousnesscongenitalanomali","seriousnessother","patientonsetage"],
    outputCol="feature"
)

data = assembler.transform(data)

In [44]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

labelIndexer = StringIndexer(inputCol="reactionoutcome", outputCol="label").fit(data)

featureIndexer =\
    VectorIndexer(inputCol="feature", outputCol="features", maxCategories=4).fit(data)

In [45]:
data = labelIndexer.transform(data)
data = featureIndexer.transform(data)

In [46]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [47]:
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [49]:
from pyspark.ml.linalg import Vectors

mlp = MultilayerPerceptronClassifier(layers=[10, 20, 6], seed=123)
mlp.setMaxIter(500)
mlp.getMaxIter()
mlp.getBlockSize()
mlp.setBlockSize(1)
mlp.getBlockSize()
model = mlp.fit(trainingData)
model.setFeaturesCol("features")
# testData.head().features
# trainingData.head().features
# model.predict(testData.head().features)
# model.predictRaw(testData.head().features)
# model.predictProbability(testData.head().features)

MultilayerPerceptronClassificationModel: uid=MultilayerPerceptronClassifier_e7791adbd601, numLayers=3, numClasses=6, numFeatures=10

In [50]:
model.predictProbability(testData.head().features)

DenseVector([1.0, 0.0, 0.0, 0.0, 0.0, 0.0])

In [51]:
predictions = model.transform(testData)

In [55]:
predictions.select("features", "prediction","label").show(5)

+--------------------+----------+-----+
|            features|prediction|label|
+--------------------+----------+-----+
|[0.0,0.0,7.0,0.0,...|       0.0|  0.0|
|[0.0,0.0,2.0,0.0,...|       0.0|  0.0|
|[0.0,0.0,2.0,0.0,...|       0.0|  1.0|
|[0.0,0.0,2.0,0.0,...|       0.0|  0.0|
|[0.0,0.0,8.0,0.0,...|       0.0|  0.0|
+--------------------+----------+-----+
only showing top 5 rows



In [56]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.088644


## MLP Reaction

In [75]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [76]:
data = labelIndexer.transform(data)
data = featureIndexer.transform(data)

In [77]:
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [78]:
trainingData.head().features

DenseVector([0.0, 0.0, 7.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 44.0])

In [79]:
from pyspark.ml.linalg import Vectors

mlp = MultilayerPerceptronClassifier(layers=[10, 20, 10], seed=123)
mlp.setMaxIter(500)
mlp.getMaxIter()
mlp.getBlockSize()
mlp.setBlockSize(1)
mlp.getBlockSize()
model = mlp.fit(trainingData)
model.setFeaturesCol("features")

Py4JJavaError: An error occurred while calling o1491.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 943.0 failed 1 times, most recent failure: Lost task 0.0 in stage 943.0 (TID 836) (Akshay executor driver): java.lang.RuntimeException: Labels MUST be in [0, 1), but got 1.0
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage43.project_doConsume_1$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage43.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:223)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:302)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1597)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1524)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1588)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1389)
	at org.apache.spark.storage.BlockManager.getOrElseUpdateRDDBlock(BlockManager.scala:1343)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:379)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1296)
	at org.apache.spark.mllib.optimization.LBFGS$.runLBFGS(LBFGS.scala:195)
	at org.apache.spark.mllib.optimization.LBFGS.optimizeWithLossReturned(LBFGS.scala:154)
	at org.apache.spark.ml.ann.FeedForwardTrainer.train(Layer.scala:855)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.$anonfun$train$1(MultilayerPerceptronClassifier.scala:233)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:185)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:94)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:114)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.RuntimeException: Labels MUST be in [0, 1), but got 1.0
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage43.project_doConsume_1$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage43.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:223)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:302)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1597)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1524)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1588)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1389)
	at org.apache.spark.storage.BlockManager.getOrElseUpdateRDDBlock(BlockManager.scala:1343)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:379)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
