## Data Transformation from JSON to DataFrame

### All Imports

In [1]:
import os
import sys

# Point PySpark's interpreter settings at the Python running this session.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# findspark must be initialised BEFORE the first pyspark import: its job is to
# put a Spark installation on sys.path, which is pointless once pyspark has
# already been imported (and too late if that import would have failed).
import findspark
findspark.init()

import pandas as pd
import numpy as np
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, explode, col, arrays_zip
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
import pyspark.sql.functions as F
# NOTE: the next import shadows the Python builtins `sum` and `max` for the
# rest of the file; prefer the `F.` prefix in new code.
from pyspark.sql.functions import sum,avg,max

### Spark Session

In [2]:
# Create (or reuse) the local Spark session used by the cleaning steps below.
spark = (
    SparkSession.builder
    .appName("data_cleaning")
    .master("local")
    .config("spark.driver.memory", "12g")
    .config("spark.executor.memory", "12g")
    .getOrCreate()
)

### Read File

In [3]:
# Load the JSON input found at ./Data/JSON with the "multiline" option enabled.
# Single-file variant kept for quick experiments:
# json_data = spark.read.option("multiline","true").json(["./Data/JSON/drug-event-0019-of-0031.json"])
multiline_reader = spark.read.option("multiline","true")
json_data = multiline_reader.json("./Data/JSON")

In [4]:
# Print the schema Spark inferred for the raw JSON input.
json_data.printSchema()

root
 |-- meta: struct (nullable = true)
 |    |-- disclaimer: string (nullable = true)
 |    |-- last_updated: string (nullable = true)
 |    |-- license: string (nullable = true)
 |    |-- results: struct (nullable = true)
 |    |    |-- limit: long (nullable = true)
 |    |    |-- skip: long (nullable = true)
 |    |    |-- total: long (nullable = true)
 |    |-- terms: string (nullable = true)
 |-- results: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- authoritynumb: string (nullable = true)
 |    |    |-- companynumb: string (nullable = true)
 |    |    |-- duplicate: string (nullable = true)
 |    |    |-- fulfillexpeditecriteria: string (nullable = true)
 |    |    |-- occurcountry: string (nullable = true)
 |    |    |-- patient: struct (nullable = true)
 |    |    |    |-- drug: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- actiondrug: string (nullable = true)
 |    

In [5]:
# json_data.show()

In [6]:
# Turn every element of the top-level `results` array into its own row,
# exposed as a single struct column named "exploded_results".
results_as_rows = explode(F.col("results")).alias("exploded_results")
exploded_results = json_data.select(results_as_rows)

In [7]:
# exploded_results.count()
#12000

In [8]:
# exploded_results.show()

In [9]:
# Accumulator for the qualified column names selected in the cells below.
all_keys = []

### Converting Nested JSON Data into Columns

In [10]:
# temp_data.withColumn("keys", F.json_object_keys(temp_data.exploded_array)).show()
# Qualified names ("exploded_results.<field>") of every field of the exploded
# record struct, appended to the running column list.
keys = [
    "exploded_results." + str(field_name)
    for field_name in exploded_results.select(F.col("exploded_results.*")).columns
]
all_keys.extend(keys)

In [11]:
# Qualified names of the fields nested under the `patient` struct.
patient_keys = [
    "exploded_results.patient." + str(field_name)
    for field_name in exploded_results.select(F.col("exploded_results.patient.*")).columns
]
all_keys.extend(patient_keys)
# Select record-level and patient-level fields side by side, then discard the
# `patient` struct column itself.
flattened_records = exploded_results.select(all_keys)
updated_data = flattened_records.drop(F.col("patient"))
# From here on the column list holds the flattened (unqualified) names.
all_keys = updated_data.columns

In [12]:
# updated_data.count()
#12000

In [13]:
# One row per entry of the `drug` array, exposed as struct column "explode_drug".
updated_data = updated_data.select(all_keys).withColumn(
    "explode_drug", F.explode(F.col("drug"))
)
# Qualified names of the fields of a single drug entry.
drug_keys = [
    "explode_drug." + field_name
    for field_name in updated_data.select(F.col("explode_drug.*")).columns
]
all_keys.extend(drug_keys)

In [14]:
# Project onto the collected column list, which now also names the
# "explode_drug.*" fields, so each drug field becomes its own column.
updated_data = updated_data.select(all_keys)

In [15]:
# Columns removed before the analysis.
unused_columns = ('authoritynumb','duplicate','reportduplicate','patientagegroup','patientweight','summary')
updated_data = updated_data.drop(*unused_columns)

In [16]:
# Keep only rows whose drug indication differs from the literal string 'NULL'.
has_indication = F.col("drugindication") != 'NULL'
updated_data = updated_data.filter(has_indication)

In [17]:
# Discard rows carrying the placeholder indication text.
known_indication = F.col("drugindication") != "Product used for unknown indication"
updated_data = updated_data.filter(known_indication)

In [18]:
# updated_data.cache()

In [19]:
# updated_data.show()

In [20]:
# Refresh the column list from the current (filtered, flattened) frame.
all_keys = updated_data.columns

In [21]:
# One row per entry of the `reaction` array, exposed as "explode_reaction".
all_keys = updated_data.columns
updated_data = updated_data.select(all_keys).withColumn(
    "explode_reaction", F.explode(F.col("reaction"))
)
# Re-read the columns (now including "explode_reaction") and append the
# qualified names of the fields of a single reaction entry.
all_keys = updated_data.columns
reaction_keys = [
    "explode_reaction." + field_name
    for field_name in updated_data.select(F.col("explode_reaction.*")).columns
]
all_keys.extend(reaction_keys)

In [22]:
# Project onto the column list, which now also names the
# "explode_reaction.*" fields, so each reaction field becomes its own column.
updated_data = updated_data.select(all_keys)

In [23]:
# Keep only rows whose reaction term differs from the literal string 'NULL'.
has_reaction = F.col("reactionmeddrapt") != 'NULL'
updated_data = updated_data.filter(has_reaction)

In [24]:
# Columns carried forward into the filtering and modelling steps; dotted names
# pull a nested field out of its parent struct.
model_columns = [
    "seriousnessdeath",
    "seriousnesslifethreatening",
    "seriousnesshospitalization",
    "seriousnessdisabling",
    "seriousnesscongenitalanomali",
    "seriousnessother",
    "patientonsetage",
    "reactionmeddrapt",
    "reactionoutcome",
    "drugindication",
    "activesubstance.activesubstancename",
    "medicinalproduct",
    "openfda.route",
    "openfda.brand_name",
    "openfda.generic_name",
]
updated_data = updated_data.select(model_columns)

In [25]:
# updated_data.cache()

In [26]:
# Drop rows with missing values (dropna with its default arguments).
updated_data = updated_data.dropna()

In [27]:
# Exclude rows where either the reaction or the indication is 'Off label use'.
off_label = 'Off label use'
not_off_label = (F.col("reactionmeddrapt") != off_label) & (F.col("drugindication") != off_label)
data = updated_data.where(not_off_label)

In [28]:
# For each categorical column, keep only the rows whose value occurs more often
# than the average value of that column does.
for c in ["activesubstancename","medicinalproduct","drugindication","reactionmeddrapt"]:
    # One row per distinct value with its occurrence count. The values are
    # already unique after this groupBy, so the previous second
    # groupBy(c).agg(avg("count")) pass was a no-op that only added a shuffle.
    grouped_data = data.groupBy(c).count()
    # Single-row frame holding the mean occurrence count across all values.
    overall_avg_df = grouped_data.agg(F.avg("count").alias("overall_avg"))
    # Attach that mean to every value and keep the above-average ones.
    filtered_data = grouped_data.crossJoin(overall_avg_df).filter(F.col("count") > F.col("overall_avg"))
    # Inner join back to the data: rows whose value did not pass are dropped.
    data = data.join(filtered_data.select(c),on=c, how="inner")

In [29]:
# data.createOrReplaceTempView("transformed_data")

## Storing Sample Data

In [42]:
# df_sample = updated_data.sample(withReplacement=False, fraction=0.1)

In [43]:
# pandas_df = data.toPandas()

In [44]:
# pandas_df

In [45]:
# pandas_df["activesubstancename"].value_counts()

In [46]:
# pandas_df.to_csv("transformed_data.csv")

## Data Encoding

In [30]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.functions import vector_to_array

In [31]:
# spark2 = SparkSession.builder \
#     .master("local") \
#     .config("spark.driver.memory", "12g") \
#     .config("spark.executor.memory", "12g") \
#     .appName("data_encoding") \
#     .getOrCreate()

In [32]:
# data = spark.read.csv("./transformed_data.csv",header=True)
# data = spark2.sql("SELECT * FROM transformed_data")

In [33]:
# Print the schema of the filtered frame before encoding.
data.printSchema()

root
 |-- reactionmeddrapt: string (nullable = true)
 |-- drugindication: string (nullable = true)
 |-- medicinalproduct: string (nullable = true)
 |-- activesubstancename: string (nullable = true)
 |-- seriousnessdeath: string (nullable = true)
 |-- seriousnesslifethreatening: string (nullable = true)
 |-- seriousnesshospitalization: string (nullable = true)
 |-- seriousnessdisabling: string (nullable = true)
 |-- seriousnesscongenitalanomali: string (nullable = true)
 |-- seriousnessother: string (nullable = true)
 |-- patientonsetage: string (nullable = true)
 |-- reactionoutcome: string (nullable = true)
 |-- route: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- brand_name: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- generic_name: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [34]:
# One StringIndexer per categorical column; each writes to "op_<column>".
indexer_list = [
    StringIndexer(inputCol=source_col, outputCol="op_{}".format(source_col))
    for source_col in ["activesubstancename","medicinalproduct","drugindication"]
]

In [None]:
from pyspark.ml import Pipeline
# Chain the StringIndexers from indexer_list and fit them all against `data`.
pipeline = Pipeline(stages=indexer_list)
pipeline_model = pipeline.fit(data)

In [None]:
# Apply the fitted indexers; this adds the "op_*" output columns to `data`.
data = pipeline_model.transform(data)

## Random Forest Predicting Reaction

In [45]:
# spark = SparkSession.builder \
#     .master("local") \
#     .config("spark.driver.memory", "12g") \
#     .config("spark.executor.memory", "12g") \
#     .appName("random_forest") \
#     .getOrCreate()

In [46]:
# data = spark.read.csv("./encoded.csv",header=True)

In [44]:
# Print the schema of the encoded frame.
data.printSchema()

root
 |-- reactionmeddrapt: string (nullable = true)
 |-- drugindication: string (nullable = true)
 |-- medicinalproduct: string (nullable = true)
 |-- activesubstancename: string (nullable = true)
 |-- seriousnessdeath: integer (nullable = true)
 |-- seriousnesslifethreatening: integer (nullable = true)
 |-- seriousnesshospitalization: integer (nullable = true)
 |-- seriousnessdisabling: integer (nullable = true)
 |-- seriousnesscongenitalanomali: integer (nullable = true)
 |-- seriousnessother: integer (nullable = true)
 |-- patientonsetage: integer (nullable = true)
 |-- reactionoutcome: string (nullable = true)
 |-- route: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- brand_name: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- generic_name: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- op_activesubstancename: integer (nullable = true)
 |-- op_medicinalproduct: integer (nullable = true)
 |-- o

In [45]:
from pyspark.sql.types import IntegerType
# Cast the indexed categoricals, the seriousness flags and the onset age to
# integers, replacing each column in place.
integer_columns = [
    "op_medicinalproduct",
    "op_activesubstancename",
    "op_drugindication",
    "seriousnessdeath",
    "seriousnesslifethreatening",
    "seriousnesshospitalization",
    "seriousnessdisabling",
    "seriousnesscongenitalanomali",
    "seriousnessother",
    "patientonsetage",
]
for integer_column in integer_columns:
    data = data.withColumn(integer_column, F.col(integer_column).cast(IntegerType()))

In [48]:
# data = data.drop(*["route","brand_name","generic_name","reactionoutcome","drugindication","medicinalproduct","activesubstancename"])

In [49]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
# Inputs of the model, packed into one vector column named "features".
feature_columns = [
    "op_activesubstancename",
    "op_medicinalproduct",
    "op_drugindication",
    "seriousnessdeath",
    "seriousnesslifethreatening",
    "seriousnesshospitalization",
    "seriousnessdisabling",
    "seriousnesscongenitalanomali",
    "seriousnessother",
    "patientonsetage",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

data = assembler.transform(data)

In [50]:
# data.select(["op_activesubstancename", "op_medicinalproduct", "op_drugindication","features"]).show()

In [51]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# The label indexer maps each reaction term to a numeric "indexedLabel".
# It is fitted on the whole dataset so that every label is part of the index.
label_indexer_spec = StringIndexer(inputCol="reactionmeddrapt", outputCol="indexedLabel")
labelIndexer = label_indexer_spec.fit(data)
# indexer_list.append(labelIndexer)
# The feature indexer detects categorical features automatically; with
# maxCategories=4, features with > 4 distinct values are treated as continuous.
feature_indexer_spec = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)
featureIndexer = feature_indexer_spec.fit(data)


In [52]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed is passed so the same rows land in each set on every run over
# the same input, which makes the reported test error comparable between runs.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

In [53]:
# Configure the random forest: 10 trees, trained on the indexed label and
# indexed feature columns.
rf = RandomForestClassifier(
    numTrees=10,
    featuresCol="indexedFeatures",
    labelCol="indexedLabel",
)


In [54]:
# Map numeric predictions back to the original reaction labels, using the
# label list learned by labelIndexer.
labelConverter = IndexToString(
    labels=labelIndexer.labels,
    inputCol="prediction",
    outputCol="predictedLabel",
)

In [55]:
# Chain the label indexer, feature indexer, forest and label converter into a
# single Pipeline; stage order here is the order in which they run.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])
# pipeline = Pipeline(stages=indexer_list)

In [56]:
# Train model.  This also runs the indexers.
# Only the training split is used for fitting.
model = pipeline.fit(trainingData)

# Make predictions on the held-out test split.
predictions = model.transform(testData)

In [57]:
# Display five example rows: predicted label, true label and feature vector.
preview_columns = ["predictedLabel", "reactionmeddrapt", "features"]
predictions.select(*preview_columns).show(5)

+----------------+--------------------+--------------------+
|  predictedLabel|    reactionmeddrapt|            features|
+----------------+--------------------+--------------------+
|       Diarrhoea|Abdominal discomfort|[22.0,21.0,61.0,2...|
|       Diarrhoea|Abdominal discomfort|[15.0,14.0,23.0,2...|
|Drug ineffective|Abdominal discomfort|[28.0,28.0,3.0,2....|
|Drug ineffective|Abdominal discomfort|[28.0,28.0,3.0,2....|
|Drug ineffective|Abdominal discomfort|[29.0,29.0,2.0,2....|
+----------------+--------------------+--------------------+
only showing top 5 rows



In [58]:
# Compare predictions with the true indexed labels and report the test error
# as 1 - accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)

Test Error = 0.893301


In [59]:
# Index 2 is the random forest: it is the third stage of the pipeline built as
# [labelIndexer, featureIndexer, rf, labelConverter].
rfModel = model.stages[2]
print(rfModel)  # summary only

RandomForestClassificationModel: uid=RandomForestClassifier_b99fe1917272, numTrees=10, numClasses=376, numFeatures=10


## Random Forest Predicting Reaction Outcome

In [34]:
# Print the schema of `data` before preparing the reaction-outcome model.
data.printSchema()

root
 |-- reactionmeddrapt: string (nullable = true)
 |-- drugindication: string (nullable = true)
 |-- medicinalproduct: string (nullable = true)
 |-- activesubstancename: string (nullable = true)
 |-- seriousnessdeath: string (nullable = true)
 |-- seriousnesslifethreatening: string (nullable = true)
 |-- seriousnesshospitalization: string (nullable = true)
 |-- seriousnessdisabling: string (nullable = true)
 |-- seriousnesscongenitalanomali: string (nullable = true)
 |-- seriousnessother: string (nullable = true)
 |-- patientonsetage: string (nullable = true)
 |-- reactionoutcome: string (nullable = true)
 |-- route: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- brand_name: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- generic_name: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [35]:
from pyspark.sql.types import IntegerType
# Cast every model input to integer, replacing each column in place.
# "reactionoutcome" is included because it is used as the label column of the
# RandomForestClassifier and of the evaluator further down; it is a string in
# the schema, and Spark ML requires a numeric label column.
integer_columns = [
    "op_medicinalproduct",
    "op_activesubstancename",
    "op_drugindication",
    "seriousnessdeath",
    "seriousnesslifethreatening",
    "seriousnesshospitalization",
    "seriousnessdisabling",
    "seriousnesscongenitalanomali",
    "seriousnessother",
    "patientonsetage",
    "reactionoutcome",
]
for integer_column in integer_columns:
    data = data.withColumn(integer_column, F.col(integer_column).cast(IntegerType()))

In [36]:
# data = data.drop(*["route","brand_name","generic_name","reactionoutcome","drugindication","medicinalproduct","activesubstancename"])

In [37]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
assembler = VectorAssembler(
    inputCols=["op_activesubstancename", "op_medicinalproduct", "op_drugindication","seriousnessdeath","seriousnesslifethreatening","seriousnesshospitalization","seriousnessdisabling","seriousnesscongenitalanomali","seriousnessother","patientonsetage"],
    outputCol="features"
)

data = assembler.transform(data)

In [38]:
# data.select(["op_activesubstancename", "op_medicinalproduct", "op_drugindication","features"]).show()

In [39]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# No label indexer is fitted in this section; the earlier one is kept for reference:
# labelIndexer = StringIndexer(inputCol="reactionmeddrapt", outputCol="indexedLabel").fit(data)
# indexer_list.append(labelIndexer)
# The feature indexer detects categorical features automatically; with
# maxCategories=4, features with > 4 distinct values are treated as continuous.
feature_indexer_spec = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)
featureIndexer = feature_indexer_spec.fit(data)


In [40]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed is passed so the same rows land in each set on every run over
# the same input, which makes the reported test error comparable between runs.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

In [41]:
# Configure the random forest: 10 trees, with the "reactionoutcome" column as
# the label and the indexed feature vector as input.
rf = RandomForestClassifier(
    numTrees=10,
    featuresCol="indexedFeatures",
    labelCol="reactionoutcome",
)

In [49]:
# trainingData.count()

13484

In [42]:
# Chain the feature indexer and the forest in a Pipeline (no label indexer or
# label converter in this section).
pipeline = Pipeline(stages=[featureIndexer, rf])
# pipeline = Pipeline(stages=indexer_list)

In [43]:
# Train model.  This also runs the indexers.
# Only the training split is used for fitting.
model = pipeline.fit(trainingData)

# Make predictions on the held-out test split.
predictions = model.transform(testData)

In [45]:
# Display five example rows: prediction, true outcome and feature vector.
preview_columns = ["prediction", "reactionoutcome", "features"]
predictions.select(*preview_columns).show(5)

+----------+---------------+--------------------+
|prediction|reactionoutcome|            features|
+----------+---------------+--------------------+
|       6.0|              6|[2.0,2.0,2.0,2.0,...|
|       6.0|              6|[2.0,2.0,2.0,2.0,...|
|       6.0|              6|[2.0,2.0,2.0,2.0,...|
|       6.0|              3|[2.0,2.0,2.0,2.0,...|
|       6.0|              6|[2.0,2.0,1.0,2.0,...|
+----------+---------------+--------------------+
only showing top 5 rows



In [46]:
# Compare predictions with the true "reactionoutcome" values and report the
# test error as 1 - accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="reactionoutcome",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)

Test Error = 0.380564
