## Data Transformation from JSON to Dataframe

### All Imports

In [1]:
# --- Standard library ---
import os
import sys

# Make the Spark driver and its Python workers use the interpreter that runs
# this kernel, so both sides agree on the Python version and installed packages.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# findspark has to be initialised BEFORE the first `pyspark` import: its job is
# to locate the Spark installation and put pyspark on sys.path. Calling it after
# pyspark has already been imported has no effect on those imports.
import findspark
findspark.init()

# --- Third party ---
import numpy as np
import pandas as pd
import seaborn as sns

# --- PySpark ---
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, explode, col, arrays_zip
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

### Spark Session

In [2]:
# Build (or reuse) a local Spark session for the cleaning pipeline.
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .appName("data_cleaning")
    .getOrCreate()
)

### Read File

In [3]:
# The export is a single multi-line JSON document, hence the `multiline` option.
json_data = (
    spark.read
    .option("multiline", "true")
    .json("./Data/JSON/drug-event-0008-of-0034.json")
)

In [4]:
# Print the inferred schema tree to see how the report JSON is nested.
json_data.printSchema()

root
 |-- meta: struct (nullable = true)
 |    |-- disclaimer: string (nullable = true)
 |    |-- last_updated: string (nullable = true)
 |    |-- license: string (nullable = true)
 |    |-- results: struct (nullable = true)
 |    |    |-- limit: long (nullable = true)
 |    |    |-- skip: long (nullable = true)
 |    |    |-- total: long (nullable = true)
 |    |-- terms: string (nullable = true)
 |-- results: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- authoritynumb: string (nullable = true)
 |    |    |-- companynumb: string (nullable = true)
 |    |    |-- duplicate: string (nullable = true)
 |    |    |-- fulfillexpeditecriteria: string (nullable = true)
 |    |    |-- occurcountry: string (nullable = true)
 |    |    |-- patient: struct (nullable = true)
 |    |    |    |-- drug: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- actiondrug: string (nullable = true)
 |    

In [5]:
# Turn the `results` array into one row per element, held in a single struct column.
exploded_results = json_data.select(
    F.explode(F.col("results")).alias("exploded_results")
)

In [6]:
# exploded_results.count()
#12000

In [7]:
# Running list of column paths to select while flattening the nested report struct.
all_keys = []

### Converting Nested JSON Data into Columns

In [8]:
def remove_null_column(data, threshold=0.3):
    """Drop every column whose share of null values exceeds ``threshold``.

    The null share is computed against the actual number of rows in ``data``
    rather than a fixed constant, so the cut-off means the same thing
    regardless of how large the DataFrame is.

    Parameters
    ----------
    data : pyspark.sql.DataFrame
        DataFrame to prune.
    threshold : float, optional
        Maximum tolerated fraction of nulls per column (default 0.3, i.e. 30%).
        A column is removed only when its null fraction is strictly greater.

    Returns
    -------
    pyspark.sql.DataFrame
        ``data`` without the sparse columns. The input is returned unchanged
        when it has no rows or no columns.
    """
    total_rows = data.count()
    # Nothing to measure on an empty frame; this also avoids dividing by zero.
    if total_rows == 0 or not data.columns:
        return data

    # One aggregation pass: count() skips the nulls produced by when() for
    # non-null cells, so each aliased value is that column's null count.
    null_counts = data.select(
        [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in data.columns]
    ).first()

    sparse_cols = [
        c for c in data.columns if null_counts[c] / total_rows > threshold
    ]
    return data.drop(*sparse_cols)

In [9]:
# Fully-qualified paths of every top-level field inside the exploded report struct.
keys = [
    "exploded_results." + str(field)
    for field in exploded_results.select(F.col("exploded_results.*")).columns
]
all_keys += keys

In [10]:
# Fully-qualified paths of the fields nested under `patient`.
patient_keys = [
    "exploded_results.patient." + str(field)
    for field in exploded_results.select(F.col("exploded_results.patient.*")).columns
]
all_keys += patient_keys

# Select report-level and patient-level fields side by side, then discard the
# original `patient` struct, whose fields are now individual columns.
updated_data = (
    exploded_results
    .select(all_keys)
    .drop(F.col("patient"))
)

# From here on the key list holds the flattened column names.
all_keys = updated_data.columns

In [11]:
# updated_data.count()
#12000

In [12]:
# One row per entry of the `drug` array; the entry lands in a helper struct column.
updated_data = (
    updated_data
    .select(all_keys)
    .withColumn("explode_drug", F.explode(F.col("drug")))
)

# Queue the struct's fields for selection as top-level columns.
drug_keys = [
    "explode_drug." + field
    for field in updated_data.select(F.col("explode_drug.*")).columns
]
all_keys += drug_keys

In [13]:
# Promote the fields of the exploded drug struct to top-level columns.
updated_data = updated_data.select(all_keys)

In [14]:
# Columns that are not used further down in this notebook.
updated_data = updated_data.drop(
    'authoritynumb',
    'duplicate',
    'reportduplicate',
    'patientagegroup',
    'patientweight',
    'summary',
)

In [15]:
# Keep rows with a usable indication. Comparing a null with != yields null,
# which a filter treats as false, so missing values were already removed by the
# comparison alone; the isNotNull() check just states that explicitly.
has_indication = F.col("drugindication").isNotNull() & (F.col("drugindication") != 'NULL')
updated_data = updated_data.filter(has_indication)

In [16]:
# Discard the placeholder indication that carries no information.
updated_data = updated_data.filter(
    F.col("drugindication") != "Product used for unknown indication"
)

In [17]:
# Preview the rows that survived the drug-indication filters.
updated_data.show()

+--------------------+-----------------------+------------+--------------------+--------------------+-----------+-----------------+-----------+-----------------+--------+----------+--------------+-------------------+--------------------+-------+----------------------------+----------------+--------------------+--------------------------+--------------------------+----------------+----------------+----------------------+--------------------+---------------+-------------------+----------+--------------------+----------+--------------------+--------------+-----------------------+---------------------+-------------------+--------------------+------------------------+------------------------+--------------------+--------------------+-----------+-----------------+--------------------+----------------------------+--------------------------+-------------------------+--------------+----------------------+-------------+-------------------+-----------------------+-----------------------+---------

In [20]:
# Refresh the key list so it matches the current column names of updated_data.
all_keys = updated_data.columns

In [22]:
# One row per entry of the `reaction` array; the entry lands in a helper struct column.
all_keys = updated_data.columns
updated_data = (
    updated_data
    .select(all_keys)
    .withColumn("explode_reaction", F.explode(F.col("reaction")))
)

# Re-read the column list (it now includes the helper column) and queue the
# struct's fields for selection as top-level columns.
all_keys = updated_data.columns
reaction_keys = [
    "explode_reaction." + field
    for field in updated_data.select(F.col("explode_reaction.*")).columns
]
all_keys += reaction_keys

In [23]:
# Promote the fields of the exploded reaction struct to top-level columns.
updated_data = updated_data.select(all_keys)

In [24]:
# Keep rows with a usable reaction term. Comparing a null with != yields null,
# which a filter treats as false, so missing values were already removed by the
# comparison alone; the isNotNull() check just states that explicitly.
has_reaction = F.col("reactionmeddrapt").isNotNull() & (F.col("reactionmeddrapt") != 'NULL')
updated_data = updated_data.filter(has_reaction)

In [25]:
# Narrow the frame to the seriousness flags, patient age, reaction and drug
# descriptors; nested fields are pulled out of their structs by path.
updated_data = updated_data.select(
    [
        "seriousnessdeath",
        "seriousnesslifethreatening",
        "seriousnesshospitalization",
        "seriousnessdisabling",
        "seriousnesscongenitalanomali",
        "seriousnessother",
        "patientonsetage",
        "reactionmeddrapt",
        "reactionoutcome",
        "drugindication",
        "activesubstance.activesubstancename",
        "medicinalproduct",
        "openfda.route",
        "openfda.brand_name",
        "openfda.generic_name",
    ]
)

In [26]:
# Preview the selected columns.
updated_data.show()

+----------------+--------------------------+--------------------------+--------------------+----------------------------+----------------+---------------+--------------------+---------------+--------------------+--------------------+----------------+-------------+-------------------+--------------+
|seriousnessdeath|seriousnesslifethreatening|seriousnesshospitalization|seriousnessdisabling|seriousnesscongenitalanomali|seriousnessother|patientonsetage|    reactionmeddrapt|reactionoutcome|      drugindication| activesubstancename|medicinalproduct|        route|         brand_name|  generic_name|
+----------------+--------------------------+--------------------------+--------------------+----------------------------+----------------+---------------+--------------------+---------------+--------------------+--------------------+----------------+-------------+-------------------+--------------+
|               2|                         2|                         2|                   2|    

In [30]:
# Remove every row that has a null in any of the remaining columns.
updated_data = updated_data.dropna()

In [51]:
# Active substances that occur in more than 200 rows.
frequent_substances = (
    updated_data
    .groupby(F.col("activesubstancename"))
    .count()
    .filter(F.col("count") > 200)
    .select("activesubstancename")
)

# The inner join keeps only rows whose substance is in that frequent set.
updated_data = updated_data.join(
    frequent_substances,
    on="activesubstancename",
    how="inner",
)

In [31]:
# updated_data = updated_data.select(all_keys)
# all_keys = updated_data.columns
# openfda_keys = updated_data.select(F.col("openfda.*")).columns
# openfda_keys = ["openfda."+i for i in openfda_keys]
# all_keys.extend(openfda_keys)
# all_keys.append("activesubstance.activesubstancename")

In [33]:
# updated_data = updated_data.withColumn("openfda_{}".format("application_number"),F.explode(F.col('openfda.application_number')))\
#             .withColumn("openfda_{}".format("brand_name"),F.explode(F.col('openfda.brand_name')))\
#             .withColumn("openfda_{}".format("generic_name"),F.explode(F.col('openfda.generic_name')))\
#             .withColumn("openfda_{}".format("manufacturer_name"),F.explode(F.col('openfda.manufacturer_name')))\
#             .withColumn("openfda_{}".format("product_type"),F.explode(F.col('openfda.product_type')))\
#             .withColumn("openfda_{}".format("substance_name"),F.explode(F.col('openfda.substance_name')))
#             # .withColumn("openfda_{}".format("route"),F.explode(F.col('openfda.route')))\
#             # .withColumn("openfda_{}".format("rxcui"),F.explode(F.col('openfda.rxcui')))\
#             # .withColumn("openfda_{}".format("spl_id"),F.explode(F.col('openfda.spl_id')))\
#             # .withColumn("openfda_{}".format("spl_set_id"),F.explode(F.col('openfda.spl_set_id')))\
#             # .withColumn("openfda_{}".format("nui"),F.explode(F.col('openfda.nui')))\
#             # .withColumn("openfda_{}".format("package_ndc"),F.explode(F.col('openfda.package_ndc')))\
#             # .withColumn("openfda_{}".format("pharm_class_cs"),F.explode(F.col('openfda.pharm_class_cs')))\
#             # .withColumn("openfda_{}".format("pharm_class_epc"),F.explode(F.col('openfda.pharm_class_epc')))\
#             # .withColumn("openfda_{}".format("pharm_class_moa"),F.explode(F.col('openfda.pharm_class_moa')))\
#             # .withColumn("openfda_{}".format("pharm_class_pe"),F.explode(F.col('openfda.pharm_class_pe')))\
#             # .withColumn("openfda_{}".format("product_ndc"),F.explode(F.col('openfda.product_ndc')))\
#             # .withColumn("openfda_{}".format("unii"),F.explode(F.col('openfda.unii'))).show()

In [34]:
# updated_data.count()

In [35]:
# all_keys = updated_data.columns
# primarysource_keys = updated_data.select(F.col("primarysource.*")).columns
# primarysource_keys = ["primarysource."+i for i in primarysource_keys]
# all_keys.extend(primarysource_keys)

In [36]:
# updated_data.show()

In [37]:
# try:
#     updated_data = updated_data.select(all_keys)\
#                 .withColumn("explode_drugrecurrence",F.explode(F.col("drugrecurrence")))
#     all_keys = updated_data.columns
#     drugrecurrence_keys = updated_data.select(F.col("explode_drugrecurrence.*")).columns
#     drugrecurrence_keys = ["explode_drugrecurrence."+i for i in drugrecurrence_keys]
#     all_keys.extend(drugrecurrence_keys)
# except Exception as e:
#     print(e)

In [38]:
# updated_data = updated_data.drop(F.col("primarysource"))
# updated_data = updated_data.drop(F.col("explode_drugrecurrence"))
# updated_data = updated_data.drop(F.col("drug"))
# updated_data = updated_data.drop(F.col("reaction"))
# updated_data = updated_data.drop(F.col("receiver"))
# updated_data = updated_data.drop(F.col("sender"))
# updated_data = updated_data.drop(F.col("summary"))
# updated_data = updated_data.drop(F.col("activesubstance"))
# updated_data = updated_data.drop(F.col("drugrecurrence"))
# updated_data = updated_data.drop(F.col("openfda"))
# updated_data = updated_data.drop(F.col("explode_reaction"))

## Data Cleaning

In [52]:
# Collect the Spark result into a pandas DataFrame for the pandas-side steps below.
pandas_df = updated_data.toPandas()

In [53]:
# Display the frame with rows containing missing values removed.
# NOTE: the result is not assigned, so pandas_df itself is left unchanged.
pandas_df.dropna()

Unnamed: 0,activesubstancename,seriousnessdeath,seriousnesslifethreatening,seriousnesshospitalization,seriousnessdisabling,seriousnesscongenitalanomali,seriousnessother,patientonsetage,reactionmeddrapt,reactionoutcome,drugindication,medicinalproduct,route,brand_name,generic_name
0,BORTEZOMIB,2,2,2,2,2,1,55,Plasma cell myeloma,6,Plasma cell myeloma,VELCADE,"[INTRAVENOUS, SUBCUTANEOUS]",[VELCADE],[BORTEZOMIB]
1,BORTEZOMIB,2,2,2,2,2,1,55,Intentional product use issue,6,Plasma cell myeloma,VELCADE,"[INTRAVENOUS, SUBCUTANEOUS]",[VELCADE],[BORTEZOMIB]
2,BORTEZOMIB,2,2,1,2,2,1,77,Cholecystitis acute,2,Plasma cell myeloma,BORTEZOMIB,"[INTRAVENOUS, SUBCUTANEOUS]","[BORTEZOMIB, BORUZU, VELCADE]","[BORTEZOMIB, BORTEZOMIB FOR INJECTION, BORTEXO..."
3,BORTEZOMIB,2,2,1,2,2,1,77,Fall,2,Plasma cell myeloma,BORTEZOMIB,"[INTRAVENOUS, SUBCUTANEOUS]","[BORTEZOMIB, BORUZU, VELCADE]","[BORTEZOMIB, BORTEZOMIB FOR INJECTION, BORTEXO..."
4,BORTEZOMIB,2,1,2,2,2,1,53,Myocarditis,1,Plasma cell myeloma,BORTEZOMIB,"[INTRAVENOUS, SUBCUTANEOUS]","[BORTEZOMIB, BORUZU, VELCADE]","[BORTEZOMIB, BORTEZOMIB FOR INJECTION, BORTEXO..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256683,AMOXICILLIN,2,2,1,2,2,1,2,Pyrexia,1,Pneumonia,AMOXICILLIN,[ORAL],[AMOXICILLIN],[AMOXICILLIN]
256684,AMOXICILLIN,2,2,1,2,2,1,2,Oedema peripheral,1,Pneumonia,AMOXICILLIN,[ORAL],[AMOXICILLIN],[AMOXICILLIN]
256685,AMOXICILLIN,2,2,1,2,2,1,2,Cardiac murmur,1,Pneumonia,AMOXICILLIN,[ORAL],[AMOXICILLIN],[AMOXICILLIN]
256686,AMOXICILLIN,2,2,1,2,2,1,2,Condition aggravated,1,Pneumonia,AMOXICILLIN,[ORAL],[AMOXICILLIN],[AMOXICILLIN]


In [57]:
# Number of rows per distinct generic_name value.
pandas_df["generic_name"].value_counts()

generic_name
[PREDNISONE]                           17451
[INFLIXIMAB]                            8519
[LEFLUNOMIDE]                           8485
[DESOXIMETASONE]                        8045
[METHOTREXATE, METHOTREXATE SODIUM]     7244
                                       ...  
[DIAZEPAM]                                 4
[TOPIRAMATE]                               4
[HEPARIN SODIUM AND DEXTROSE]              3
[IBUPROFEN]                                2
[FUROSEMIDE INJECTION 80 MG/ 10 ML]        1
Name: count, Length: 251, dtype: int64

In [None]:
# Number of rows in the filtered Spark DataFrame.
total_rows = updated_data.count()

### Removing Null Values

In [None]:
# Drop any rows with nulls, collect to pandas and write the CSV export.
(
    updated_data
    .dropna()
    .toPandas()
    .to_csv("test1.csv")
)

In [None]:
# os.environ["HADOOP_HOME"] = "C:/hadoop/hadoop-2.8.3"
# os.environ["PATH"] = "C:/hadoop/hadoop-2.8.3/bin"
# updated_data.write.csv("./test1.csv", mode="overwrite", header=True)

In [None]:
# updated_data.write.parquet("output_parquet_path", mode="overwrite")