## Data Transformation from JSON to DataFrame

### All Imports

In [1]:
import os
import sys

import findspark

# Make the Spark driver and workers use the same interpreter as this kernel.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
# findspark.init() puts pyspark on sys.path, so it has to run before the
# pyspark imports below rather than after them.
findspark.init()

import numpy as np
import pandas as pd
import seaborn as sns
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, explode, col, arrays_zip
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

### Spark Session

In [2]:
# Local session named "data_cleaning" with 8g configured for driver and executor.
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .appName("data_cleaning")
    .getOrCreate()
)

### Read File

In [3]:
# The multiline option lets one JSON document span several lines of the file.
json_data = (
    spark.read
    .option("multiline", "true")
    .json("./Data/JSON/drug-event-0008-of-0034.json")
)

In [4]:
# Print the schema tree Spark inferred for the loaded JSON.
json_data.printSchema()

root
 |-- meta: struct (nullable = true)
 |    |-- disclaimer: string (nullable = true)
 |    |-- last_updated: string (nullable = true)
 |    |-- license: string (nullable = true)
 |    |-- results: struct (nullable = true)
 |    |    |-- limit: long (nullable = true)
 |    |    |-- skip: long (nullable = true)
 |    |    |-- total: long (nullable = true)
 |    |-- terms: string (nullable = true)
 |-- results: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- authoritynumb: string (nullable = true)
 |    |    |-- companynumb: string (nullable = true)
 |    |    |-- duplicate: string (nullable = true)
 |    |    |-- fulfillexpeditecriteria: string (nullable = true)
 |    |    |-- occurcountry: string (nullable = true)
 |    |    |-- patient: struct (nullable = true)
 |    |    |    |-- drug: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- actiondrug: string (nullable = true)
 |    

In [5]:
# One output row per element of the top-level "results" array.
exploded_results = json_data.select(
    F.explode(F.col("results")).alias("exploded_results")
)

In [6]:
# exploded_results.count()
#12000

In [7]:
# Running list of column paths; later cells extend it and pass it to select().
all_keys = []

### Converting Nested JSON Data into Columns

In [8]:
def remove_null_column(data, threshold=0.3, total_rows=None):
    """Drop every column whose fraction of null values exceeds ``threshold``.

    Args:
        data: Spark DataFrame to prune.
        threshold: Largest tolerated null fraction per column; columns
            strictly above it are dropped. Defaults to 0.3 (30%).
        total_rows: Row count used as the denominator of the null fraction.
            When ``None`` it is computed with ``data.count()`` so the ratio
            reflects the actual size of ``data``.

    Returns:
        The DataFrame without the columns that exceeded the threshold. It is
        returned unchanged when it has no columns or no rows.
    """
    if not data.columns:
        return data
    if total_rows is None:
        total_rows = data.count()
    if not total_rows:
        # No rows means no meaningful null fraction (and avoids dividing by zero).
        return data

    # A single aggregate row: the number of nulls in each column, in column order.
    null_counts = data.select(
        [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in data.columns]
    ).first()
    sparse_cols = [
        name
        for name, nulls in zip(data.columns, null_counts)
        if nulls / total_rows > threshold
    ]
    if sparse_cols:
        data = data.drop(*sparse_cols)
    return data

In [9]:
# Qualify every top-level field of the exploded struct with its parent name
# and add the resulting paths to all_keys.
keys = [
    f"exploded_results.{field}"
    for field in exploded_results.select(F.col("exploded_results.*")).columns
]
all_keys += keys

In [10]:
# Add the fields nested under "patient", project onto every collected path,
# then drop the now-redundant "patient" struct column.
patient_keys = [
    f"exploded_results.patient.{field}"
    for field in exploded_results.select(F.col("exploded_results.patient.*")).columns
]
all_keys += patient_keys
updated_data = exploded_results.select(all_keys).drop(F.col("patient"))
all_keys = updated_data.columns

In [11]:
# updated_data.count()
#12000

In [12]:
# Explode the "drug" array into one row per drug entry, then record the
# qualified paths of the exploded struct's fields.
updated_data = updated_data.select(all_keys).withColumn(
    "explode_drug", F.explode(F.col("drug"))
)
drug_keys = [
    f"explode_drug.{field}"
    for field in updated_data.select(F.col("explode_drug.*")).columns
]
all_keys += drug_keys

In [13]:
# Project the frame onto the column paths currently held in all_keys.
updated_data = updated_data.select(all_keys)

In [14]:
# Remove columns that are not carried further in this notebook.
updated_data = updated_data.drop(
    'authoritynumb',
    'duplicate',
    'reportduplicate',
    'patientagegroup',
    'patientweight',
    'summary',
)

In [None]:
# Preview the drugindication column.
updated_data.select("drugindication").show()

In [15]:
# Drop every row that has a null in any column.
# NOTE(review): the saved output a few lines below shows drugindication with
# no rows, which may mean this removes all rows — confirm, and consider
# na.drop(subset=[...]) restricted to the columns that must be present.
updated_data = updated_data.na.drop()

In [16]:
# updated_data.select("drugindication").show()

+--------------+
|drugindication|
+--------------+
+--------------+



In [24]:
# Count drug indications whose value is not the literal string 'NULL'.
(
    updated_data
    .select("explode_drug.drugindication")
    .filter(F.col("explode_drug.drugindication") != 'NULL')
    .count()
)

31784

In [20]:
# Reset all_keys to the frame's current column names.
all_keys = updated_data.columns

In [None]:
# Display the DataFrame's repr as the cell output.
updated_data

In [15]:
# updated_data.count()
#54990

In [16]:
# Explode the "reaction" array into one row per reaction, then rebuild
# all_keys as the current columns plus the exploded struct's field paths.
all_keys = updated_data.columns
updated_data = updated_data.select(all_keys).withColumn(
    "explode_reaction", F.explode(F.col("reaction"))
)
all_keys = updated_data.columns
reaction_keys = [
    f"explode_reaction.{field}"
    for field in updated_data.select(F.col("explode_reaction.*")).columns
]
all_keys += reaction_keys

In [17]:
# Drop every row that has a null in any column.
# NOTE(review): with no subset this looks at all columns — confirm that is intended.
updated_data = updated_data.na.drop()

In [18]:
# updated_data = updated_data.dropDuplicates()

In [19]:
# Cache the DataFrame so later actions can reuse it.
updated_data.cache()

DataFrame[companynumb: string, fulfillexpeditecriteria: string, occurcountry: string, primarysource: struct<literaturereference:string,qualification:string,reportercountry:string>, primarysourcecountry: string, receiptdate: string, receiptdateformat: string, receivedate: string, receivedateformat: string, receiver: struct<receiverorganization:string,receivertype:string>, reporttype: string, safetyreportid: string, safetyreportversion: string, sender: struct<senderorganization:string,sendertype:string>, serious: string, seriousnesscongenitalanomali: string, seriousnessdeath: string, seriousnessdisabling: string, seriousnesshospitalization: string, seriousnesslifethreatening: string, seriousnessother: string, transmissiondate: string, transmissiondateformat: string, drug: array<struct<actiondrug:string,activesubstance:struct<activesubstancename:string>,drugadditional:string,drugadministrationroute:string,drugauthorizationnumb:string,drugbatchnumb:string,drugcharacterization:string,drugcu

In [20]:
# Show the first column listed in all_keys.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt.
updated_data.select(all_keys[0]).show()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "C:\Users\Akshay\anaconda3\envs\tf\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Akshay\anaconda3\envs\tf\lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "C:\Users\Akshay\anaconda3\envs\tf\lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "C:\Users\Akshay\anaconda3\envs\tf\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [15]:
# Flatten onto the collected paths, then queue the openfda fields and the
# active substance name for the next projection.
updated_data = updated_data.select(all_keys)
all_keys = updated_data.columns
openfda_keys = [
    f"openfda.{field}"
    for field in updated_data.select(F.col("openfda.*")).columns
]
all_keys += openfda_keys + ["activesubstance.activesubstancename"]

In [17]:
# Explode the selected openfda array fields one after another, each into its
# own "openfda_<field>" column.
# NOTE(review): every explode emits one row per array element, so chaining
# them multiplies the row count — the count() recorded after this cell was
# interrupted; confirm that this cross product is really wanted.
# The remaining openfda fields (route, rxcui, spl_id, spl_set_id, nui,
# package_ndc, pharm_class_cs, pharm_class_epc, pharm_class_moa,
# pharm_class_pe, product_ndc, unii) are left unexploded.
for openfda_field in (
    "application_number",
    "brand_name",
    "generic_name",
    "manufacturer_name",
    "product_type",
    "substance_name",
):
    updated_data = updated_data.withColumn(
        f"openfda_{openfda_field}",
        F.explode(F.col(f"openfda.{openfda_field}")),
    )

In [18]:
# Count the rows of the current frame.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt.
updated_data.count()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "C:\Users\Akshay\anaconda3\envs\tf\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Akshay\anaconda3\envs\tf\lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "C:\Users\Akshay\anaconda3\envs\tf\lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "C:\Users\Akshay\anaconda3\envs\tf\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Rebuild all_keys from the current columns and add the primarysource field paths.
all_keys = updated_data.columns
primarysource_keys = [
    f"primarysource.{field}"
    for field in updated_data.select(F.col("primarysource.*")).columns
]
all_keys += primarysource_keys

In [None]:
# Print the first rows of the frame.
updated_data.show()

In [None]:
# Best-effort explode of "drugrecurrence": any failure is printed and the
# notebook carries on with the frame as it was.
try:
    updated_data = updated_data.select(all_keys).withColumn(
        "explode_drugrecurrence", F.explode(F.col("drugrecurrence"))
    )
    all_keys = updated_data.columns
    drugrecurrence_keys = [
        f"explode_drugrecurrence.{field}"
        for field in updated_data.select(F.col("explode_drugrecurrence.*")).columns
    ]
    all_keys += drugrecurrence_keys
except Exception as e:
    print(e)

In [None]:
# Drop the nested source columns and the helper explode columns, one at a
# time and in the same order as before.
for nested_column in (
    "primarysource",
    "explode_drugrecurrence",
    "drug",
    "reaction",
    "receiver",
    "sender",
    "summary",
    "activesubstance",
    "drugrecurrence",
    "openfda",
    "explode_reaction",
):
    updated_data = updated_data.drop(F.col(nested_column))

## Data Cleaning

In [None]:
# Pull at most 20 rows into a pandas DataFrame for inspection.
pandas_df = updated_data.limit(20).toPandas()

In [None]:
# Display the pandas sample as the cell output.
pandas_df

In [None]:
# Row count of the flattened frame.
total_rows = updated_data.count()

### Removing Null Values

In [None]:
# Drop rows containing any null, convert the rest to pandas, and write it to test1.csv.
updated_data.na.drop().toPandas().to_csv("test1.csv")

In [None]:
# os.environ["HADOOP_HOME"] = "C:/hadoop/hadoop-2.8.3"
# os.environ["PATH"] = "C:/hadoop/hadoop-2.8.3/bin"
# updated_data.write.csv("./test1.csv", mode="overwrite", header=True)

In [None]:
# updated_data.write.parquet("output_parquet_path", mode="overwrite")