In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, explode, col, arrays_zip
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
import pyspark.sql.functions as F
import os
import sys
# Make PySpark launch its driver and worker Python processes with the same
# interpreter that is running this kernel.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [2]:
# Create (or reuse) the Spark session used by the rest of the notebook.
# NOTE(review): with master "local" the executor presumably runs inside the
# driver JVM, so spark.executor.memory may have no effect here — confirm.
spark = (
    SparkSession.builder
    .master("local")
    .appName("data_cleaning")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

In [23]:
# Load the first of the five drug-event export files. The "multiline" option
# lets Spark parse a JSON document that spans several lines instead of
# expecting one JSON record per line.
json_data = (
    spark.read
    .option("multiline", "true")
    .json("./Data/JSON/drug-event-0001-of-0005.json")
)

In [24]:
# Print the schema Spark derived for the file as a tree, to see how the nested
# "meta" and "results" structures were parsed.
json_data.printSchema()

root
 |-- meta: struct (nullable = true)
 |    |-- disclaimer: string (nullable = true)
 |    |-- last_updated: string (nullable = true)
 |    |-- license: string (nullable = true)
 |    |-- results: struct (nullable = true)
 |    |    |-- limit: long (nullable = true)
 |    |    |-- skip: long (nullable = true)
 |    |    |-- total: long (nullable = true)
 |    |-- terms: string (nullable = true)
 |-- results: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- companynumb: string (nullable = true)
 |    |    |-- duplicate: string (nullable = true)
 |    |    |-- fulfillexpeditecriteria: string (nullable = true)
 |    |    |-- occurcountry: string (nullable = true)
 |    |    |-- patient: struct (nullable = true)
 |    |    |    |-- drug: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- actiondrug: string (nullable = true)
 |    |    |    |    |    |-- activesubstance: struct (nullab

In [25]:
# Preview the raw frame (at most 20 rows, long cell values truncated).
json_data.show(n=20, truncate=True)

+--------------------+--------------------+
|                meta|             results|
+--------------------+--------------------+
|{Do not rely on o...|[{US-PFIZER INC-2...|
+--------------------+--------------------+



In [46]:
# Turn every element of the "results" array into its own row and fetch a
# single one to inspect the nested structure of a report.
(
    json_data
    .select(F.explode("results").alias("exploded_array"))
    .take(1)
)

[Row(exploded_array=Row(companynumb='US-PFIZER INC-2007080022', duplicate=None, fulfillexpeditecriteria='2', occurcountry=None, patient=Row(drug=[Row(actiondrug=None, activesubstance=None, drugadditional=None, drugadministrationroute=None, drugauthorizationnumb='020998', drugbatchnumb=None, drugcharacterization='1', drugdosageform=None, drugdosagetext=None, drugenddate=None, drugenddateformat=None, drugindication='ARTHRITIS', drugrecurreadministration=None, drugstartdate='20020101', drugstartdateformat='102', drugstructuredosagenumb=None, drugstructuredosageunit=None, drugtreatmentduration='30', drugtreatmentdurationunit='804', medicinalproduct='CELEBREX', openfda=Row(application_number=['NDA020998'], brand_name=['CELEBREX'], generic_name=['CELECOXIB'], manufacturer_name=['PFIZER LABORATORIES DIV PFIZER INC', 'Viatris Specialty LLC'], nui=['N0000000160', 'M0001335', 'N0000175722'], package_ndc=['0025-1515-01', '0025-1520-31', '0025-1520-51', '0025-1520-34', '0025-1525-31', '0025-1525-5

In [61]:
# One row per element of the "results" array, kept as a single struct column
# named "exploded_results".
exploded_results = json_data.select(
    F.explode("results").alias("exploded_results")
)

In [62]:
# Preview the first five exploded rows.
exploded_results.show(n=5)

+--------------------+
|    exploded_results|
+--------------------+
|{US-PFIZER INC-20...|
|{US-ABBOTT-04P-16...|
|{USA040977859, NU...|
|{2004235828US, NU...|
|{2004-BP-07617BP(...|
+--------------------+
only showing top 5 rows



In [84]:
# Bring one exploded row back to the driver as a list of Row objects so the
# full nested content is visible (show() truncates it).
exploded_results.limit(1).collect()

[Row(exploded_results=Row(companynumb='US-PFIZER INC-2007080022', duplicate=None, fulfillexpeditecriteria='2', occurcountry=None, patient=Row(drug=[Row(actiondrug=None, activesubstance=None, drugadditional=None, drugadministrationroute=None, drugauthorizationnumb='020998', drugbatchnumb=None, drugcharacterization='1', drugdosageform=None, drugdosagetext=None, drugenddate=None, drugenddateformat=None, drugindication='ARTHRITIS', drugrecurreadministration=None, drugstartdate='20020101', drugstartdateformat='102', drugstructuredosagenumb=None, drugstructuredosageunit=None, drugtreatmentduration='30', drugtreatmentdurationunit='804', medicinalproduct='CELEBREX', openfda=Row(application_number=['NDA020998'], brand_name=['CELEBREX'], generic_name=['CELECOXIB'], manufacturer_name=['PFIZER LABORATORIES DIV PFIZER INC', 'Viatris Specialty LLC'], nui=['N0000000160', 'M0001335', 'N0000175722'], package_ndc=['0025-1515-01', '0025-1520-31', '0025-1520-51', '0025-1520-34', '0025-1525-31', '0025-1525

In [63]:
# Build the fully qualified column paths ("exploded_results.<field>") for every
# top-level field of the exploded struct, so they can be passed to select().
top_level_fields = exploded_results.select("exploded_results.*").columns
keys = [f"exploded_results.{field}" for field in top_level_fields]

In [72]:
# Display the qualified paths of the top-level fields.
keys

['exploded_results.companynumb',
 'exploded_results.duplicate',
 'exploded_results.fulfillexpeditecriteria',
 'exploded_results.occurcountry',
 'exploded_results.patient',
 'exploded_results.primarysource',
 'exploded_results.primarysourcecountry',
 'exploded_results.receiptdate',
 'exploded_results.receiptdateformat',
 'exploded_results.receivedate',
 'exploded_results.receivedateformat',
 'exploded_results.receiver',
 'exploded_results.reportduplicate',
 'exploded_results.reporttype',
 'exploded_results.safetyreportid',
 'exploded_results.safetyreportversion',
 'exploded_results.sender',
 'exploded_results.serious',
 'exploded_results.seriousnesscongenitalanomali',
 'exploded_results.seriousnessdeath',
 'exploded_results.seriousnessdisabling',
 'exploded_results.seriousnesshospitalization',
 'exploded_results.seriousnesslifethreatening',
 'exploded_results.seriousnessother',
 'exploded_results.transmissiondate',
 'exploded_results.transmissiondateformat']

In [92]:
# Same idea one level down: qualified paths for every field nested under the
# "patient" struct of each exploded row.
patient_fields = exploded_results.select("exploded_results.patient.*").columns
patient_keys = [f"exploded_results.patient.{field}" for field in patient_fields]

In [96]:
# Display the qualified paths of the fields under "patient".
patient_keys

['exploded_results.patient.drug',
 'exploded_results.patient.patientagegroup',
 'exploded_results.patient.patientdeath',
 'exploded_results.patient.patientonsetage',
 'exploded_results.patient.patientonsetageunit',
 'exploded_results.patient.patientsex',
 'exploded_results.patient.patientweight',
 'exploded_results.patient.reaction',
 'exploded_results.patient.summary']

In [86]:
# Preview only the first eight top-level fields to keep the table readable.
exploded_results.select(*keys[:8]).show()

+--------------------+---------+-----------------------+------------+--------------------+------------------+--------------------+-----------+
|         companynumb|duplicate|fulfillexpeditecriteria|occurcountry|             patient|     primarysource|primarysourcecountry|receiptdate|
+--------------------+---------+-----------------------+------------+--------------------+------------------+--------------------+-----------+
|US-PFIZER INC-200...|     NULL|                      2|        NULL|{[{NULL, NULL, NU...|{5, UNITED STATES}|                NULL|   20040831|
|US-ABBOTT-04P-163...|     NULL|                      2|        NULL|{[{NULL, NULL, NU...|{1, UNITED STATES}|                NULL|   20040916|
|        USA040977859|     NULL|                      2|        NULL|{[{NULL, NULL, NU...|              NULL|                NULL|   20040909|
|        2004235828US|     NULL|                      1|        NULL|{[{NULL, NULL, NU...|         {5, NULL}|                NULL|   20040927|

In [95]:
# Preview every field nested under "patient", one column per field.
exploded_results.select(*patient_keys).show()

+--------------------+---------------+------------+---------------+-------------------+----------+-------------+--------------------+-------+
|                drug|patientagegroup|patientdeath|patientonsetage|patientonsetageunit|patientsex|patientweight|            reaction|summary|
+--------------------+---------------+------------+---------------+-------------------+----------+-------------+--------------------+-------+
|[{NULL, NULL, NUL...|           NULL|        NULL|           NULL|               NULL|         2|         NULL|[{DRUG INEFFECTIV...|   NULL|
|[{NULL, NULL, NUL...|           NULL|        NULL|             61|                801|         1|         NULL|[{DIZZINESS, NULL...|   NULL|
|[{NULL, NULL, NUL...|           NULL|        NULL|             75|                801|         2|         NULL|[{FEELING ABNORMA...|   NULL|
|[{NULL, NULL, NUL...|           NULL|        NULL|           NULL|               NULL|         2|         NULL|[{ARTHRALGIA, NUL...|   NULL|
|[{NUL