In [1]:
import pandas as pd
import time 
# Wall-clock timestamp taken at the top of the notebook; the final cell
# subtracts it from the current time to report the total run time.
start_time = time.time()

In [2]:
# drugs = pd.read_csv('arcos-ca-statewide-itemized.tsv',nrows = 1000000, sep = '\t')
# drugs = pd.read_csv('arcos-ca-statewide-itemized.tsv', sep = '\t')

# Connecting to S3

In [3]:
from pyspark import SparkContext, SparkConf
import os
# Request the hadoop-aws 2.7.4 package via --packages so Spark can read from S3.
# NOTE(review): presumably this must be set before the SparkContext is first
# created for it to take effect — confirm against the PySpark docs.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:2.7.4" pyspark-shell'
!echo $JAVA_HOME
# Reuse the already-running SparkContext if there is one, otherwise start one.
sc = SparkContext.getOrCreate()

# YOUR AMAZON LOGIN INFORMATION HERE




## Exploring the Dataset

In [4]:
# drugs.to_csv('./arcos-ca-statewide-sample.tsv', sep = '\t', index = False)

In [5]:
# drugs = pd.read_csv('./arcos-ca-statewide-sample.tsv', sep = '\t')
# drugs.head()

In [6]:
# Agg Sum - CALC_BASE_WT_IN_GM, QUANTITY
# Ingredient_Name has two options: HYDROCODONE BITARTRATE HEMIPENTAHYDRATE or OXYCODONE HYDROCHLORIDE

## Working on sample of Drug Dataset to make a DataFrame

In [7]:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
#from pyspark.sql.functions import concat, col, lit, substring

# A SparkSession is needed so that RDDs can later be converted with .toDF().
ss = SparkSession.builder.getOrCreate()

# Read the full itemized ARCOS file from S3 (at least 24 partitions) and split
# every line into its tab-separated fields.
# For a quick local run, point this at the sample file instead:
# drug_rdd = sc.textFile('./arcos-ca-statewide-sample.tsv').map(lambda x: x.split('\t'))
drug_rdd = sc.textFile(
    's3://data-systems-opioid/arcos-ca-statewide-itemized.tsv',
    minPartitions=24,
).map(lambda line: line.split('\t'))


def FloatSafe(value):
    """Convert ``value`` to ``float``, returning ``None`` when it cannot be converted.

    Args:
        value: Typically a raw string field from a split text line.

    Returns:
        The parsed float, or ``None`` if ``value`` is not a valid float
        representation (e.g. an empty string) or is of an unsupported type
        such as ``None``.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: unparsable string; TypeError: None or another
        # non-string, non-numeric object. Both are treated as missing.
        return None

def IntegerSafe(value):
    """Convert ``value`` to ``int``, returning ``None`` when it cannot be converted.

    Args:
        value: Typically a raw string field from a split text line.

    Returns:
        The parsed integer, or ``None`` if ``value`` is not a valid integer
        representation (e.g. ``""`` or ``"4.2"``) or is of an unsupported
        type such as ``None``.
    """
    try:
        return int(value)
    except (TypeError, ValueError):
        # ValueError: unparsable string; TypeError: None or another
        # non-string, non-numeric object. Both are treated as missing.
        return None

# Shrink every record: keep fields 16-24 and 29 onward. This drops the 16
# leading identifier columns (indices 0-15) and columns 25-28, both listed
# below. The pruned RDD is persisted because it is reused by later steps.
drug_rdd = drug_rdd.map(lambda fields: fields[16:25] + fields[29:]).persist()

# Columns I removed:
# -------------------------
# 0'REPORTER_DEA_NO',
#  'REPORTER_BUS_ACT',
#  'REPORTER_NAME',
#  'REPORTER_ADDL_CO_INFO',
#  'REPORTER_ADDRESS1',
#  'REPORTER_ADDRESS2',
#  'REPORTER_CITY',
#  'REPORTER_STATE',
#  'REPORTER_ZIP',
#  'REPORTER_COUNTY'
#  'BUYER_DEA_NO',
#  'BUYER_BUS_ACT',
#  'BUYER_NAME',
#  'BUYER_ADDL_CO_INFO',
#  'BUYER_ADDRESS1',
#15'BUYER_ADDRESS2'

# 25 UNIT (0.001% of rows have values)
# 26 Action Indicator
# 27 ORDER_FORM_NO
# 28 CORRECTION_NO

# The first record of the file is the header; keep it as the column names.
col_names = drug_rdd.first()

# Drop every record that is identical to the header row.
drug_rdd = drug_rdd.filter(lambda row: row != col_names)

# Cast the numeric fields; every other field stays a string.
drug_rdd = drug_rdd.map(lambda row: [
    row[0],                 # BUYER_CITY
    row[1],                 # BUYER_STATE
    row[2],                 # BUYER_ZIP
    row[3],                 # BUYER_COUNTY
    row[4],                 # TRANSACTION_CODE
    IntegerSafe(row[5]),    # DRUG_CODE
    row[6],                 # NDC_NO
    row[7],                 # DRUG_NAME
    FloatSafe(row[8]),      # QUANTITY
    FloatSafe(row[9]),      # STRENGTH
    IntegerSafe(row[10]),   # TRANSACTION_DATE
    FloatSafe(row[11]),     # CALC_BASE_WT_IN_GM
    FloatSafe(row[12]),     # DOSAGE_UNIT
    IntegerSafe(row[13]),   # TRANSACTION_ID
    row[14],                # Product_Name
    row[15],                # Ingredient_Name
    row[16],                # Measure
    FloatSafe(row[17]),     # MME_Conversion_Factor
    row[18],                # Combined_Labeler_Name
    row[19],                # Revised_Company_Name
    row[20],                # Reporter_family
    FloatSafe(row[21]),     # dos_str
])


In [8]:
# Show the header fields that remain after the unused columns were dropped.
print(col_names)

['BUYER_CITY', 'BUYER_STATE', 'BUYER_ZIP', 'BUYER_COUNTY', 'TRANSACTION_CODE', 'DRUG_CODE', 'NDC_NO', 'DRUG_NAME', 'QUANTITY', 'STRENGTH', 'TRANSACTION_DATE', 'CALC_BASE_WT_IN_GM', 'DOSAGE_UNIT', 'TRANSACTION_ID', 'Product_Name', 'Ingredient_Name', 'Measure', 'MME_Conversion_Factor', 'Combined_Labeler_Name', 'Revised_Company_Name', 'Reporter_family', 'dos_str']


In [9]:
# # This cell is just for Type testing
# row_test = drug_rdd.takeSample(1, 1)[0]
# for i in range(len(row_test)):
#     print(i, col_names[i], row_test[i], type(row_test[i]))

In [10]:
# Build the drug DataFrame and add the join key in one chain:
#   Year     - last four characters of TRANSACTION_DATE
#   ZIP-YEAR - "<BUYER_ZIP>-<Year>", used later to join with the suicide data
drug_df = (
    drug_rdd.toDF(col_names)
    .withColumn('Year', substring('TRANSACTION_DATE', -4, 4))
    .withColumn('ZIP-YEAR', concat(col("BUYER_ZIP"), lit("-"), col("Year")))
)

# Quick visual check of the derived key columns.
drug_df.select("BUYER_ZIP", "Year", 'ZIP-YEAR').show(10)

+---------+----+----------+
|BUYER_ZIP|Year|  ZIP-YEAR|
+---------+----+----------+
|    93003|2007|93003-2007|
|    92649|2006|92649-2006|
|    92653|2006|92653-2006|
|    92113|2006|92113-2006|
|    92113|2006|92113-2006|
|    91301|2007|91301-2007|
|    92584|2007|92584-2007|
|    93402|2012|93402-2012|
|    93065|2012|93065-2012|
|    95536|2011|95536-2011|
+---------+----+----------+
only showing top 10 rows



# Suicide DataFrame

In [11]:
# Read the California suicide CSV from S3 (at least 24 partitions) and split
# each line on commas.
# NOTE(review): a plain split(',') assumes no field contains a quoted comma —
# confirm against the file.
death_rdd = sc.textFile('s3://data-systems-opioid/CA_suicides.csv',24).map(lambda x: x.split(','))

# Takes the header row and makes column names. A dedicated name is used so the
# drug table's `col_names` is not overwritten; otherwise re-running the drug
# DataFrame cell after this one would apply the wrong header.
death_col_names = death_rdd.first()

# Removes the header row
death_rdd = death_rdd.filter(lambda x: x != death_col_names)

# Fix RDD: the 4th and 5th fields are cast to int, the rest stay strings
death_rdd = death_rdd.map(lambda x: [x[0], x[1], x[2], IntegerSafe(x[3]), IntegerSafe(x[4])])

# To SQL DataFrame, plus the suicide rate per thousand of Population_2018
death_df = death_rdd.toDF(death_col_names)
death_df = death_df.withColumn('SUI_per_thousand', death_df['Count']/death_df['Population_2018'] * 1000)

# Set up for the ZIP-YEAR merge
death_df = death_df.withColumn('ZIP-YEAR', concat(col("ZIP Code"), lit("-"), col("Year")))

## Joining drug_df and death_df

In [12]:
# Collapse the drug transactions to one row per ZIP-YEAR.
# Change this aggregation based on what flags we add.
# Note: `sum` and `count` here are the pyspark.sql.functions versions brought
# in by the star import, not the Python builtins.
drug_agg_df = drug_df.groupBy('ZIP-YEAR').agg(
    count('BUYER_CITY'),
    sum('STRENGTH'),
    sum('QUANTITY'),
    sum('CALC_BASE_WT_IN_GM'),
    sum('dos_str'),
)
# drug_agg_df.show(5)

In [13]:
# Only the join key and the two outcome columns are needed from the suicide table.
death_join_df = death_df.select('ZIP-YEAR', 'Count', 'SUI_per_thousand')

# Left outer join: every aggregated drug row is kept, matched on ZIP-YEAR.
drug_death_df = drug_agg_df.join(
    death_join_df,
    on='ZIP-YEAR',
    how='left_outer',
)
drug_death_df.show(10)

+----------+-----------------+-------------+-------------+-----------------------+------------------+-----+--------------------+
|  ZIP-YEAR|count(BUYER_CITY)|sum(STRENGTH)|sum(QUANTITY)|sum(CALC_BASE_WT_IN_GM)|      sum(dos_str)|Count|    SUI_per_thousand|
+----------+-----------------+-------------+-------------+-----------------------+------------------+-----+--------------------+
|90026-2007|              957|          0.0|       1627.0|      3555.921771999999|14153.550000000001|    0|                 0.0|
|90031-2009|              357|        800.0|        569.0|      777.7937249999999|            3425.0|    1| 0.02557348541032657|
|90211-2010|             3564|          0.0|      25310.0|     14238.731438450004| 55416.24900000001|    1| 0.12470382840753212|
|90505-2010|             4571|          0.0|     105481.0|     11270.256617024996| 52662.53250000001|    7|  0.1873260543780775|
|90815-2010|             2827|          0.0|       6643.0|         8369.819107575|        29509.8

In [14]:
# Report the wall-clock time elapsed since the first cell recorded start_time.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

--- 289.8026008605957 seconds ---
