In [1]:
import pandas as pd
import time 
# Wall-clock timestamp taken at the top of the notebook; the final cell subtracts it
# from the current time to report the total run time in seconds.
start_time = time.time()

In [2]:
# drugs = pd.read_csv('arcos-ca-statewide-itemized.tsv',nrows = 1000000, sep = '\t')
# drugs = pd.read_csv('arcos-ca-statewide-itemized.tsv', sep = '\t')

# Connecting to S3

In [3]:
from pyspark import SparkContext, SparkConf
import os
# Ask the PySpark launcher to pull in the hadoop-aws package (version 2.7.4).
# NOTE(review): presumably this only takes effect if it is set before the first
# SparkContext is started in this kernel — confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:2.7.4" pyspark-shell'
!echo $JAVA_HOME
# Reuse the already-running SparkContext if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()

# Your AWS S3 Info here




## Exploring the Dataset

In [4]:
# drugs.to_csv('./arcos-ca-statewide-sample.tsv', sep = '\t', index = False)

In [5]:
# drugs = pd.read_csv('../arcos-ca-statewide-sample.tsv', sep = '\t')
# drugs.head()

## Working on sample of Drug Dataset to make a DataFrame

In [6]:
# Agg Sum - CALC_BASE_WT_IN_GM, QUANTITY
# Ingredient_Name has two options: HYDROCODONE BITARTRATE HEMIPENTAHYDRATE or OXYCODONE HYDROCHLORIDE

In [7]:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
#from pyspark.sql.functions import concat, col, lit, substring

# Get or create the SparkSession. `ss` is not referenced again in the visible cells.
# NOTE(review): presumably kept so that RDD.toDF() is available below — confirm.
ss = SparkSession.builder.getOrCreate()

## Input data: the live line reads the local sample TSV; the commented-out line below
## reads the full file from the S3 bucket in 24 partitions instead.
## Each text line is split on tabs into a list of string fields.
drug_rdd = sc.textFile('../arcos-ca-statewide-sample.tsv').map(lambda x: x.split('\t'))
#drug_rdd = sc.textFile('s3://data-systems-opioid/arcos-ca-statewide-itemized.tsv',24).map(lambda x: x.split('\t'))


def FloatSafe(value): # In case there are non-float type to be converted.
    """Convert ``value`` to ``float``, or return ``None`` if it cannot be converted.

    Args:
        value: Usually a raw text field from a delimited file; any object is accepted.

    Returns:
        The parsed float, or ``None`` when ``value`` is unparsable text
        (e.g. ``''`` or ``'abc'``) or an unsupported type (e.g. ``None``).
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: text that is not a number.
        # TypeError: a missing/None field or another non-numeric object; without this
        # a single bad record would abort the whole Spark task.
        return None

def IntegerSafe(value): # In case there are non-integer type to be converted.
    """Convert ``value`` to ``int``, or return ``None`` if it cannot be converted.

    Args:
        value: Usually a raw text field from a delimited file; any object is accepted.

    Returns:
        The parsed integer, or ``None`` when ``value`` is not integer text
        (e.g. ``''``, ``'abc'``, ``'3.5'``) or an unsupported type (e.g. ``None``).
    """
    try:
        return int(value)
    except (TypeError, ValueError):
        # ValueError: text that is not an integer literal.
        # TypeError: a missing/None field or another non-numeric object; without this
        # a single bad record would abort the whole Spark task.
        return None

# To reduce size, keep only a subset of the fields of every row:
#   - source columns 16 through 24,
#   - source columns 29 through the end,
#   - source column 11, appended as the LAST field of the trimmed row.
# Columns 0-10, 12-15 and 25-28 are dropped (they are listed below).
# persist() marks the trimmed RDD for caching, since it is read more than once
# afterwards (first() for the header, then the filter/cast pipeline).
drug_rdd = drug_rdd.map(lambda x: x[16:25] + x[29:] + [x[11]]).persist()

# Columns I removed:
# -------------------------
# 0'REPORTER_DEA_NO',
#  'REPORTER_BUS_ACT',
#  'REPORTER_NAME',
#  'REPORTER_ADDL_CO_INFO',
#  'REPORTER_ADDRESS1',
#  'REPORTER_ADDRESS2',
#  'REPORTER_CITY',
#  'REPORTER_STATE',
#  'REPORTER_ZIP',
#  'REPORTER_COUNTY'
#10'BUYER_DEA_NO',
#12 'BUYER_NAME',
#  'BUYER_ADDL_CO_INFO',
#  'BUYER_ADDRESS1',
#15'BUYER_ADDRESS2'

# 25 UNIT (0.001% of rows have values)
# 26 Action Indicator
# 27 ORDER_FORM_NO
# 28 CORRECTION_NO

# Takes header row and makes column names
col_names = drug_rdd.first()

# Removes header row.
# The header is bound as a default argument so that the filter keeps comparing
# against THIS header even if the global `col_names` is rebound later (the suicide
# cell reuses that name) and the lazy RDD is evaluated after that.
drug_rdd = drug_rdd.filter(lambda row, header=col_names: row != header)


def _cast_drug_row(row,
                   int_cols=frozenset((5, 10, 13)),
                   float_cols=frozenset((8, 9, 11, 12, 17, 21)),
                   width=23):
    """Return the first ``width`` fields of ``row`` with numeric fields converted.

    Fields whose position is in ``int_cols`` go through ``IntegerSafe``, those in
    ``float_cols`` go through ``FloatSafe``; every other field is passed through
    unchanged. Fields that fail to convert become ``None``.

    Args:
        row: List of string fields of one trimmed record (at least ``width`` long).
        int_cols: Positions to convert to ``int``.
        float_cols: Positions to convert to ``float``.
        width: Number of leading fields to keep.

    Returns:
        A new list of ``width`` values.

    Raises:
        IndexError: If ``row`` has fewer than ``width`` fields.
    """
    typed = []
    for idx in range(width):
        value = row[idx]
        if idx in int_cols:
            value = IntegerSafe(value)
        elif idx in float_cols:
            value = FloatSafe(value)
        typed.append(value)
    return typed


# Fixes variable type
drug_rdd = drug_rdd.map(_cast_drug_row)


In [8]:
# Build the DataFrame and derive the join key in one chain:
#   Year     = last four characters of TRANSACTION_DATE
#   ZIP-YEAR = "<BUYER_ZIP>-<Year>"
drug_df = (
    drug_rdd.toDF(col_names)
    .withColumn('Year', substring('TRANSACTION_DATE', -4, 4))
    .withColumn('ZIP-YEAR', concat(col("BUYER_ZIP"), lit("-"), col("Year")))
)

# Quick visual check of the key columns
drug_df.select("BUYER_ZIP", "Year", 'ZIP-YEAR').show(10)

+---------+----+----------+
|BUYER_ZIP|Year|  ZIP-YEAR|
+---------+----+----------+
|    93003|2007|93003-2007|
|    92649|2006|92649-2006|
|    92653|2006|92653-2006|
|    92113|2006|92113-2006|
|    92113|2006|92113-2006|
|    91301|2007|91301-2007|
|    92584|2007|92584-2007|
|    93402|2012|93402-2012|
|    93065|2012|93065-2012|
|    95536|2011|95536-2011|
+---------+----+----------+
only showing top 10 rows



### Drug Dataset feature engineering

In [9]:
# Because we aggregate everything, we may not have to worry about removing nulls

#converting strings to numeric values
from pyspark.ml.feature import StringIndexer

def indexStringColumns(df, cols):
    """Replace each listed column with the numeric index assigned by StringIndexer.

    Args:
        df: Input DataFrame.
        cols: Names of the columns to index, processed in the given order.

    Returns:
        A DataFrame in which every column in ``cols`` holds the StringIndexer
        output under the column's original name.
    """
    indexed = df
    for name in cols:
        tmp_name = name + "-num"
        # Fit on the current frame so the model learns this column's distinct values.
        model = StringIndexer(inputCol=name, outputCol=tmp_name).fit(indexed)
        # Write the indices to the temporary "-num" column, drop the original column,
        # then give the indexed column the original name back.
        indexed = (
            model.transform(indexed)
            .drop(name)
            .withColumnRenamed(tmp_name, name)
        )
    return indexed

# Replace BUYER_COUNTY with its numeric StringIndexer index.
# NOTE(review): drug_df is overwritten in place, so re-running this cell would index
# the already-indexed column again — confirm that is acceptable.
drug_df = indexStringColumns(drug_df, ["BUYER_COUNTY"])

In [10]:
from pyspark.sql.functions import udf,col
# We wrote our own method to One Hot Encode in a more PANDAS way
# This will also allow us to aggregate easier
def ourOneHotEncoder(df, col_name):
    """Add one 0/1 integer indicator column per distinct value of ``col_name``.

    The distinct values are collected to the driver and processed in sorted order.
    For each value ``v`` a column named ``"<col_name>_<v>"`` is added, holding 1
    where the row's value equals ``v`` and 0 otherwise (rows with a null value get
    0 in every indicator column). A null category gets no indicator column.

    The indicators are built from native column expressions rather than Python
    UDFs, so Spark evaluates them without shipping each row to a Python worker.

    Args:
        df: Input DataFrame.
        col_name: Name of the categorical column to encode.

    Returns:
        ``df`` with the indicator columns appended; ``col_name`` itself is kept.
    """
    # Function-scope import: keeps the helper independent of the notebook's
    # wildcard import of pyspark.sql.functions.
    from pyspark.sql.functions import col, when

    distinct_rows = df.select(col_name).distinct().collect()
    # Skip nulls: they cannot be sorted against strings or used in a column name.
    categories = sorted(row[0] for row in distinct_rows if row[0] is not None)

    for category in categories:
        indicator = when(col(col_name) == category, 1).otherwise(0).cast('int')
        df = df.withColumn('{}_{}'.format(col_name, category), indicator)
    return df

# Add one 0/1 indicator column per distinct buyer business-activity type and per
# drug name. The new columns are named "<column>_<category>" and are summed per
# ZIP-YEAR in the aggregation cell further down.
drug_df = ourOneHotEncoder(drug_df, "BUYER_BUS_ACT")
drug_df = ourOneHotEncoder(drug_df, "DRUG_NAME")

# Suicide DataFrame

In [11]:
# Read the suicide CSV from S3 (24 partitions) and split every line on commas
death_rdd = sc.textFile('s3://data-systems-opioid/CA_suicides.csv',24).map(lambda line: line.split(','))

# The first record is the header; keep it as the column names
col_names = death_rdd.first()

# Drop the header record, then cast the 4th and 5th fields to integers
death_rdd = (
    death_rdd
    .filter(lambda fields: fields != col_names)
    .map(lambda f: [f[0], f[1], f[2], IntegerSafe(f[3]), IntegerSafe(f[4])])
)

# To SQL DataFrame
death_df = death_rdd.toDF(col_names)

# Add the rate per thousand and the ZIP-YEAR key used for the merge
death_df = death_df.withColumn(
    'SUI_per_thousand',
    death_df['Count']/death_df['Population_2018'] * 1000
).withColumn(
    'ZIP-YEAR',
    concat(col("ZIP Code"), lit("-"), col("Year"))
)

## Joining drug_df and death_df

In [12]:
# Change this aggregation based on what flags we add.
# One output row per ZIP-YEAR; the expressions are concatenated in the order in
# which the result columns should appear.
drug_agg_df = drug_df.groupBy('ZIP-YEAR').agg(*(
    [
        min('BUYER_COUNTY'),  # Should only be one per ZIP
        count('BUYER_CITY'),
    ]
    + [sum(name) for name in (
        'STRENGTH',
        'QUANTITY',
        'CALC_BASE_WT_IN_GM',
        'dos_str',
    )]
    + [avg('DOSAGE_UNIT')]
    + [sum(name) for name in (
        'DRUG_NAME_HYDROCODONE',
        'DRUG_NAME_OXYCODONE',
        'BUYER_BUS_ACT_CHAIN PHARMACY',
        'BUYER_BUS_ACT_PRACTITIONER',
        'BUYER_BUS_ACT_PRACTITIONER-DW/100',
        'BUYER_BUS_ACT_PRACTITIONER-DW/275',
        'BUYER_BUS_ACT_PRACTITIONER-DW/30',
        'BUYER_BUS_ACT_RETAIL PHARMACY',
    )]
))

drug_agg_df.show(5)

+----------+-----------------+-----------------+-------------+-------------+-----------------------+------------+------------------+--------------------------+------------------------+---------------------------------+-------------------------------+--------------------------------------+--------------------------------------+-------------------------------------+----------------------------------+
|  ZIP-YEAR|min(BUYER_COUNTY)|count(BUYER_CITY)|sum(STRENGTH)|sum(QUANTITY)|sum(CALC_BASE_WT_IN_GM)|sum(dos_str)|  avg(DOSAGE_UNIT)|sum(DRUG_NAME_HYDROCODONE)|sum(DRUG_NAME_OXYCODONE)|sum(BUYER_BUS_ACT_CHAIN PHARMACY)|sum(BUYER_BUS_ACT_PRACTITIONER)|sum(BUYER_BUS_ACT_PRACTITIONER-DW/100)|sum(BUYER_BUS_ACT_PRACTITIONER-DW/275)|sum(BUYER_BUS_ACT_PRACTITIONER-DW/30)|sum(BUYER_BUS_ACT_RETAIL PHARMACY)|
+----------+-----------------+-----------------+-------------+-------------+-----------------------+------------+------------------+--------------------------+------------------------+------------

### How to aggregate each column
- BUYER_COUNTY - min (later string encode)
- BUYER_BUS_ACT - one-hot encode, sum each indicator, and divide by the ZIP's count()
- DRUG_NAME (after one-hot encoding) - sum
- count(BUYER_CITY) - do we even need this?
- sum(STRENGTH)
- sum(QUANTITY)
- sum(CALC_BASE_WT_IN_GM)
- sum(dos_str)
- Count - total count
- average (DOSAGE_UNIT)

In [13]:
# Only the join key and the two outcome columns are taken from the suicide data
death_join_df = death_df.select(['ZIP-YEAR', 'Count', 'SUI_per_thousand'])

# Left outer join: every aggregated ZIP-YEAR row is kept, matched or not
drug_death_df = drug_agg_df.join(death_join_df, on='ZIP-YEAR', how='left_outer')
drug_death_df.show(10)

+----------+-----------------+-----------------+-------------+-------------+-----------------------+------------------+------------------+--------------------------+------------------------+---------------------------------+-------------------------------+--------------------------------------+--------------------------------------+-------------------------------------+----------------------------------+-----+--------------------+
|  ZIP-YEAR|min(BUYER_COUNTY)|count(BUYER_CITY)|sum(STRENGTH)|sum(QUANTITY)|sum(CALC_BASE_WT_IN_GM)|      sum(dos_str)|  avg(DOSAGE_UNIT)|sum(DRUG_NAME_HYDROCODONE)|sum(DRUG_NAME_OXYCODONE)|sum(BUYER_BUS_ACT_CHAIN PHARMACY)|sum(BUYER_BUS_ACT_PRACTITIONER)|sum(BUYER_BUS_ACT_PRACTITIONER-DW/100)|sum(BUYER_BUS_ACT_PRACTITIONER-DW/275)|sum(BUYER_BUS_ACT_PRACTITIONER-DW/30)|sum(BUYER_BUS_ACT_RETAIL PHARMACY)|Count|    SUI_per_thousand|
+----------+-----------------+-----------------+-------------+-------------+-----------------------+------------------+-----------

In [14]:
# Report the wall-clock seconds elapsed since start_time was recorded in the first cell.
print("--- %s seconds ---" % (time.time() - start_time))

--- 79.7850079536438 seconds ---
