In [1]:
import pandas as pd
import time 
# Wall-clock reference point; the final cell prints the seconds elapsed since this line ran.
start_time = time.time()

In [2]:
# drugs = pd.read_csv('arcos-ca-statewide-itemized.tsv',nrows = 1000000, sep = '\t')
# drugs = pd.read_csv('arcos-ca-statewide-itemized.tsv', sep = '\t')

# Connecting to S3

In [3]:
from pyspark import SparkContext, SparkConf
import os

# SECURITY: AWS credentials must never be committed in a notebook. They are read from the
# environment instead. Any key pair that was previously embedded in this cell should be
# treated as compromised and revoked/rotated in IAM.
aws_access_key = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
if not aws_access_key or not aws_secret_key:
    raise RuntimeError(
        "Set the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables "
        "before starting the notebook kernel."
    )

# The hadoop-aws package provides the S3A filesystem class configured below. The submit
# arguments have to be in the environment before the SparkContext is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:2.7.4" pyspark-shell'

# Diagnostic only: show which Java installation the environment points at.
print(os.environ.get('JAVA_HOME', ''))

sc = SparkContext.getOrCreate()

# Route s3:// URLs through the S3A implementation and hand it the credentials.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("fs.s3a.access.key", aws_access_key)
hadoop_conf.set("fs.s3a.secret.key", aws_secret_key)




## Exploring the Dataset

In [4]:
# drugs = pd.read_csv('../arcos-ca-statewide-sample.tsv', sep = '\t')
# drugs.head()

## Working on sample of Drug Dataset to make a DataFrame

In [5]:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F
#from pyspark.sql.functions import concat, col, lit, substring

# Obtain the active SparkSession (or create one).
ss = SparkSession.builder.getOrCreate()

## Data source
# The active line reads the local sample file and splits every line on tabs.
# The commented-out line below reads the statewide itemized file from the S3 bucket instead.
drug_rdd = (
    sc.textFile('../arcos-ca-statewide-sample.tsv')
    .map(lambda line: line.split('\t'))
)
#drug_rdd = sc.textFile('s3://data-systems-opioid/arcos-ca-statewide-itemized.tsv',24).map(lambda x: x.split('\t'))


def FloatSafe(value):
    """Convert ``value`` to ``float``, or return ``None`` when it cannot be converted.

    Args:
        value: Anything ``float()`` accepts (typically a text field from the TSV/CSV).

    Returns:
        The parsed float, or ``None`` for unparsable text (e.g. ``''``) and for
        non-convertible objects such as ``None``.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: text that is not a number; TypeError: None or another
        # object float() cannot handle. Both mean "no usable value".
        return None

def IntegerSafe(value):
    """Convert ``value`` to ``int``, or return ``None`` when it cannot be converted.

    Args:
        value: Anything ``int()`` accepts (typically a text field from the TSV/CSV).

    Returns:
        The parsed integer, or ``None`` for text that is not an integer literal
        (e.g. ``''`` or ``'4.2'``) and for non-convertible objects such as ``None``.
    """
    try:
        return int(value)
    except (TypeError, ValueError):
        # ValueError: text that is not an integer; TypeError: None or another
        # object int() cannot handle. Both mean "no usable value".
        return None

# To reduce size, keep only source columns 16-24 and 29 onward, and append source
# column 11 as the last field. Columns 0-10, 12-15 and 25-28 are discarded (see list below).
drug_rdd = drug_rdd.map(
    lambda fields: [*fields[16:25], *fields[29:], fields[11]]
).persist()

# Columns I removed:
# -------------------------
# 0'REPORTER_DEA_NO',
#  'REPORTER_BUS_ACT',
#  'REPORTER_NAME',
#  'REPORTER_ADDL_CO_INFO',
#  'REPORTER_ADDRESS1',
#  'REPORTER_ADDRESS2',
#  'REPORTER_CITY',
#  'REPORTER_STATE',
#  'REPORTER_ZIP',
#  'REPORTER_COUNTY'
#10'BUYER_DEA_NO',
#12 'BUYER_NAME',
#  'BUYER_ADDL_CO_INFO',
#  'BUYER_ADDRESS1',
#15'BUYER_ADDRESS2'

# 25 UNIT (0.001% of rows have values)
# 26 Action Indicator
# 27 ORDER_FORM_NO
# 28 CORRECTION_NO

# The first record of the trimmed RDD is the header row; its fields become the column names.
col_names = drug_rdd.first()

# Remove the header record. The header list is bound as a lambda default so the filter
# keeps comparing against *this* list. RDD transformations are lazy, and a lambda that
# merely referenced the global `col_names` would pick up whatever that name points to
# when the RDD is eventually evaluated (a later cell reassigns `col_names`).
drug_rdd = drug_rdd.filter(lambda row, header=col_names: row != header)

# Cast the numeric fields; every other field stays a string. Values that cannot be
# parsed become None (see IntegerSafe / FloatSafe).
drug_rdd = drug_rdd.map(lambda x: [
    x[0],                # BUYER_CITY
    x[1],                # BUYER_STATE
    x[2],                # BUYER_ZIP
    x[3],                # BUYER_COUNTY
    x[4],                # TRANSACTION_CODE
    IntegerSafe(x[5]),   # DRUG_CODE
    x[6],                # NDC_NO
    x[7],                # DRUG_NAME
    FloatSafe(x[8]),     # QUANTITY
    FloatSafe(x[9]),     # STRENGTH
    IntegerSafe(x[10]),  # TRANSACTION_DATE (kept as an integer; only its last 4 digits are used later)
    FloatSafe(x[11]),    # CALC_BASE_WT_IN_GM
    FloatSafe(x[12]),    # DOSAGE_UNIT
    IntegerSafe(x[13]),  # TRANSACTION_ID
    x[14],               # Product_Name
    x[15],               # Ingredient_Name
    x[16],               # Measure
    FloatSafe(x[17]),    # MME_Conversion_Factor
    x[18],               # Combined_Labeler_Name
    x[19],               # Revised_Company_Name
    x[20],               # Reporter_family
    FloatSafe(x[21]),    # dos_str
    x[22],               # BUYER_BUS_ACT
])


In [6]:
# Show the header-derived column names currently held in `col_names`.
print(col_names)

['BUYER_CITY', 'BUYER_STATE', 'BUYER_ZIP', 'BUYER_COUNTY', 'TRANSACTION_CODE', 'DRUG_CODE', 'NDC_NO', 'DRUG_NAME', 'QUANTITY', 'STRENGTH', 'TRANSACTION_DATE', 'CALC_BASE_WT_IN_GM', 'DOSAGE_UNIT', 'TRANSACTION_ID', 'Product_Name', 'Ingredient_Name', 'Measure', 'MME_Conversion_Factor', 'Combined_Labeler_Name', 'Revised_Company_Name', 'Reporter_family', 'dos_str', 'BUYER_BUS_ACT']


In [7]:
# # This cell is just for Type testing
# row_test = drug_rdd.takeSample(1, 1)[0]
# for i in range(len(row_test)):
#     print(i, col_names[i], row_test[i], type(row_test[i]))

In [8]:
# Build the DataFrame and add the ZIP-YEAR join key in one chain:
#   Year     = last four characters of TRANSACTION_DATE
#   ZIP-YEAR = "<BUYER_ZIP>-<Year>"
drug_df = (
    drug_rdd.toDF(col_names)
    .withColumn('Year', substring('TRANSACTION_DATE', -4,4))
    .withColumn('ZIP-YEAR', concat(col("BUYER_ZIP"), lit("-"), col("Year")))
)

# Quick look at the key and the two columns it is derived from.
drug_df.select("BUYER_ZIP","Year",'ZIP-YEAR').show(10)

+---------+----+----------+
|BUYER_ZIP|Year|  ZIP-YEAR|
+---------+----+----------+
|    93003|2007|93003-2007|
|    92649|2006|92649-2006|
|    92653|2006|92653-2006|
|    92113|2006|92113-2006|
|    92113|2006|92113-2006|
|    91301|2007|91301-2007|
|    92584|2007|92584-2007|
|    93402|2012|93402-2012|
|    93065|2012|93065-2012|
|    95536|2011|95536-2011|
+---------+----+----------+
only showing top 10 rows



### Drug Dataset feature engineering

In [9]:
# Because we aggregate everything, we may not have to worry about removing nulls

#converting strings to numeric values
from pyspark.ml.feature import StringIndexer

def indexStringColumns(df, cols):
    """Replace each listed column with its StringIndexer output.

    For every name in ``cols`` a StringIndexer is fitted on the current frame, the
    indexed values are written to a temporary ``<name>-num`` column, the original
    column is dropped, and the temporary column is renamed back to ``<name>``.

    Args:
        df: DataFrame containing the columns to index.
        cols: Iterable of column names to index, processed in order.

    Returns:
        A DataFrame in which every listed column holds the indexed values.
    """
    indexed_df = df
    for column in cols:
        indexed_name = column + "-num"
        # Fitting lets the indexer learn the distinct values of the column.
        indexer_model = StringIndexer(inputCol=column, outputCol=indexed_name).fit(indexed_df)
        indexed_df = (
            indexer_model.transform(indexed_df)
            .drop(column)
            .withColumnRenamed(indexed_name, column)
        )
    return indexed_df

drug_df = indexStringColumns(drug_df, ["BUYER_COUNTY", "Revised_Company_Name","DRUG_NAME", "BUYER_BUS_ACT"])

In [10]:
# One hot encoding
from pyspark.ml.feature import OneHotEncoder
def oneHotEncodeColumns(df, cols):
    """Replace each listed (already indexed) column with its one-hot encoded vector.

    For every name in ``cols`` the encoded vector is written to a temporary
    ``<name>-onehot`` column, the original column is dropped, and the temporary
    column is renamed back to ``<name>``.

    Args:
        df: DataFrame whose listed columns hold category indices.
        cols: Iterable of column names to encode, processed in order.

    Returns:
        A DataFrame in which every listed column holds the one-hot vector.
    """
    encoded_df = df
    for c in cols:
        encoded_name = c + "-onehot"
        # dropLast=False keeps one vector slot per category (the default drops the last one).
        encoder = OneHotEncoder(inputCol=c, outputCol=encoded_name, dropLast=False)
        # Spark 3.x: OneHotEncoder is an Estimator and must be fitted to get a model.
        # Spark 2.x: it is a plain Transformer with no fit(), so it is used directly.
        transformer = encoder.fit(encoded_df) if hasattr(encoder, "fit") else encoder
        encoded_df = (
            transformer.transform(encoded_df)
            .drop(c)
            .withColumnRenamed(encoded_name, c)
        )
    return encoded_df

drug_df = oneHotEncodeColumns(drug_df, ["DRUG_NAME", "BUYER_BUS_ACT"])

In [11]:
# Preview the encoded columns: the first two were string-indexed, the last two were
# string-indexed and then one-hot encoded.
drug_df.select("BUYER_COUNTY", "Revised_Company_Name","DRUG_NAME", "BUYER_BUS_ACT").show()

+------------+--------------------+-------------+-------------+
|BUYER_COUNTY|Revised_Company_Name|    DRUG_NAME|BUYER_BUS_ACT|
+------------+--------------------+-------------+-------------+
|        11.0|                56.0|(2,[0],[1.0])|(6,[2],[1.0])|
|         1.0|                 1.0|(2,[0],[1.0])|(6,[1],[1.0])|
|         1.0|                 6.0|(2,[0],[1.0])|(6,[1],[1.0])|
|         2.0|                 2.0|(2,[0],[1.0])|(6,[1],[1.0])|
|         2.0|                 0.0|(2,[0],[1.0])|(6,[1],[1.0])|
|         0.0|                 0.0|(2,[0],[1.0])|(6,[1],[1.0])|
|         3.0|                 3.0|(2,[1],[1.0])|(6,[1],[1.0])|
|        22.0|                 2.0|(2,[0],[1.0])|(6,[1],[1.0])|
|        11.0|                 2.0|(2,[0],[1.0])|(6,[1],[1.0])|
|        28.0|                 0.0|(2,[0],[1.0])|(6,[1],[1.0])|
|        28.0|                 0.0|(2,[0],[1.0])|(6,[1],[1.0])|
|        28.0|                 0.0|(2,[0],[1.0])|(6,[1],[1.0])|
|        28.0|                 0.0|(2,[0

In [12]:
# Inspect the BUYER_BUS_ACT value of the first row (a one-hot vector after the encoding above).
drug_df.select("BUYER_BUS_ACT").first()[0]

SparseVector(6, {2: 1.0})

# Suicide DataFrame

In [13]:
# Read the suicide CSV from S3 into 24 partitions and split every line on commas.
# NOTE(review): a plain split would break on quoted fields containing commas — confirm the file has none.
death_rdd = sc.textFile('s3://data-systems-opioid/CA_suicides.csv',24).map(lambda line: line.split(','))

# The header record supplies the column names. It is stored under its own name so the
# drug dataset's `col_names` is not overwritten: the drug RDD's lazily evaluated header
# filter and the column-inspection cells still refer to that global.
death_col_names = death_rdd.first()

# Remove the header record; the header is bound eagerly as a lambda default.
death_rdd = death_rdd.filter(lambda row, header=death_col_names: row != header)

# Fields 3 and 4 are parsed as integers (None when unparsable); fields 0-2 stay strings.
death_rdd = death_rdd.map(lambda x: [x[0], x[1], x[2], IntegerSafe(x[3]), IntegerSafe(x[4])])

# To SQL DataFrame, plus the rate per thousand residents.
death_df = death_rdd.toDF(death_col_names)
death_df = death_df.withColumn('SUI_per_thousand', death_df['Count']/death_df['Population_2018'] * 1000)

# Set up for the ZIP-YEAR merge: key is "<ZIP Code>-<Year>".
death_df = death_df.withColumn('ZIP-YEAR', F.concat(F.col("ZIP Code"), F.lit("-"), F.col("Year")))

## Joining drug_df and death_df

In [14]:
# Aggregate the drug rows to one row per ZIP-YEAR. Change this aggregation based on what
# flags we add. The Spark functions are called through the explicit `F.` namespace so
# this cell does not depend on a wildcard import shadowing Python's built-in min/sum.
drug_agg_df = drug_df.groupBy('ZIP-YEAR').agg(
    F.min('BUYER_COUNTY'), # Should only be one per ZIP
    F.count('BUYER_CITY'),
    F.sum('STRENGTH'),
    F.sum('QUANTITY'),
    F.sum('CALC_BASE_WT_IN_GM'),
    F.sum('dos_str'),
    F.avg('DOSAGE_UNIT'),
    # sum('DRUG_NAME') NOT ABLE TO aggregate due to index issue!
)
drug_agg_df.show(5)

+----------+-----------------+-----------------+-------------+-------------+-----------------------+------------+------------------+
|  ZIP-YEAR|min(BUYER_COUNTY)|count(BUYER_CITY)|sum(STRENGTH)|sum(QUANTITY)|sum(CALC_BASE_WT_IN_GM)|sum(dos_str)|  avg(DOSAGE_UNIT)|
+----------+-----------------+-----------------+-------------+-------------+-----------------------+------------+------------------+
|93313-2007|              9.0|              218|          0.0|        529.0|             559.139049|     2042.85|510.55045871559633|
|91790-2008|              0.0|              193|          0.0|        841.0|      908.5731599999999|      2452.5|487.04663212435236|
|92008-2007|              2.0|              126|          0.0|        387.0|              383.30595|      1370.0|  513.968253968254|
|92833-2007|              1.0|              132|          0.0|        210.0|               325.9797|      1575.0|377.27272727272725|
|91910-2012|              2.0|              416|      28000.0|       

In [15]:
# Only the join key and the two outcome columns are needed from the death data.
death_join_df = death_df.select('ZIP-YEAR', 'Count', 'SUI_per_thousand')

# Left outer join on ZIP-YEAR: every aggregated drug row is kept.
drug_death_df = drug_agg_df.join(death_join_df, on='ZIP-YEAR', how='left_outer')
drug_death_df.show(10)

+----------+-----------------+-----------------+-------------+-------------+-----------------------+------------------+------------------+-----+--------------------+
|  ZIP-YEAR|min(BUYER_COUNTY)|count(BUYER_CITY)|sum(STRENGTH)|sum(QUANTITY)|sum(CALC_BASE_WT_IN_GM)|      sum(dos_str)|  avg(DOSAGE_UNIT)|Count|    SUI_per_thousand|
+----------+-----------------+-----------------+-------------+-------------+-----------------------+------------------+------------------+-----+--------------------+
|90026-2007|              0.0|               88|          0.0|        142.0|     271.17572900000005|           1172.85| 341.3636363636364|    0|                 0.0|
|90031-2009|              0.0|               27|        100.0|         41.0|     50.655849999999994|             215.0| 470.3703703703704|    1| 0.02557348541032657|
|90211-2010|              0.0|              224|          0.0|       2917.0|      713.4047425750001|         3209.8355| 363.8482142857143|    1| 0.12470382840753212|
|905

In [16]:
# Report the wall-clock time since `start_time` was recorded in the first cell.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

--- 70.15643072128296 seconds ---
