In [1]:
import collections
import csv
import operator
import os
import sys

from pyspark.sql import SparkSession
from pyspark.sql import functions

# NOTE(review): absolute, machine-specific path; the import below only resolves
# on a machine where the project is checked out at this exact location.
sys.path.append('/Users/abuckfire/side-projects/arson')
# Project-local lookup tables; this notebook reads codes.STATES from it.
import static_data.fire_codes.lookup_tables as codes


In [2]:
# Input locations, relative to the directory the notebook is run from.
ARSON_PATH = os.path.join("..", "static_data", "nfirs_arson")
POPULATION_PATH = os.path.join("..", "static_data", "zipcodes", "nst-est2017-popchg2010_2017.csv")

# Column names used in the arson / joined DataFrames.
COUNT = "count"  # column produced by groupBy(...).count()
STATE = "state"
POPULATION = "population"
DENSITY = "pop_density"
YEAR = "year"

# Column names in the population CSV; the year is appended to POPULATION_FIELD
# to select a given year's estimate (e.g. POPULATION_FIELD + "2010").
POP_STATE = "NAME"
POPULATION_FIELD = "POPESTIMATE"

# Arson CSV files are named FILE_PREFIX + <year> + ".csv" under ARSON_PATH.
FILE_PREFIX = "nfirs_arson_"

# Years 2009 through 2014 as strings. Materialised as a list so it can be
# iterated more than once: a bare map() is a one-shot iterator on Python 3.
YEARS_AVAILABLE = list(map(str, range(2009, 2015)))

# Column order of the density DataFrame built by calculate_arson_density.
FIELDS = [STATE, COUNT, POPULATION, YEAR, DENSITY]


In [3]:
# Shared Spark session for the notebook; getOrCreate() hands back an already
# running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)


In [4]:
def calculate_arson_density(population_df, year, state_rename_udf):
    """Count one year's arson records per state and scale by state population.

    Args:
        population_df: DataFrame containing a ``POP_STATE`` column and a
            ``POPULATION_FIELD + <year>`` column for the requested year.
        year: The year as a string; it is concatenated into both the arson CSV
            file name and the population column name.
        state_rename_udf: Column function applied to the arson ``STATE`` column
            before joining against the population data on ``STATE``.

    Returns:
        The joined DataFrame with a "year" literal column and a ``DENSITY``
        column equal to ``COUNT * 100000 / POPULATION``.
    """
    csv_path = os.path.join(ARSON_PATH, FILE_PREFIX + year + ".csv")
    incidents = spark.read.csv(csv_path, header=True)
    incidents_per_state = incidents.select([STATE, "fdid"]).groupBy(STATE).count()

    # There is no 2009 population data, so 2009 borrows the 2010 estimate.
    population_year = "2010" if year == "2009" else year
    population_column = POPULATION_FIELD + population_year
    state_population = (
        population_df
        .select(POP_STATE, population_column)
        .withColumnRenamed(POP_STATE, STATE)
        .withColumnRenamed(population_column, POPULATION)
    )

    renamed_states = incidents_per_state.withColumn(
        STATE, state_rename_udf(incidents_per_state[STATE])
    )
    with_population = (
        renamed_states
        .join(state_population, STATE)
        .withColumn("year", functions.lit(year))
    )

    # Express the count per 100,000 residents.
    per_100k = with_population[COUNT] * 100000 / with_population[POPULATION]
    return with_population.withColumn(DENSITY, per_100k)


In [None]:
def add_density_to_elastic():
    """Compute the 2009 arson density per state and display the first rows.

    Reads the population CSV at ``POPULATION_PATH``, builds a UDF that maps the
    arson data's state values through ``codes.STATES``, and shows a five-row
    preview of ``calculate_arson_density`` for "2009". As written it only
    displays the result; nothing is written out.
    """
    population_df = spark.read.csv(POPULATION_PATH, header=True)

    def to_state_name(raw_state):
        # Strip before the lookup so the key that is tested is the same key
        # that is fetched; padded values previously fell through to the
        # fallback even when the stripped code was present.
        if raw_state is None:
            return None
        # Unknown codes become null rather than the integer 0, which did not
        # match the UDF's declared "string" return type.
        return codes.STATES.get(raw_state.strip()) or None

    state_rename_udf = functions.udf(to_state_name, "string")

    densities = calculate_arson_density(population_df, "2009", state_rename_udf)
    # show() prints the table itself and returns None, so it is not wrapped in
    # print (which also emitted a stray "None" and was Python 2-only syntax).
    densities.show(5)
