In [120]:
import elastic
# Start from a clean slate: drop any existing 'arson_facts' index, then
# recreate it. ignore=[400, 404] asks the client not to raise on those HTTP
# statuses (e.g. when the index does not exist yet).
elastic.es.indices.delete(
    index='arson_facts',
    ignore=[400, 404],
)
# Project helper; presumably recreates 'arson_facts' -- confirm in elastic.py.
elastic.create_index()

In [121]:
import collections
import csv
import operator
import os
import sys

sys.path.append('/Users/abuckfire/side-projects/arson')
import static_data.fire_codes.lookup_tables as codes
import elastic

In [122]:
# Column names shared by the Spark DataFrames and the indexed documents.
COUNT = "count"                   # column produced by groupBy(...).count()
POP_STATE = "NAME"                # state column of the population CSV (renamed to STATE)
STATE = "state"
POPULATION_FIELD = "POPESTIMATE"  # prefix; the year string is appended to it
POPULATION = "population"

# Years to process, as strings because they are concatenated into file and
# column names. Materialised as a list: on Python 3 a bare map() is a one-shot
# iterator, so a second pass over it would silently yield nothing.
YEARS_AVAILABLE = list(map(str, range(2009, 2015)))

FILE_PREFIX = "nfirs_arson_"      # arson CSVs are named FILE_PREFIX + year + ".csv"
DENSITY = "pop_density"           # count * 100000 / population
YEAR = "year"

# Fields copied from each result row into the Elasticsearch document body.
FIELDS = [STATE, COUNT, POPULATION, YEAR, DENSITY]

In [130]:
# Two-letter postal code -> full state name, used to translate the arson
# data's state codes into the names used by the population data.
# Only the 50 states are listed; any other code is left unmapped.
STATES = {
    "AK": "Alaska", "AL": "Alabama", "AZ": "Arizona", "AR": "Arkansas",
    "CA": "California", "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware",
    "FL": "Florida", "GA": "Georgia", "HI": "Hawaii", "ID": "Idaho",
    "IL": "Illinois", "IN": "Indiana", "IA": "Iowa", "KS": "Kansas",
    "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine", "MD": "Maryland",
    "MA": "Massachusetts",  # was "Maine", which duplicated the "ME" entry
    "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi", "MO": "Missouri",
    "MT": "Montana", "NE": "Nebraska", "NV": "Nevada", "NH": "New Hampshire",
    "NJ": "New Jersey", "NM": "New Mexico", "NY": "New York",
    "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio", "OK": "Oklahoma",
    "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island",
    "SC": "South Carolina", "SD": "South Dakota", "TN": "Tennessee",
    "TX": "Texas", "UT": "Utah", "VT": "Vermont", "VA": "Virginia",
    "WA": "Washington", "WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming",
}

In [123]:
def add_to_index(df, fields, index, doc_type):
    """Index every row of ``df`` as one Elasticsearch document.

    Args:
        df: object whose ``collect()`` returns rows subscriptable by field name.
        fields: names of the row fields to copy into each document body.
        index: name of the target Elasticsearch index.
        doc_type: document type passed through to ``elastic.es.index``.
    """
    # collect() brings all rows to the driver; documents are sent one by one.
    for record in df.collect():
        document = {name: record[name] for name in fields}
        elastic.es.index(index=index, doc_type=doc_type, body=document)



In [127]:
def calculate_arson_density(population_df, year, state_rename_udf):
    """Compute the per-state arson row count per 100,000 population for a year.

    Reads ``FILE_PREFIX + year + ".csv"`` from ``ARSON_PATH``, counts its rows
    per state (presumably one row per incident -- confirm against the data),
    translates the state codes with ``state_rename_udf`` and joins the counts
    to that year's population column.

    Args:
        population_df: DataFrame with a ``POP_STATE`` column and one
            ``POPULATION_FIELD + <year>`` column per year.
        year: the year as a string (e.g. ``"2011"``); it is concatenated into
            the file name and the population column name.
        state_rename_udf: column function applied to the arson ``STATE``
            column so that its values match ``population_df``'s state names.

    Returns:
        DataFrame with the columns ``STATE``, ``COUNT``, ``POPULATION``,
        ``YEAR`` and ``DENSITY``. States without a match on either side are
        dropped because the join uses Spark's default (inner) join type.
    """
    arson_csv = os.path.join(ARSON_PATH, FILE_PREFIX + year + ".csv")
    arson_df = spark.read.csv(arson_csv, header=True)\
                .select([STATE, "fdid"])\
                .groupBy(STATE).count()

    # adjusting for lack of 2009 population data: reuse the 2010 column
    population_year = year if year != "2009" else "2010"
    population_column = POPULATION_FIELD + population_year
    population_for_year = population_df.select(POP_STATE, population_column)\
                                       .withColumnRenamed(POP_STATE, STATE)\
                                       .withColumnRenamed(population_column, POPULATION)

    # Use the YEAR constant (not a "year" literal) so the column name stays in
    # step with FIELDS, which is what later reads this column back by name.
    joined_data = arson_df.withColumn(STATE, state_rename_udf(arson_df[STATE]))\
                          .join(population_for_year, STATE)\
                          .withColumn(YEAR, functions.lit(year))

    return joined_data.withColumn(DENSITY, joined_data[COUNT] * 100000 / joined_data[POPULATION])

In [129]:
def add_density_to_elastic():
    """Compute arson density for every available year and index the results.

    Reads the population CSV at ``POPULATION_PATH`` once, then for each year in
    ``YEARS_AVAILABLE`` builds the density DataFrame and indexes its rows into
    the ``arson_facts`` index with document type ``arson_density``.
    """
    population_df = spark.read.csv(POPULATION_PATH, header=True)

    def _state_name(code):
        # Translate a state code to its full name, or None when unknown.
        # Strip BEFORE the lookup: the previous version tested the raw value
        # but indexed with the stripped one, so padded codes never matched.
        # Returning None (rather than the integer 0) also keeps the result
        # consistent with the UDF's declared "string" return type.
        if code is None:
            return None
        return STATES.get(code.strip())

    state_rename_udf = functions.udf(_state_name, "string")

    for year in YEARS_AVAILABLE:
        densities = calculate_arson_density(population_df, year, state_rename_udf)
        add_to_index(densities, FIELDS, "arson_facts", "arson_density")


In [131]:
# Run the load for every year in YEARS_AVAILABLE.
add_density_to_elastic()