In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import collections
import csv
from datetime import datetime as dt
import operator
import os
import sys

from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import functions

import elastic
from lookup_tables import STATES, SUSPECTED_MOTIVATION_FACTORS, IGNITION_DELAY_DEVICE, PROPERTY_OWNERSHIP, MONTHS


# Column names shared between the arson data and the population data.
STATE = "state"
POP_STATE = "NAME"
POPULATION_FIELD = "POPESTIMATE"
POPULATION = "population"
COUNT = "count"

# Codes treated as "unknown" values.
UNKNOWNS = ["UU", "NN", None]

# Materialised as a list: on Python 3 a bare ``map(...)`` is a one-shot
# iterator, so a second pass over it (e.g. re-running a cell) would see no years.
YEARS_AVAILABLE = [str(year) for year in range(2009, 2015)]
FILE_PREFIX = "nfirs_arson_"
ARSON_PATH = os.path.join("..", "static_data", "nfirs_arson")

# Field lists keyed by fact type.
FIELDS = {
    "motives": [STATE, "motive", "year"],
    "method": [STATE, "method", "year"],
    "ownership": [STATE, "ownership", "year"],
    "monthly_counts": [STATE, "month", "count", "year"],
    "arson_density": [STATE, "count", "population", "year", "pop_density"],
}


def parse_date_by_month(date_string):
    """Translate a raw incident-date string into its MONTHS lookup value.

    Two layouts are recognised:

    * strings containing "-" are parsed as ``%Y-%m-%dT%H:%M:%S`` timestamps and
      the month number (as a string, no zero padding) is used as the key;
    * any other string is assumed to start with the month: the first two
      characters are used if they are a key of MONTHS, otherwise the first
      character is used.

    Args:
        date_string: raw date value; may be None or empty for a missing date.

    Returns:
        The MONTHS entry for the month, or None when ``date_string`` is None
        or empty.

    Raises:
        KeyError: if the derived key is not present in MONTHS.
        ValueError: if a "-"-containing string does not match the timestamp
            format.
    """
    # This function is wrapped as a Spark UDF; null column values reach it as
    # None, and a TypeError/IndexError here would fail the whole job.
    if not date_string:
        return None

    if "-" in date_string:
        month_key = str(dt.strptime(date_string, "%Y-%m-%dT%H:%M:%S").month)
    else:
        # NOTE(review): for digit-only dates without zero padding (e.g.
        # "1152009") the two-character prefix "11" would win over "1" --
        # confirm the input format rules this out.
        prefix = date_string[:2]
        month_key = prefix if prefix in MONTHS else date_string[0]
    return MONTHS[month_key]

    
def get_arson_per_month(df, column="inc_date"):
    """Count incidents per month, both per state and across all states.

    The values in ``column`` are first replaced by the result of
    ``parse_date_by_month``. Rows are then counted per (month, state) and per
    month alone; the month-only totals get the literal state "overall".

    Args:
        df: DataFrame holding at least ``column`` and the STATE column.
        column: name of the date column to convert and group on.

    Returns:
        DataFrame with columns (``column``, STATE, COUNT): the "overall" rows
        followed by the per-state rows.
    """
    to_month = functions.udf(parse_date_by_month, "string")
    monthly = df.withColumn(column, to_month(df[column]))

    per_state = monthly.groupBy([column, STATE]).count()

    totals = monthly.groupBy(column).count()
    totals = totals.withColumn(STATE, functions.lit("overall"))
    # union() matches columns by position, so align with per_state's layout.
    totals = totals.select(column, STATE, COUNT)

    return totals.union(per_state)

def ingest_facts_into_es():
    """Read each year's arson CSV and reduce it to per-month incident counts.

    For every entry of YEARS_AVAILABLE the file
    ``FILE_PREFIX + year + ".csv"`` under ARSON_PATH is read with a header row
    and passed through ``get_arson_per_month``.

    Returns:
        list: one DataFrame per year, in YEARS_AVAILABLE order.
    """
    # NOTE(review): despite the name, nothing in this function writes to
    # Elasticsearch; it only builds the per-year DataFrames.
    spark = SparkSession.builder.enableHiveSupport().getOrCreate()

    dfs = []
    for year in YEARS_AVAILABLE:
        csv_path = os.path.join(ARSON_PATH, FILE_PREFIX + year + ".csv")
        yearly_df = spark.read.csv(csv_path, header=True)
        dfs.append(get_arson_per_month(yearly_df))

    return dfs

In [2]:
# One per-month arson-count DataFrame per year in YEARS_AVAILABLE.
dfs = ingest_facts_into_es()


In [3]:
def aggregate_overall(dfs, years=["2009", "2010", "2011", "2012", "2013", "2014"]):
    """Combine the "overall" monthly counts of several years into one table.

    For each (DataFrame, year) pair the rows whose state is "overall" are
    summed per ``inc_date`` and the summed column is renamed to the year. The
    per-year results are then outer-joined on ``inc_date``.

    Args:
        dfs: iterable of DataFrames with ``inc_date``, ``state`` and ``count``
            columns.
        years: labels paired positionally with ``dfs``; each becomes a column
            name in the result. Pairing stops at the shorter of the two.

    Returns:
        The joined DataFrame (``inc_date`` plus one column per year), or None
        when there is nothing to aggregate.
    """
    by_months = None
    for df, year in zip(dfs, years):
        yearly_totals = (
            df[df.state == "overall"]
            .groupBy("inc_date")
            # Name the column explicitly instead of summing every numeric one.
            .sum("count")
            .withColumnRenamed("sum(count)", year)
        )
        # Compare against None rather than relying on DataFrame truthiness.
        if by_months is not None:
            by_months = by_months.join(yearly_totals, ["inc_date"], "outer")
        else:
            by_months = yearly_totals

    return by_months

In [5]:
# Outer-join every year's "overall" monthly totals on inc_date (one column per year).
counts = aggregate_overall(dfs)

In [None]:
# Print the joined table to the cell output (DataFrame.show displays up to 20 rows by default).
counts.show()