## Setting up Environment
Set up environment for querying

### 1. Importing packages

In [176]:
import collections
import csv
import operator
import os
import sys

from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import functions


In [2]:
spark

### 2. Setting python path for data and setting data paths

In [144]:
sys.path.append('/Users/abuckfire/side-projects/arson')
import static_data.fire_codes.lookup_tables as codes

In [22]:
years_available = map(str, range(2009, 2015))
file_prefix = "nfirs_arson_"
arson_data_path = os.path.join("..", "static_data", "nfirs_arson")
fire_depts_data_path = os.path.join("..", "static_data", "nfirs_fire_depts")
pop_by_zipcode_csv = os.path.join("..", "static_data", "zipcodes", "pop_per_zip.csv")

### 3. Initializing Spark Context and reading data in

In [5]:
sqlContext = SQLContext(sc)

In [6]:
spark = SparkSession.builder.enableHiveSupport().getOrCreate()

In [26]:
depts = spark.read.csv(fire_depts_data_path + "/nfirs_arson_" + "2009.csv", header=True).select(["state", "fdid"])
arson_df = spark.read.csv(arson_data_path + "/nfirs_arson_" + "2009.csv", header=True)

In [236]:
joined_on_zip = arson.join(depts, ['state','fdid'])

## Questions

In [237]:
COUNT = "count"
STATE = "state"
UNKNOWNS = ["UU", "NN", None]

YEARS_AVAILABLE = map(str, range(2009, 2015))
FILE_PREFIX = "nfirs_arson_"
ARSON_PATH = os.path.join("..", "static_data", "nfirs_arson")
DEPTS_PATH = os.path.join("..", "static_data", "nfirs_fire_depts")
ZIPCODES_PATH = os.path.join("..", "static_data", "zipcodes", "pop_per_zip.csv")


def lookup_code(candidate_list, column, codebook):
    candidates = [row for row in candidate_list if row[column] not in UNKNOWNS]
    if candidates:
        return codebook[max(candidates, key=operator.itemgetter(1))[0]]
    return None

def build_max_count_dicts(df, column, codebook, state_codes=STATES):
    results = collections.defaultdict()
    for state in state_codes:
        count_list = df[df.state==state].select(column, COUNT).collect()
        results[state] = lookup_code(count_list, column, codebook)

    overall_res_df = df.groupBy(column).agg(functions.sum(COUNT))
    results["overall"] = lookup_code(overall_res_df, column, codebook)
    return results


In [246]:
def get_motives(df, column="motivation"):
    df1 = df.groupBy(["mot_facts1", STATE]).count().withColumnRenamed("mot_facts1", column).withColumnRenamed(COUNT, "count_1")
    df2 = df.groupBy(["mot_facts2", STATE]).count().withColumnRenamed("mot_facts2", column).withColumnRenamed(COUNT, "count_2")
    df3 = df.groupBy(["mot_facts3", STATE]).count().withColumnRenamed("mot_facts3", column).withColumnRenamed(COUNT, "count_3")

    motives = df1.join(df2, [column, STATE], "outer").join(df3, [column, STATE], "outer").na.fill(0)

    motives_per_state = motives.withColumn(COUNT, sum(motives[col] for col in ["count_1", "count_2", "count_3"]))
    motives_per_state.show()
    return motives_per_state
    #return build_max_count_dicts(motives_per_state, column, codes.SUSPECTED_MOTIVATION_FACTORS)


In [247]:
arson_df = spark.read.csv(arson_data_path + "/nfirs_arson_" + "2009.csv", header=True)


In [248]:
motives_per_state = get_motives(arson_df)

+----------+-----+-------+-------+-------+-----+
|motivation|state|count_1|count_2|count_3|count|
+----------+-----+-------+-------+-------+-----+
|        13|   NJ|      1|      0|      0|    1|
|        43|   MT|      1|      0|      0|    1|
|        00|   MO|      1|      0|      0|    1|
|        14|   NJ|      3|      1|      0|    4|
|        21|   IN|      8|      3|      0|   11|
|        23|   NY|      1|      0|      0|    1|
|        13|   CA|      7|      2|      0|    9|
|        13|   DC|      2|      0|      0|    2|
|        63|   HI|      2|      0|      0|    2|
|        00|   NE|      7|      2|      0|    9|
|        12|   VA|      0|      1|      0|    1|
|        22|   MN|      1|      0|      0|    1|
|        44|   TX|      2|      1|      0|    3|
|        54|   PA|      2|      0|      0|    2|
|      null|   AZ|    508|      0|      0|  508|
|      null|   AZ|      0|    556|      0|  556|
|      null|   SC|    344|      0|      0|  344|
|      null|   SC|  

### Question 1: What is the most common motivation for starting a fire?

In [113]:
df1 = arson_df.groupBy(["mot_facts1", "state"]).count().withColumnRenamed("mot_facts1", "motivation").withColumnRenamed("count", "count_1")
df2 = arson_df.groupBy(["mot_facts2", "state"]).count().withColumnRenamed("mot_facts2", "motivation").withColumnRenamed("count", "count_2")
df3 = arson_df.groupBy(["mot_facts3", "state"]).count().withColumnRenamed("mot_facts3", "motivation").withColumnRenamed("count", "count_3")

motives = df1.join(df2, ["motivation", "state"], "outer").join(df3, ["motivation", "state"], "outer").na.fill(0)

In [198]:
motives_per_state = motives.withColumn("count", sum(motives[col] for col in ["count_1", "count_2", "count_3"]))

+----------+-----+-------+-------+-------+-----+
|motivation|state|count_1|count_2|count_3|count|
+----------+-----+-------+-------+-------+-----+
|        13|   NJ|      1|      0|      0|    1|
|        43|   MT|      1|      0|      0|    1|
|        00|   MO|      1|      0|      0|    1|
|        14|   NJ|      3|      1|      0|    4|
|        21|   IN|      8|      3|      0|   11|
|        23|   NY|      1|      0|      0|    1|
|        13|   CA|      7|      2|      0|    9|
|        13|   DC|      2|      0|      0|    2|
|        63|   HI|      2|      0|      0|    2|
|        00|   NE|      7|      2|      0|    9|
|        12|   VA|      0|      1|      0|    1|
|        22|   MN|      1|      0|      0|    1|
|        44|   TX|      2|      1|      0|    3|
|        54|   PA|      2|      0|      0|    2|
|      null|   AZ|    508|      0|      0|  508|
|      null|   AZ|      0|    556|      0|  556|
|      null|   SC|    344|      0|      0|  344|
|      null|   SC|  

In [231]:
COUNT = "count"
STATE = "state"
UNKNOWNS = ["UU", "NN", None]

def lookup_code(candidate_list, column, codebook):
    candidates = [row for row in candidate_list if row[column] not in UNKNOWNS]
    if candidates:
        return codebook[max(candidates, key=operator.itemgetter(1))[0]]
    return None

def build_max_count_dicts(df, column, codebook, state_codes=STATES):
    results = collections.defaultdict()
    for state in state_codes:
        count_list = df[df.state==state].select(column, COUNT).collect()
        results[state] = lookup_code(count_list, column, codebook)

    overall_res_df = df.groupBy(column).agg(functions.sum(COUNT))
    results["overall"] = lookup_code(overall_res_df, column, codebook)
    return results

In [232]:
STATES = ["AK", "AL", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

In [233]:
def get_motives(df, column="motivation"):
    df1 = df.groupBy(["mot_facts1", STATE]).count().withColumnRenamed("mot_facts1", column).withColumnRenamed(COUNT, "count_1")
    df2 = df.groupBy(["mot_facts2", STATE]).count().withColumnRenamed("mot_facts2", column).withColumnRenamed(COUNT, "count_2")
    df3 = df.groupBy(["mot_facts3", STATE]).count().withColumnRenamed("mot_facts3", column).withColumnRenamed(COUNT, "count_3")

    all_motives = df1.join(df2, [column, STATE], "outer").join(df3, [column, STATE], "outer").na.fill(0)

    motives_per_state = motives.withColumn(COUNT, sum(motives[col] for col in ["count_1", "count_2", "count_3"]))
    return build_max_count_dicts(motives_per_state, column, codes.SUSPECTED_MOTIVATION_FACTORS, STATES)


In [181]:
def get_max_value_2(df, column, codebook, states):
    results = collections.defaultdict()
    for state in states:
        count_list = df[df.state==state].select(column, "count").collect()
        candidates = [row for row in count_list if row[column] not in ["UU", "NN", None]]
        if candidates:
            results[state] = codebook[max(candidates, key=operator.itemgetter(1))[0]]
        else:
            results[state] = None
    return results

In [234]:
r = get_motives(arson_df)

ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.

In [205]:
results = get_max_value_2(motives_per_state, "motivation", codes.SUSPECTED_MOTIVATION_FACTORS, STATES)

In [207]:
motives_overall = motives_per_state.groupBy("motivation").agg(functions.sum("count"))
biggest_motive = get_max_value(motives_overall, "motivation", codes.SUSPECTED_MOTIVATION_FACTORS)
results["overall"] = biggest_motive

### Question 2: What are the most common materials used for starting a fire?

In [214]:
container = arson_df.groupBy(["state", "devi_cont"]).count()
spark_type = arson_df.groupBy(["state", "devi_ignit"]).count()
gas_type = arson_df.groupBy(["state", "devi_fuel"]).count()

In [30]:
def get_max_value(df, column, codebook):
    count_list = df.collect()
    candidates = [row for row in count_list if row[column] not in ["UU", "NN", None]]
    return codebook[max(candidates, key=operator.itemgetter(1))[0]]

In [211]:
def get_max_value_3(df, column, codebook, states):
    results = collections.defaultdict()
    for state in states:
        count_list = df[df.state==state].select(column, "count").collect()
        candidates = [row for row in count_list if row[column] not in ["UU", "NN", None]]
        if candidates:
            results[state] = codebook[max(candidates, key=operator.itemgetter(1))[0]]
        else:
            results[state] = None
    return results

In [215]:
container_results = get_max_value_3(container, "devi_cont", codes.INCENDIARY_DEVICES, STATES)

In [220]:
motives_overall_c = container.groupBy("devi_cont").agg(functions.sum("count"))
biggest_motive_c = get_max_value(motives_overall_c, "devi_cont", codes.INCENDIARY_DEVICES)
container_results["overall"] = biggest_motive_c

In [222]:
container_results

defaultdict(None,
            {'AK': 'gasoline or fuel can',
             'AL': 'plastic bottle',
             'AR': 'gasoline or fuel can',
             'AZ': 'can (not gas or fuel)',
             'CA': 'gasoline or fuel can',
             'CO': 'other container',
             'CT': 'gasoline or fuel can',
             'DE': 'gasoline or fuel can',
             'FL': 'gasoline or fuel can',
             'GA': 'gasoline or fuel can',
             'HI': 'pressurized container',
             'IA': 'gasoline or fuel can',
             'ID': 'gasoline or fuel can',
             'IL': 'gasoline or fuel can',
             'IN': 'gasoline or fuel can',
             'KS': 'gasoline or fuel can',
             'KY': 'plastic bottle',
             'LA': 'gasoline or fuel can',
             'MA': 'gasoline or fuel can',
             'MD': 'glass bottle',
             'ME': 'gasoline or fuel can',
             'MI': 'glass bottle',
             'MN': 'gasoline or fuel can',
             'MO': 'glas

### Question 3: What is the most common type of property burned?

In [266]:
property_ownership = arson_df.groupBy("prop_owner").count()
property_ownership.show()

+----------+-----+
|prop_owner|count|
+----------+-----+
|         7|    2|
|         3|  128|
|         0|  243|
|      null|47203|
|         5|   25|
|         1| 7757|
|         4|   77|
|         2|  500|
+----------+-----+



In [267]:
print("The primary property type of property affected by arson is: {}".format(
    get_max_value(property_ownership, "prop_owner", codes.PROPERTY_OWNERSHIP)))

The primary property type of property affected by arson is: private


### Question 4: Number of Arsons per Month

In [25]:
date = "inc_date"


In [26]:
def parse_date_by_month(date_string):
    month_lookup = {
        "1": "January",
        "2": "February",
        "3": "March",
        "4": "April",
        "5": "May",
        "6": "June",
        "7": "July",
        "8": "August",
        "9": "September",
        "10": "October",
        "11": "November",
        "12": "December"
    }
    return month_lookup[date_string[:2] if date_string[:2] in month_lookup else date_string[0]]

parse_date_by_month_udf = functions.udf(parse_date_by_month, "string")

In [27]:
date_as_month_df = arson_df.withColumn(date, parse_date_by_month_udf(arson_df[date]))


In [28]:
arson_by_month = date_as_month_df.groupBy(date).count()