In [17]:
import pandas as pd
import os

pd.set_option("display.max_columns", 25)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", 5000)

DATSET_PATH = os.path.join(os.path.abspath("."), "datasets/911/911_metadata.csv")
AGG_CAT = os.path.join(os.path.abspath("."), "datasets/911/agg_cat.json")
AGG_CLUST = os.path.join(os.path.abspath("."), "datasets/911/agg_clust.json")

In [18]:
%time
df = pd.read_csv(DATSET_PATH)

head = df.head()
description = df.describe()

CPU times: user 2 μs, sys: 0 ns, total: 2 μs
Wall time: 5.25 μs


In [19]:
print(head)
print(description)

   id  event_id  \
0   0         2   
1   1         8   
2   2         9   
3   3        10   
4   4        11   

                                                                                                                  link  \
0             https://web.archive.org/web/20150417085342/http://mp3.911dispatch.com.s3.amazonaws.com/detroit_911_1.mp3   
1      https://web.archive.org/web/20150417085342/http://mp3.911dispatch.com.s3.amazonaws.com/hernlen_choufani_911.mp3   
2               https://web.archive.org/web/20150417085342/http://mp3.911dispatch.com.s3.amazonaws.com/watauga_911.mp3   
3  https://web.archive.org/web/20150417085342/http://mp3.911dispatch.com.s3.amazonaws.com/wamsley_to_douglascounty.mp3   
4             https://web.archive.org/web/20150417085342/http://mp3.911dispatch.com.s3.amazonaws.com/call_for_date.mp3   

                              title  date     state  deaths  potential_death  \
0  Detroit Child’s 911 Call – audio  2/06  Michigan     1.0             

In [20]:
%time
columns = df.columns
df_len = len(df)
unique_titles = df["title"].unique()

CPU times: user 2 μs, sys: 1e+03 ns, total: 3 μs
Wall time: 4.53 μs


In [21]:
print("Columns:\n", columns)
print("Len: ", df_len)
print("Titles:\n", unique_titles, len(unique_titles))

Columns:
 Index(['id', 'event_id', 'link', 'title', 'date', 'state', 'deaths',
       'potential_death', 'false_alarm', 'description', 'deaths_binary',
       'break', 'filename'],
      dtype='object')
Len:  710
Titles:
 ['Detroit Child’s 911 Call – audio' 'Girl’s Murder 911 Call'
 '‘Shoot Her?’ 911 call' 'Snowstorm 911 Call' '911 Call for a Date'
 'Murder 911 Call' 'Jon Benet 911 Call' 'Murder 911 call – Wash.'
 'Murder 911 Call – SC' 'Maryland fire 911' 'Baby not breathing'
 'Crash 911 Call – Colo.' 'Fire rescue 911 call-Md.' 'Carwash 911 – Mich.'
 'Wisc. Nanny 911 call' 'Officer’s MJ call – Mich.'
 'Columbus drowning 911' 'Courthouse shooting' 'River Phoenix 911 call'
 'Wheelchair ride – Mich.' 'Festival crash – DC' 'LA hospital 911 calls'
 'Murder-suicide – Wisc.' 'Stabbing 911' 'Murder 911' 'Pool save – Colo.'
 'Stabbing attack 911' 'Non-emerg 911’s' 'Murder 911 call'
 'Pellet gun shooting – Wisc.' '911 from closet – Fla.'
 'Baby Delivery 911 – Tex.' 'Double murder – Ken.'
 'Chil

In [22]:
%time
from sklearn.cluster import DBSCAN

proc_df = df[df["false_alarm"] == 0][
    ["title", "deaths", "potential_death", "false_alarm"]
].fillna(0)

dbscan = DBSCAN()

res = dbscan.fit_predict(proc_df[["deaths", "potential_death", "false_alarm"]])
proc_df["clustered"] = dbscan.labels_

CPU times: user 1e+03 ns, sys: 1 μs, total: 2 μs
Wall time: 4.53 μs


In [23]:
for v in proc_df["clustered"].unique():
    print(proc_df[proc_df["clustered"] == v])
    print("\n\n")

                                title  deaths  potential_death  false_alarm  \
0    Detroit Child’s 911 Call – audio     1.0              1.0          0.0   
6                  Jon Benet 911 Call     1.0              1.0          0.0   
8                Murder 911 Call – SC     1.0              1.0          0.0   
16              Columbus drowning 911     1.0              1.0          0.0   
18             River Phoenix 911 call     1.0              1.0          0.0   
21              LA hospital 911 calls     1.0              1.0          0.0   
23                       Stabbing 911     1.0              1.0          0.0   
24                         Murder 911     1.0              1.0          0.0   
33                    Murder 911 call     1.0              1.0          0.0   
38             Child abduction (Ohio)     1.0              1.0          0.0   
40                       Missing baby     1.0              1.0          0.0   
42               CPR by phone – Tenn.     1.0       

In [24]:
%time
most_fre_incedents = [
    'shooting', 'murder', 'burglary',
    'fire', 'crash', 'murder-suicide', 
    'stabbing', 'robbery', 'invasion',
]


def apply_func(row):
    for inc in most_fre_incedents:
        if row.find(inc) != -1:
            return inc


proc_df["category"] = proc_df["title"].apply(apply_func)
proc_df = proc_df.dropna()

CPU times: user 2 μs, sys: 1 μs, total: 3 μs
Wall time: 5.96 μs


In [9]:
proc_df

Unnamed: 0,title,deaths,potential_death,false_alarm,clustered,category
9,Maryland fire 911,5.0,1.0,0.0,2,fire
17,Courthouse shooting,2.0,1.0,0.0,1,shooting
20,Festival crash – DC,0.0,1.0,0.0,3,crash
37,Double murder – Ken.,2.0,1.0,0.0,1,murder
45,Teen car shooting – Colo.,1.0,1.0,0.0,0,shooting
48,Taxi murders – Tex.,2.0,1.0,0.0,1,murder
50,Midair plane crash – Calif.,5.0,1.0,0.0,2,crash
53,Pre-murder 911 – Calif.,1.0,1.0,0.0,0,murder
55,Accidental shooting – Fla.,1.0,1.0,0.0,0,shooting
62,Hostage-murder,3.0,1.0,0.0,6,murder


In [10]:
%time
agg_by_cat_df = proc_df.groupby("category").agg(
    incedent_num=("category", "count"),
    death_num=("deaths", "sum"),
    mean_death_num=("deaths", "mean"),
    false_alarm=("false_alarm", "mean"),
    clusters=("clustered", lambda x: list(x)),
    max_cluster=("clustered", pd.Series.mode),
).reset_index()

agg_by_clust_df = proc_df.groupby("clustered").agg(
    incedent_num=("clustered", "count"),
    death_num=("deaths", "sum"),
    mean_death_num=("deaths", "mean"),
    false_alarm=("false_alarm", "mean"),
    titles=("title", lambda x: list(x)),
    categories=("category", lambda x: list(x)),
    max_category=("category", pd.Series.mode),
).reset_index()

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.58 μs


In [11]:
print(agg_by_cat_df)
print(agg_by_clust_df)

   category  incedent_num  death_num  mean_death_num  false_alarm  \
0  burglary             8        0.0        0.000000          0.0   
1     crash            23       87.0        3.782609          0.0   
2      fire            20       46.0        2.300000          0.0   
3  invasion             5        2.0        0.400000          0.0   
4    murder            50      109.0        2.180000          0.0   
5   robbery            11        1.0        0.090909          0.0   
6  shooting            96      208.0        2.166667          0.0   
7  stabbing             4        4.0        1.000000          0.0   

                                                                                                                                                                                                                                                                                                clusters  \
0                                                                             

In [12]:
%time
agg_by_cat_df.to_json(AGG_CAT)
agg_by_clust_df.to_json(AGG_CLUST)

CPU times: user 2 μs, sys: 1 μs, total: 3 μs
Wall time: 5.25 μs


In [13]:
%time
res = pd.read_json(AGG_CAT)
res = res[
    (
        ((res["mean_death_num"] > 2) & (res["clusters"].__contains__(3))) |
        ((res["death_num"] < 10) & (res["clusters"].__contains__(1)))
    ) &
    ((res["incedent_num"] > 10) | (res["false_alarm"] > 0))
].sort_values("category")

CPU times: user 2 μs, sys: 1e+03 ns, total: 3 μs
Wall time: 5.25 μs


In [14]:
res

Unnamed: 0,category,incedent_num,death_num,mean_death_num,false_alarm,clusters,max_cluster
1,crash,23,87,3.782609,0,"[3, 2, 3, 3, 6, 3, 3, 1, 1, 3, 1, 1, 6, 6, -1, -1, 10, 1, 3, 3, 1, 1, 1]","[1, 3]"
2,fire,20,46,2.3,0,"[2, 0, 1, 1, 1, 1, 6, 0, 3, 0, 4, 3, -1, 3, 3, 3, 3, 10, 10, 6]",3
4,murder,50,109,2.18,0,"[1, 1, 0, 6, 1, 0, 5, -1, 0, 1, 1, 5, 5, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 6, 0, 1, 1, 0, 0, 0, 1, 10, 0, 0, -1, 5, 3, 0, 0, 1, 1, 1, 0, 1, 0, 1, 3]",0
5,robbery,11,1,0.090909,0,"[3, 4, 0, 3, 3, 3, 3, 3, 3, 3, 3]",3
6,shooting,96,208,2.166667,0,"[1, 0, 0, 1, 0, 3, 0, 1, 1, 1, 1, 0, 0, 3, 0, 3, 3, 0, 2, 0, -1, 8, 8, 6, 3, 0, 0, 0, 3, 0, 5, 0, 3, 0, 6, 3, 1, 3, 6, 6, 6, 6, 0, 3, 3, 0, 1, 3, 0, 0, 0, -1, -1, -1, 1, 1, 0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 3, 3, 0, 0, 3, 0, 3, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 3, 3, 3, 0, 0, 1, 1, -1]",0


In [15]:
%time
res = pd.read_json(AGG_CLUST)
res = res[
    res["categories"].apply(lambda x: "fire" in x and "crash" in x) & 
    (res["incedent_num"] < 30)
]

CPU times: user 3 μs, sys: 1 μs, total: 4 μs
Wall time: 5.96 μs


In [16]:
res

Unnamed: 0,clustered,incedent_num,death_num,mean_death_num,false_alarm,titles,categories,max_category
0,-1,10,174,17.4,0,"[Multiple murders, Multiple shootings, School shooting, School shooting, School shooting, Train crash, Train crash, Shooting &amp; arson fire, Mass murders, Workplace shooting]","[murder, shooting, shooting, shooting, shooting, crash, crash, fire, murder, shooting]",shooting
3,2,3,15,5.0,0,"[Maryland fire 911, Midair plane crash – Calif., iHop shooting]","[fire, crash, shooting]","[crash, fire, shooting]"
7,6,14,42,3.0,0,"[Hostage-murder, Plane crash, Workplace shooting, Fatal fire, Supermarket shootings, Mall shooting, Mall shooting, Mall shooting, Mall shooting, Small plane crash, Small plane crash, Domestic shooting, Multiple murders, Fatal fire]","[murder, crash, shooting, fire, shooting, shooting, shooting, shooting, shooting, crash, crash, shooting, murder, fire]",shooting
9,10,4,28,7.0,0,"[Jet plane crash, Multiple murders, Fatal fire, Fatal fire]","[crash, murder, fire, fire]",fire
