In [1]:
import pandas as pd
import sqlite3

In [2]:
# Lift pandas' row, column and width display limits (None = no limit /
# auto-detect) so query results below are rendered without truncation.
for _option in ("display.max_rows", "display.max_columns", "display.width"):
    pd.set_option(_option, None)

In [3]:
# Open the SWITRS SQLite file through the documented `sqlite3.connect` factory.
# The URI form with mode=rw opens an *existing* database only: if the file is
# missing or the working directory is wrong, this raises immediately instead of
# silently creating an empty database that later fails with "no such table".
con = sqlite3.connect("file:switrs.sqlite?mode=rw", uri=True)

### How many collisions are there in the dataset?

In [4]:
# Counts every row of the `collisions` table and returns it as a single
# column aliased `collision_count`.
QUERY1 = """
SELECT COUNT(1) AS collision_count
FROM collisions
"""

In [5]:
# Run the row-count query on the open connection and display the result.
pd.read_sql(sql=QUERY1, con=con).head()

Unnamed: 0,collision_count
0,9172565


### What percent of collisions involve males aged 16-25?

In [6]:
# Share of cases that have at least one party recorded as male and aged 16-25
# (BETWEEN is inclusive on both ends).
#   numerator:   distinct case_id values among the filtered party rows
#   denominator: distinct case_id values across all party rows, cast to FLOAT
#                so the division is not truncated to an integer
# The result is a fraction in [0, 1]; it is NOT multiplied by 100 even though
# the column is aliased `percentage`.
# NOTE(review): the denominator is taken from `parties`, not `collisions`, so
# any collision without a party row is left out of the base -- confirm intended.
QUERY2 = """
SELECT 
    COUNT(DISTINCT case_id) 
    / (SELECT CAST(COUNT(DISTINCT case_id) AS FLOAT) FROM parties)
    AS percentage
FROM parties
WHERE party_sex = 'male'
AND party_age BETWEEN 16 AND 25
"""

In [7]:
# Run the young-male share query on the open connection and display the result.
pd.read_sql(sql=QUERY2, con=con)

Unnamed: 0,percentage
0,0.241562


### How many solo motorcycle crashes are there per year?

In [8]:
# Per-year count of rows in `collisions` that are flagged as motorcycle
# collisions and have exactly one party (party_count = 1).
# STRFTIME('%Y', collision_date) extracts the four-digit year as text; rows are
# grouped and sorted by that year string.
# NOTE(review): the bare `True` keyword is only recognised by SQLite 3.23.0 and
# newer -- confirm the runtime's SQLite version, or compare against 1 instead.
QUERY3 = """
SELECT
  STRFTIME('%Y', collision_date) AS collision_year,
  COUNT(1) AS collision_count
FROM collisions
WHERE motorcycle_collision = True
  AND party_count = 1
GROUP BY collision_year
ORDER BY collision_year
"""

In [9]:
# Run the solo-motorcycle-per-year query on the open connection and display it.
pd.read_sql(sql=QUERY3, con=con)

Unnamed: 0,collision_year,collision_count
0,2001,3258
1,2002,3393
2,2003,3822
3,2004,3955
4,2005,3755
5,2006,3967
6,2007,4513
7,2008,4948
8,2009,4266
9,2010,3902


### What make of vehicle has the largest fraction of accidents on the weekend? During the work week?

Only consider vehicle makes with at least 10,000 collisions.

In [10]:
# Finds the vehicle make with the largest weekend share of collisions and the
# make with the largest weekday share, among makes with at least 10,000 joined
# party rows.
#
# * `counter` tallies, per make, party rows whose collision fell on a weekend
#   (STRFTIME('%w') is '0' for Sunday and '6' for Saturday) versus any other
#   day. Rows with a NULL vehicle_make are excluded so that an "unknown make"
#   bucket can never be reported as the winner.
# * `fractions` computes both shares once, so the two picks below cannot drift
#   apart.
# * Each pick breaks ties on `make`, and the combined result is explicitly
#   ordered (weekend winner first), because UNION alone guarantees no order.
#
# Returned columns: make, weekend_count, weekday_count, total,
# weekend_fraction, weekday_fraction. UNION collapses the two picks into one
# row if the same make wins both.
# NOTE(review): counts are per party row, so a collision involving two vehicles
# of the same make is counted twice -- confirm this is the intended unit.
QUERY4 = """
WITH counter AS (
  SELECT
    p.vehicle_make AS make,
    SUM(
      CASE WHEN STRFTIME('%w', c.collision_date) IN ('0', '6') THEN 1 ELSE 0 END
    ) AS weekend_count,
    SUM(
      CASE WHEN STRFTIME('%w', c.collision_date) IN ('0', '6') THEN 0 ELSE 1 END
    ) AS weekday_count,
    COUNT(1) AS total
  FROM collisions AS c
  INNER JOIN parties AS p
    ON c.case_id = p.case_id
  WHERE p.vehicle_make IS NOT NULL
  GROUP BY p.vehicle_make
  HAVING COUNT(1) >= 10000
),

fractions AS (
  SELECT
    *,
    weekend_count / CAST(total AS FLOAT) AS weekend_fraction,
    weekday_count / CAST(total AS FLOAT) AS weekday_fraction
  FROM counter
)

SELECT * FROM (
  SELECT *
  FROM fractions
  ORDER BY weekend_fraction DESC, make
  LIMIT 1
)

UNION

SELECT * FROM (
  SELECT *
  FROM fractions
  ORDER BY weekday_fraction DESC, make
  LIMIT 1
)

ORDER BY weekend_fraction DESC, make
"""

In [11]:
# Run the weekend/weekday make query on the open connection and display it.
pd.read_sql(sql=QUERY4, con=con).head()

Unnamed: 0,make,weekend_count,weekday_count,total,weekend_fraction,weekday_fraction
0,HARLEY-DAVIDSON,19125,30477,49602,0.385569,0.614431
1,PETERBILT,6477,64102,70579,0.09177,0.90823


### How many different values represent "Toyota" in the Parties database? How would you go about correcting for this?

In [12]:
# Lists every distinct `vehicle_make` spelling in `parties` that looks like
# "Toyota", with how often each occurs, most frequent first.
# Candidates are values starting with "toy" or "ty" (case-insensitive via
# LOWER); the exact value "toyota" is already covered by the 'toy%' prefix, so
# no separate equality test is needed. Ties on the count are broken
# alphabetically so the listing is reproducible between runs.
# NOTE(review): the 'ty%' prefix is a heuristic and may also match unrelated
# makes -- inspect the output before treating every row as a Toyota variant.
QUERY5 = """
SELECT
  vehicle_make,
  COUNT(1) AS number_seen
FROM parties
WHERE LOWER(vehicle_make) LIKE 'toy%'
  OR LOWER(vehicle_make) LIKE 'ty%'
GROUP BY vehicle_make
ORDER BY number_seen DESC, vehicle_make
"""

In [13]:
# Run the Toyota-spelling query and display at most the first 100 variants.
pd.read_sql(sql=QUERY5, con=con).head(n=100)

Unnamed: 0,vehicle_make,number_seen
0,TOYOTA,2374621
1,TOYO,166209
2,TOYT,146746
3,TOYOT,2823
4,TOY,2262
5,TOYTA,246
6,TOYOTA/,181
7,TOYTO,84
8,TOYTOA,71
9,TOYOYA,66
