## Creating new features for nctraffic

In [1]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd

Load original database

In [2]:
dbname = 'traffic_stops_nc'
username = 'along528'
pswd = 'password'
con = None
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

And create new database for adding features

In [3]:
dbname = 'traffic_police_combined'
username = 'along528'
pswd = 'password'
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print engine.url

postgresql://along528:password@localhost/traffic_police_combined


In [4]:
## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))

True


# Tools for building features
We want features that are grouped by matched agency and year. Where each feature is split by race. And we include a total.

In [5]:
races = ['B','W','A','I','U']
race_descriptive = ['black','white','asian','native_american','other']
import pickle
agencyid_map_police_to_traffic = pickle.load( open( "../nctraffic_police_combined/agencyid_map_police_to_traffic.p", "rb" ) )
agencyid_map_traffic_to_police = pickle.load( open( "../nctraffic_police_combined/agencyid_map_traffic_to_police.p", "rb" ) )
def build_features_by_race(query,tag,max_agencies=-1):
    n_agencies = 0
    frame = pd.DataFrame()
    frames = []
    for agency_id in agencyid_map_traffic_to_police:
        counts_by_race={}
        for race,desc in zip(races,race_descriptive):
            query_tmp = query.replace("<AGENCY_ID>",str(agency_id))
            query_tmp = query_tmp.replace("<RACE>",race)
            counts_by_race[tag+"_"+desc]=pd.read_sql_query(query_tmp,con).set_index('year')['count']
        frame_agency = pd.DataFrame(counts_by_race)
        frame_agency['agency_id'] = agency_id
        frames.append(frame_agency)
        n_agencies+=1
        if max_agencies>0 and n_agencies>=max_agencies: break
    frame = pd.concat(frames)
    frame = frame.fillna(0)
    frame[tag+'_total'] = frame.sum(axis=1)
    frame = frame.reset_index()
    return frame

# Stops by race
First add one feature for the number of stops by race per (matched) agency per year

In [6]:
tag = "stops"
sql_query = """
            SELECT count(person_id),
               extract(YEAR FROM s.date) AS year
            FROM nc_person p
            JOIN nc_stop s ON p.stop_id = s.stop_id
            WHERE p.type='D'
              AND s.agency_id = <AGENCY_ID>
              AND p.race = '<RACE>'
            GROUP BY p.race,
                     year
            ORDER BY year ASC, p.race DESC;
            """
frame = build_features_by_race(sql_query,tag)
frame.to_sql(tag+'_by_race', engine, if_exists='replace')
frame

Unnamed: 0,year,stops_asian,stops_black,stops_native_american,stops_other,stops_white,agency_id,stops_total
0,2002,1,367,1,51,532,2,954
1,2003,3,155,1,67,423,2,651
2,2004,15,508,5,226,969,2,1725
3,2005,1,147,0,28,300,2,478
4,2006,2,75,1,33,257,2,370
5,2007,3,113,0,28,280,2,426
6,2008,4,182,0,11,501,2,700
7,2009,16,861,0,12,1870,2,2761
8,2010,17,1311,23,11,2988,2,4352
9,2011,15,1076,12,8,2585,2,3698


# Searches by race
First add one feature for the number of searches by race per (matched) agency per year

In [7]:
tag = "searches"
#if it appears in the searches database then it is a search
sql_query = """
SELECT count(se.person_id),
       extract(YEAR FROM s.date) AS year
FROM  nc_person p
JOIN nc_stop s ON p.stop_id = s.stop_id
JOIN nc_search se ON s.stop_id = se.stop_id
WHERE p.type='D'
  AND s.agency_id = <AGENCY_ID>
  AND p.race = '<RACE>'
GROUP BY p.race,
         year
ORDER BY year ASC,
         p.race DESC;
"""
frame = build_features_by_race(sql_query,tag)
frame.to_sql(tag+'_by_race', engine, if_exists='replace')
frame

Unnamed: 0,year,searches_asian,searches_black,searches_native_american,searches_other,searches_white,agency_id,searches_total
0,2002,0,95,1,17,154,2,269
1,2003,0,23,1,14,87,2,127
2,2004,1,81,0,66,155,2,305
3,2005,0,24,0,7,43,2,76
4,2006,0,3,0,2,11,2,18
5,2007,0,9,0,6,19,2,36
6,2008,1,12,0,2,45,2,62
7,2009,2,139,0,4,300,2,447
8,2010,0,209,5,1,398,2,615
9,2011,1,167,0,0,316,2,486


# Use of force by race
First add one feature for the number of uses of force by race per (matched) agency per year

In [8]:
tag = "force"
#if it appears in the searches database then it is a search
sql_query = """
SELECT count(se.person_id),
       extract(YEAR FROM s.date) AS year
FROM  nc_person p
JOIN nc_stop s ON p.stop_id = s.stop_id
JOIN nc_search se ON s.stop_id = se.stop_id
WHERE p.type='D'
  AND s.agency_id = <AGENCY_ID>
  AND p.race = '<RACE>'
  AND s.engage_force = 't'
GROUP BY p.race,
         year
ORDER BY year ASC,
         p.race DESC;
"""
frame = build_features_by_race(sql_query,tag)
frame.to_sql(tag+'_by_race', engine, if_exists='replace')
frame

Unnamed: 0,year,force_asian,force_black,force_native_american,force_other,force_white,agency_id,force_total
0,2002,0,3,0,0,1,2,6
1,2003,0,1,0,0,1,2,4
2,2004,0,3,0,0,2,2,7
3,2007,0,0,0,0,1,2,3
4,2008,0,1,0,0,1,2,4
5,2009,0,1,0,0,4,2,7
6,2010,0,3,0,0,1,2,6
7,2011,0,1,0,0,2,2,5
8,2012,0,4,0,0,1,2,7
9,2013,0,3,0,0,1,2,6


# Contraband hits by race
First add one feature for the number of hits by race per (matched) agency per year

In [9]:
tag = "hits"
#if it appears in the contraband database then it is a search
sql_query = """
SELECT count(c.person_id),
       extract(YEAR FROM s.date) AS year
FROM  nc_person p
JOIN nc_stop s ON p.stop_id = s.stop_id
JOIN nc_search se ON s.stop_id = se.stop_id
JOIN nc_contraband c ON se.search_id = c.search_id
WHERE p.type='D'
  AND s.agency_id = <AGENCY_ID>
  AND p.race = '<RACE>'
GROUP BY p.race,
         year
ORDER BY year ASC,
         p.race DESC;
"""
frame = build_features_by_race(sql_query,tag)
frame.to_sql(tag+'_by_race', engine, if_exists='replace')
frame

Unnamed: 0,year,hits_asian,hits_black,hits_native_american,hits_other,hits_white,agency_id,hits_total
0,2002,0,18,1,3,25,2,49
1,2003,0,4,0,2,15,2,23
2,2004,0,31,0,7,38,2,78
3,2005,0,7,0,1,11,2,21
4,2006,0,0,0,1,4,2,7
5,2007,0,1,0,0,5,2,8
6,2008,0,2,0,0,11,2,15
7,2009,1,43,0,1,77,2,124
8,2010,0,76,0,0,129,2,207
9,2011,1,50,0,0,92,2,145
