## Creating new features for nctraffic

In [3]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd

Load original traffic database

In [2]:
dbname = 'traffic_stops_nc'
username = 'along528'
pswd = 'password'
con = None
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

And Load police dataset

In [None]:
dbname = 'police_bjs'
username = 'along528'
pswd = 'password'
police_con = None
police_con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

And create new database for adding features

In [77]:
dbname = 'traffic_police_combined'
username = 'along528'
pswd = 'password'
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print engine.url

postgresql://along528:password@localhost/traffic_police_combined


In [78]:
## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))

True


# Tools for building features
We want features that are grouped by matched agency and year. Where each feature is split by race. And we include a total.

In [44]:
races = ['B','W','A','I','U']
race_descriptive = ['black','white','asian','native_american','other']
import pickle
agencyid_map_police_to_traffic = pickle.load( open( "../nctraffic_police_combined/agencyid_map_police_to_traffic.p", "rb" ) )
agencyid_map_traffic_to_police = pickle.load( open( "../nctraffic_police_combined/agencyid_map_traffic_to_police.p", "rb" ) )
def build_features_by_race(query,tag,max_agencies=-1):
    n_agencies = 0
    frame = pd.DataFrame()
    frames = []
    for agency_id in agencyid_map_traffic_to_police:
        counts_by_race={}
        for race,desc in zip(races,race_descriptive):
            query_tmp = query.replace("<AGENCY_ID>",str(agency_id))
            query_tmp = query_tmp.replace("<RACE>",race)
            counts_by_race[tag+"_"+desc]=pd.read_sql_query(query_tmp,con).set_index('year')['count']
        frame_agency = pd.DataFrame(counts_by_race)
        frame_agency['agency_id'] = agency_id
        frames.append(frame_agency)
        n_agencies+=1
        if max_agencies>0 and n_agencies>=max_agencies: break
    frame = pd.concat(frames)
    frame = frame.fillna(0)
    frame[tag+'_total'] = frame.sum(axis=1)
    frame = frame.reset_index()
    return frame

# Stops by race
First add one feature for the number of stops by race per (matched) agency per year

In [6]:
tag = "stops"
sql_query = """
            SELECT count(person_id),
               extract(YEAR FROM s.date) AS year
            FROM nc_person p
            JOIN nc_stop s ON p.stop_id = s.stop_id
            WHERE p.type='D'
              AND s.agency_id = <AGENCY_ID>
              AND p.race = '<RACE>'
            GROUP BY p.race,
                     year
            ORDER BY year ASC, p.race DESC;
            """
frame = build_features_by_race(sql_query,tag)
frame.to_sql(tag+'_by_race', engine, if_exists='replace')
frame

Unnamed: 0,year,stops_asian,stops_black,stops_native_american,stops_other,stops_white,agency_id,stops_total
0,2002,1,367,1,51,532,2,954
1,2003,3,155,1,67,423,2,651
2,2004,15,508,5,226,969,2,1725
3,2005,1,147,0,28,300,2,478
4,2006,2,75,1,33,257,2,370
5,2007,3,113,0,28,280,2,426
6,2008,4,182,0,11,501,2,700
7,2009,16,861,0,12,1870,2,2761
8,2010,17,1311,23,11,2988,2,4352
9,2011,15,1076,12,8,2585,2,3698


# Searches by race
First add one feature for the number of searches by race per (matched) agency per year

In [7]:
tag = "searches"
#if it appears in the searches database then it is a search
sql_query = """
SELECT count(se.person_id),
       extract(YEAR FROM s.date) AS year
FROM  nc_person p
JOIN nc_stop s ON p.stop_id = s.stop_id
JOIN nc_search se ON s.stop_id = se.stop_id
WHERE p.type='D'
  AND s.agency_id = <AGENCY_ID>
  AND p.race = '<RACE>'
GROUP BY p.race,
         year
ORDER BY year ASC,
         p.race DESC;
"""
frame = build_features_by_race(sql_query,tag)
frame.to_sql(tag+'_by_race', engine, if_exists='replace')
frame

Unnamed: 0,year,searches_asian,searches_black,searches_native_american,searches_other,searches_white,agency_id,searches_total
0,2002,0,95,1,17,154,2,269
1,2003,0,23,1,14,87,2,127
2,2004,1,81,0,66,155,2,305
3,2005,0,24,0,7,43,2,76
4,2006,0,3,0,2,11,2,18
5,2007,0,9,0,6,19,2,36
6,2008,1,12,0,2,45,2,62
7,2009,2,139,0,4,300,2,447
8,2010,0,209,5,1,398,2,615
9,2011,1,167,0,0,316,2,486


# Use of force by race
First add one feature for the number of uses of force by race per (matched) agency per year

In [8]:
tag = "force"
#if it appears in the searches database then it is a search
sql_query = """
SELECT count(se.person_id),
       extract(YEAR FROM s.date) AS year
FROM  nc_person p
JOIN nc_stop s ON p.stop_id = s.stop_id
JOIN nc_search se ON s.stop_id = se.stop_id
WHERE p.type='D'
  AND s.agency_id = <AGENCY_ID>
  AND p.race = '<RACE>'
  AND s.engage_force = 't'
GROUP BY p.race,
         year
ORDER BY year ASC,
         p.race DESC;
"""
frame = build_features_by_race(sql_query,tag)
frame.to_sql(tag+'_by_race', engine, if_exists='replace')
frame

Unnamed: 0,year,force_asian,force_black,force_native_american,force_other,force_white,agency_id,force_total
0,2002,0,3,0,0,1,2,6
1,2003,0,1,0,0,1,2,4
2,2004,0,3,0,0,2,2,7
3,2007,0,0,0,0,1,2,3
4,2008,0,1,0,0,1,2,4
5,2009,0,1,0,0,4,2,7
6,2010,0,3,0,0,1,2,6
7,2011,0,1,0,0,2,2,5
8,2012,0,4,0,0,1,2,7
9,2013,0,3,0,0,1,2,6


# Contraband hits by race
First add one feature for the number of hits by race per (matched) agency per year

In [9]:
tag = "hits"
#if it appears in the contraband database then it is a search
sql_query = """
SELECT count(c.person_id),
       extract(YEAR FROM s.date) AS year
FROM  nc_person p
JOIN nc_stop s ON p.stop_id = s.stop_id
JOIN nc_search se ON s.stop_id = se.stop_id
JOIN nc_contraband c ON se.search_id = c.search_id
WHERE p.type='D'
  AND s.agency_id = <AGENCY_ID>
  AND p.race = '<RACE>'
GROUP BY p.race,
         year
ORDER BY year ASC,
         p.race DESC;
"""
frame = build_features_by_race(sql_query,tag)
frame.to_sql(tag+'_by_race', engine, if_exists='replace')
frame

Unnamed: 0,year,hits_asian,hits_black,hits_native_american,hits_other,hits_white,agency_id,hits_total
0,2002,0,18,1,3,25,2,49
1,2003,0,4,0,2,15,2,23
2,2004,0,31,0,7,38,2,78
3,2005,0,7,0,1,11,2,21
4,2006,0,0,0,1,4,2,7
5,2007,0,1,0,0,5,2,8
6,2008,0,2,0,0,11,2,15
7,2009,1,43,0,1,77,2,124
8,2010,0,76,0,0,129,2,207
9,2011,1,50,0,0,92,2,145


# Load in tables after having created them

In [4]:
dbname = 'traffic_police_combined'
username = 'along528'
pswd = 'password'
con = None
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

In [8]:
frames_new = {}
for tag in ["stops","hits","searches","force"]: 
    sql_query = "SELECT * FROM %s_by_race;" % (tag)
    frames_new[tag] = pd.read_sql_query(sql_query,con)

# Build ratios of features

Note that hits is a low stat number

In [39]:
#if it appears in the searches database then it is a search
sql_query = """
SELECT s.agency_id,s.year,
       h.hits_black/nullif(s.stops_black,0) AS "hits_over_stops_black",
       h.hits_white/nullif(s.stops_white,0) AS "hits_over_stops_white",
       h.hits_asian/nullif(s.stops_asian,0) AS "hits_over_stops_asian",
       h.hits_native_american/nullif(s.stops_native_american,0) AS "hits_over_stops_native_american",
       h.hits_other/nullif(s.stops_other,0) AS "hits_over_stops_other",
       h.hits_total/nullif(s.stops_total,0) AS "hits_over_stops_total"
FROM stops_by_race s 
JOIN hits_by_race h ON s.agency_id = h.agency_id AND s.year = h.year;
"""
hits_over_stops = pd.read_sql_query(sql_query,con)
hits_over_stops = hits_over_stops.fillna(0)
hits_over_stops['hits_over_stops_black_over_white'] = \
                            hits_over_stops['hits_over_stops_black'].\
                            divide(hits_over_stops['hits_over_stops_white'],axis='index')
hits_over_stops.to_sql('hits_over_stops', engine, if_exists='replace')
hits_over_stops

Unnamed: 0,agency_id,year,hits_over_stops_black,hits_over_stops_white,hits_over_stops_asian,hits_over_stops_native_american,hits_over_stops_other,hits_over_stops_total,hits_over_stops_black_over_white
0,2,2002,0.049046,0.046992,0.000000,1.000000,0.058824,0.051363,1.043706
1,2,2003,0.025806,0.035461,0.000000,0.000000,0.029851,0.035330,0.727742
2,2,2004,0.061024,0.039216,0.000000,0.000000,0.030973,0.045217,1.556102
3,2,2005,0.047619,0.036667,0.000000,0.000000,0.035714,0.043933,1.298701
4,2,2006,0.000000,0.015564,0.000000,0.000000,0.030303,0.018919,0.000000
5,2,2007,0.008850,0.017857,0.000000,0.000000,0.000000,0.018779,0.495575
6,2,2008,0.010989,0.021956,0.000000,0.000000,0.000000,0.021429,0.500500
7,2,2009,0.049942,0.041176,0.062500,0.000000,0.083333,0.044911,1.212875
8,2,2010,0.057971,0.043173,0.000000,0.000000,0.000000,0.047564,1.342770
9,2,2011,0.046468,0.035590,0.066667,0.000000,0.000000,0.039210,1.305661


In [38]:
#if it appears in the searches database then it is a search
sql_query = """
SELECT s.agency_id,s.year,
       h.hits_black/nullif(s.searches_black,0) AS "hits_over_searches_black",
       h.hits_white/nullif(s.searches_white,0) AS "hits_over_searches_white",
       h.hits_asian/nullif(s.searches_asian,0) AS "hits_over_searches_asian",
       h.hits_native_american/nullif(s.searches_native_american,0) AS "hits_over_searches_native_american",
       h.hits_other/nullif(s.searches_other,0) AS "hits_over_searches_other",
       h.hits_total/nullif(s.searches_total,0) AS "hits_over_searches_total"
FROM searches_by_race s 
JOIN hits_by_race h ON s.agency_id = h.agency_id AND s.year = h.year;
"""
hits_over_searches = pd.read_sql_query(sql_query,con)
hits_over_searches = hits_over_searches.fillna(0)
hits_over_searches['hits_over_searches_black_over_white'] = \
                            hits_over_searches['hits_over_searches_black'].\
                            divide(hits_over_searches['hits_over_searches_white'],axis='index')
hits_over_searches.to_sql('hits_over_searches', engine, if_exists='replace')
hits_over_searches


Unnamed: 0,agency_id,year,hits_over_searches_black,hits_over_searches_white,hits_over_searches_asian,hits_over_searches_native_american,hits_over_searches_other,hits_over_searches_total,hits_over_searches_black_over_white
0,2,2002,0.189474,0.162338,0.000000,1.000000,0.176471,0.182156,1.167158
1,2,2003,0.173913,0.172414,0.000000,0.000000,0.142857,0.181102,1.008696
2,2,2004,0.382716,0.245161,0.000000,0.000000,0.106061,0.255738,1.561079
3,2,2005,0.291667,0.255814,0.000000,0.000000,0.142857,0.276316,1.140152
4,2,2006,0.000000,0.363636,0.000000,0.000000,0.500000,0.388889,0.000000
5,2,2007,0.111111,0.263158,0.000000,0.000000,0.000000,0.222222,0.422222
6,2,2008,0.166667,0.244444,0.000000,0.000000,0.000000,0.241935,0.681818
7,2,2009,0.309353,0.256667,0.500000,0.000000,0.250000,0.277405,1.205270
8,2,2010,0.363636,0.324121,0.000000,0.000000,0.000000,0.336585,1.121917
9,2,2011,0.299401,0.291139,1.000000,0.000000,0.000000,0.298354,1.028378


In [40]:

#if it appears in the searches database then it is a search
sql_query = """
SELECT se.agency_id,se.year,
       se.searches_black / nullif(s.stops_black,0) AS "searches_over_stops_black",
       se.searches_white / nullif(s.stops_white,0) AS "searches_over_stops_white",
       se.searches_asian / nullif(s.stops_asian,0) AS "searches_over_stops_asian",
       se.searches_native_american / nullif(s.stops_native_american,0) AS "searches_over_stops_native_american",
       se.searches_other / nullif(s.stops_other,0) AS "searches_over_stops_other",
       se.searches_total / nullif(s.stops_total,0) AS "searches_over_stops_total"
FROM searches_by_race se
JOIN stops_by_race s ON se.agency_id = s.agency_id AND se.year = s.year;
"""
searches_over_stops = pd.read_sql_query(sql_query,con)
searches_over_stops = searches_over_stops.fillna(0)
searches_over_stops['searches_over_stops_black_over_white'] = \
                            searches_over_stops['searches_over_stops_black'].\
                            divide(searches_over_stops['searches_over_stops_white'],axis='index')
searches_over_stops.to_sql('searches_over_stops', engine, if_exists='replace')
searches_over_stops


Unnamed: 0,agency_id,year,searches_over_stops_black,searches_over_stops_white,searches_over_stops_asian,searches_over_stops_native_american,searches_over_stops_other,searches_over_stops_total,searches_over_stops_black_over_white
0,2,2002,0.258856,0.289474,0.000000,1.000000,0.333333,0.281971,0.894228
1,2,2003,0.148387,0.205674,0.000000,1.000000,0.208955,0.195084,0.721468
2,2,2004,0.159449,0.159959,0.066667,0.000000,0.292035,0.176812,0.996812
3,2,2005,0.163265,0.143333,0.000000,0.000000,0.250000,0.158996,1.139060
4,2,2006,0.040000,0.042802,0.000000,0.000000,0.060606,0.048649,0.934545
5,2,2007,0.079646,0.067857,0.000000,0.000000,0.214286,0.084507,1.173731
6,2,2008,0.065934,0.089820,0.250000,0.000000,0.181818,0.088571,0.734066
7,2,2009,0.161440,0.160428,0.125000,0.000000,0.333333,0.161898,1.006310
8,2,2010,0.159420,0.133199,0.000000,0.217391,0.090909,0.141314,1.196854
9,2,2011,0.155204,0.122244,0.066667,0.000000,0.000000,0.131422,1.269631


In [35]:
searches_over_stops[searches_over_stops['agency_id']==2]

Unnamed: 0,agency_id,year,searches_over_stops_black,searches_over_stops_white,searches_over_stops_asian,searches_over_stops_native_american,searches_over_stops_other,searches_over_stops_total,searches_over_stops_black_over_white
0,2,2002,0.258856,0.289474,0.0,1.0,0.333333,0.281971,0.894228
1,2,2003,0.148387,0.205674,0.0,1.0,0.208955,0.195084,0.721468
2,2,2004,0.159449,0.159959,0.066667,0.0,0.292035,0.176812,0.996812
3,2,2005,0.163265,0.143333,0.0,0.0,0.25,0.158996,1.13906
4,2,2006,0.04,0.042802,0.0,0.0,0.060606,0.048649,0.934545
5,2,2007,0.079646,0.067857,0.0,0.0,0.214286,0.084507,1.173731
6,2,2008,0.065934,0.08982,0.25,0.0,0.181818,0.088571,0.734066
7,2,2009,0.16144,0.160428,0.125,0.0,0.333333,0.161898,1.00631
8,2,2010,0.15942,0.133199,0.0,0.217391,0.090909,0.141314,1.196854
9,2,2011,0.155204,0.122244,0.066667,0.0,0.0,0.131422,1.269631


# New Police BJS dataset

In [79]:
sql_query='''
SELECT *
FROM police_data_table p
WHERE p.state='NC';
'''       
nc_police = pd.read_sql_query(sql_query,police_con)
#only keep those that overlap with the previous
nc_police = nc_police[nc_police['SURVEYID'].isin(agencyid_map_police_to_traffic.keys())]
#use the index from the traffic dataset for the training, but keep the SURVEYID
nc_police['agency_id'] = nc_police['SURVEYID'].map(agencyid_map_police_to_traffic)
nc_police.to_sql('nc_police_survey', engine, if_exists='replace')
nc_police

Unnamed: 0,SURVEYID,formtype,resptype,agcytype,agency,city,state,zipcode,swnauthemp,swnftemp,...,imphumntask,impterrtask,impcell,ori,csllea04_id,population,lpdsampgrp,finalwt_page1,finalwt_page2on,agency_id
0,592,L,full,5,NORTHCAROLINASTATEHWYPATROL,Raleigh,NC,27611,1823,1718,...,,,,,,,,,,191
1,570,L,full,3,CHARLOTTE-MECKLENBURGPOLICEDEPT,Charlotte,NC,28202,1628,1481,...,,,,,,,,,,51
2,589,L,full,3,JACKSONVILLEPOLICEDEPARTMENT,Jacksonville,NC,28541,111,107,...,,,,,,,,,,130
3,2127,S,full,3,HOLLYSPRINGSPOLICEDEPARTMENT,HollySprings,NC,27540,36,36,...,,,,,,,,,,123
4,571,L,full,3,CONCORDPOLICEDEPARTMENT,Concord,NC,28026,150,148,...,,,,,,,,,,63
5,583,L,full,3,GREENVILLEPOLICEDEPARTMENT,Greenville,NC,27835,170,170,...,,,,,,,,,,105
6,2165,S,full,3,SURFCITYPOLICEDEPARTMENT,SurfCity,NC,28445,17,17,...,,,,,,,,,,265
7,581,L,full,3,GOLDSBOROPOLICEDEPARTMENT,Goldsboro,NC,27533,112,107,...,,,,,,,,,,100
9,2141,S,full,3,MINTHILLPOLICEDEPARTMENT,MintHill,NC,28227,28,28,...,,,,,,,,,,169
10,2154,S,full,3,PILOTMOUNTAINPOLICEDEPT,PilotMountain,NC,27041,8,8,...,,,,,,,,,,216


# Join column