# Combine traffic data in similar format

In [4]:
from glob import glob
import sets
import datetime
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
import numpy as np
from collections import defaultdict

In [5]:
dbname = 'combined_profiling'
username = 'along528'
pswd = 'password'
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print engine.url

postgresql://along528:password@localhost/combined_profiling


In [6]:
dbname = 'combined_profiling'
username = 'along528'
pswd = 'password'
con_comb = None
con_comb = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

## Illinois traffic data

In [7]:
dbname = 'traffic_stops_il'
username = 'along528'
pswd = 'password'
con_il = None
con_il = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

Even though we have data from 2004 - 2014, the Chicago PD reported 'Yes' for contrabandfound for every stop from 2004-2006. Since this is suspect let's just start from 2007.

In [8]:

il_year_start = 2007 
il_year_stop = 2014
df_race_map = defaultdict(list)
for race in ['black','white']:
    for year in range(il_year_start,il_year_stop+1):
        query = """
        SELECT agencycode,agencyname,count(agencycode) AS stops_<RACE>,
        SUM(CASE WHEN searchconducted = 'Yes' THEN 1 ELSE 0 END) as searches_<RACE>,
        SUM(CASE WHEN contrabandfound = 'Yes' THEN 1 ELSE 0 END) as hits_<RACE>,
        year 
        FROM traffic_stops_<YEAR>
        WHERE race = '<RACE>'
        GROUP BY agencycode,agencyname,year
        """
        query = query.replace('<RACE>',race)
        query = query.replace('<YEAR>',str(year))
        df_race_map[race].append(pd.read_sql(query,con_il))


In [9]:
for year in range(il_year_start,il_year_stop+1):
        query = """
        SELECT agencycode,agencyname,count(agencycode) AS stops_total,
        SUM(CASE WHEN searchconducted = 'Yes' THEN 1 ELSE 0 END) as searches_total,
        SUM(CASE WHEN contrabandfound = 'Yes' THEN 1 ELSE 0 END) as hits_total,
        year 
        FROM traffic_stops_<YEAR>
        GROUP BY agencycode,agencyname,year
        """
        query = query.replace('<YEAR>',str(year))
        df_race_map['total'].append(pd.read_sql(query,con_il))

In [10]:
dfs={}
for key in df_race_map:
    dfs[key] = pd.concat(df_race_map[key])

In [11]:
for key in dfs:
    print key,len(dfs[key])

white 6919
total 6934
black 5972


In [12]:
dfs['total']

Unnamed: 0,agencycode,agencyname,stops_total,searches_total,hits_total,year
0,10011,Dolton Police Department,681,6,3,2007
1,12987,South Suburban College Police,147,0,0,2007
2,12988,South Holland Police,4633,421,63,2007
3,12989,South Chicago Heights Police,793,15,2,2007
4,12990,Skokie Police,17638,883,101,2007
5,12991,Schiller Park Police,1817,47,7,2007
6,12992,Schaumburg Police,8784,58,4,2007
7,12993,Sauk Village Police,1686,149,43,2007
8,12994,Rosemont Police,810,104,10,2007
9,12995,Rolling Meadows Police,6359,588,75,2007


In [13]:
df_il = pd.DataFrame(dfs['total'])
df_il = df_il.merge(dfs['white'],how='left')
df_il = df_il.merge(dfs['black'],how='left')
df_il = df_il.fillna(0)
df_il.rename(columns=lambda x: x.replace('agencycode','agencyid'),inplace=True)
df_il

Unnamed: 0,agencyid,agencyname,stops_total,searches_total,hits_total,year,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black
0,10011,Dolton Police Department,681,6,3,2007,59,1,0,603,5,3
1,12987,South Suburban College Police,147,0,0,2007,24,0,0,111,0,0
2,12988,South Holland Police,4633,421,63,2007,527,22,5,3837,362,53
3,12989,South Chicago Heights Police,793,15,2,2007,227,6,1,458,4,1
4,12990,Skokie Police,17638,883,101,2007,11743,329,52,2086,216,27
5,12991,Schiller Park Police,1817,47,7,2007,1302,19,2,168,3,1
6,12992,Schaumburg Police,8784,58,4,2007,6490,32,2,707,14,1
7,12993,Sauk Village Police,1686,149,43,2007,525,40,15,999,92,25
8,12994,Rosemont Police,810,104,10,2007,496,43,5,90,13,2
9,12995,Rolling Meadows Police,6359,588,75,2007,4350,245,29,501,51,10


In [14]:
df_il.to_sql('il_traffic_stops_split_by_year',engine,if_exists='replace')

In [15]:
query = """
SELECT agencyid,
agencyname,
SUM(stops_total) as stops_total,
SUM(searches_total) as searches_total,
SUM(hits_total) as hits_total,
SUM(stops_white) as stops_white,
SUM(searches_white) as searches_white,
SUM(hits_white) as hits_white,
SUM(stops_black) as stops_black,
SUM(searches_black) as searches_black,
SUM(hits_black) as hits_black
FROM il_traffic_stops_split_by_year
GROUP BY agencyid,agencyname;
"""
df_il_integrate = pd.read_sql(query,con_comb)
df_il_integrate

Unnamed: 0,agencyid,agencyname,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black
0,13228,Malta Police,2734,13,2,1810,3,1,574,6,0
1,13689,Meredosia Police,1971,22,7,1876,21,7,52,0,0
2,13981,Cherry Valley Police,11735,590,312,8692,309,166,1430,113,43
3,13636,Toluca Police,100,1,0,92,1,0,3,0,0
4,13756,Rock Island Police,57484,5873,1040,34975,3082,538,19253,2484,468
5,13661,Mercer County Sheriff,6096,1456,100,5406,1276,86,270,79,7
6,13843,Cowden Police,590,21,3,581,21,3,5,0,0
7,13604,Wilsonville Police,371,17,10,367,17,10,1,0,0
8,14015,Sycamore Police,22802,64,21,19188,45,17,1695,8,2
9,13717,Monticello Police,6608,253,55,5983,174,41,420,50,12


In [16]:
df_il_integrate.to_sql('il_traffic_stops_integrate_year',engine,if_exists='replace')

Should I remove those departments where there are 0 in some field? Maybe not for hits. but for searches and stops, yes

## Connecticut

In [17]:
dbname = 'traffic_stops_ct'
username = 'along528'
pswd = 'password'
con_ct = None
con_ct = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

In [18]:
df_race_map_ct=defaultdict(list)
for race in ['white','black']:
    query = """
        SELECT agencyname,year,count(agencyid) AS stops_<RACE>,
        SUM(CASE WHEN searchconducted = 'Yes' THEN 1 ELSE 0 END) as searches_<RACE>,
        SUM(CASE WHEN contrabandfound = 'Yes' THEN 1 ELSE 0 END) as hits_<RACE>
        FROM stops
        WHERE race = '<RACE>'
        GROUP BY agencyname,year
        """
    query = query.replace('<RACE>',race)
    df_race_map_ct[race] = pd.read_sql(query,con_ct)

In [19]:
query = """
SELECT agencyname,year,count(agencyid) AS stops_total,
SUM(CASE WHEN searchconducted = 'Yes' THEN 1 ELSE 0 END) as searches_total,
SUM(CASE WHEN contrabandfound = 'Yes' THEN 1 ELSE 0 END) as hits_total
FROM stops
GROUP BY agencyname,year
"""
df_race_map_ct['total'] = pd.read_sql(query,con_ct)

In [20]:
df_race_map_ct['black']

Unnamed: 0,agencyname,year,stops_black,searches_black,hits_black
0,Plymouth,2015,44,3,0
1,Darien,2014,427,27,17
2,New Britain,2013,162,9,6
3,Weston,2015,11,0,0
4,Milford,2014,548,112,27
5,Groton Long Point,2015,3,0,0
6,Granby,2014,81,1,1
7,Portland,2014,11,1,2
8,Naugatuck,2015,150,11,4
9,Manchester,2014,958,53,30


In [21]:
df_ct = pd.DataFrame(df_race_map_ct['total'])
df_ct = df_ct.merge(df_race_map_ct['white'],how='left')
df_ct = df_ct.merge(df_race_map_ct['black'],how='left')
df_ct = df_ct.fillna(0)
df_ct = df_ct[df_ct['agencyname']!=0]
df_ct

Unnamed: 0,agencyname,year,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black
1,Plymouth,2015,845,24,6,794,21,6,44,3,0
2,New Britain,2013,1029,43,25,861,34,19,162,9,6
3,Weston,2015,143,1,1,131,1,1,11,0,0
4,Milford,2014,4358,422,142,3711,309,115,548,112,27
5,Granby,2014,1348,27,26,1258,26,25,81,1,1
6,Portland,2014,162,2,3,150,1,1,11,1,2
7,Naugatuck,2015,1555,96,36,1382,85,32,150,11,4
8,Putnam,2015,295,10,6,280,10,6,13,0,0
9,Derby,0,1,0,0,1,0,0,0,0,0
10,Berlin,2014,6925,290,60,6164,237,48,639,50,10


In [22]:
df_ct.to_sql('ct_traffic_stops_split_by_year_noagencyid',engine,if_exists='replace')

In [23]:
query = """
SELECT 
agencyname,
SUM(stops_total) as stops_total,
SUM(searches_total) as searches_total,
SUM(hits_total) as hits_total,
SUM(stops_white) as stops_white,
SUM(searches_white) as searches_white,
SUM(hits_white) as hits_white,
SUM(stops_black) as stops_black,
SUM(searches_black) as searches_black,
SUM(hits_black) as hits_black
FROM ct_traffic_stops_split_by_year_noagencyid
GROUP BY agencyname;
"""
df_ct_integrate = pd.read_sql(query,con_comb)
df_ct_integrate = df_ct_integrate.reset_index()
#build a unique agency id 
df_ct_integrate.rename(columns=lambda x: x.replace('index','agencyid'),inplace=True)
df_ct_integrate

Unnamed: 0,agencyid,agencyname,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black
0,0,Manchester,5474,203,125,3937,126,78,1357,76,43
1,1,Darien,4944,144,62,4242,110,44,564,33,18
2,2,Derby,5000,437,18,4228,346,17,709,89,1
3,3,Norwich,9817,625,184,7339,433,127,2008,184,56
4,4,Groton Town,8100,151,83,6843,115,64,1065,35,18
5,5,North Branford,1892,22,10,1781,20,10,93,2,0
6,6,Fairfield,8194,198,100,6986,152,82,1111,46,18
7,7,WCSU,49,1,0,40,1,0,6,0,0
8,8,New Milford,5493,104,56,5182,91,50,211,10,5
9,9,Middlebury,313,3,0,299,2,0,10,1,0


In [24]:
df_ct_integrate.to_sql('ct_traffic_stops_integrate_year',engine,if_exists='replace')

In [25]:
ct_id_map = df_ct_integrate[['agencyname','agencyid']].set_index('agencyname')
df_ct['agencyid'] = df_ct['agencyname'].map(ct_id_map['agencyid'])
df_ct

Unnamed: 0,agencyname,year,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black,agencyid
1,Plymouth,2015,845,24,6,794,21,6,44,3,0,24
2,New Britain,2013,1029,43,25,861,34,19,162,9,6,35
3,Weston,2015,143,1,1,131,1,1,11,0,0,23
4,Milford,2014,4358,422,142,3711,309,115,548,112,27,54
5,Granby,2014,1348,27,26,1258,26,25,81,1,1,55
6,Portland,2014,162,2,3,150,1,1,11,1,2,53
7,Naugatuck,2015,1555,96,36,1382,85,32,150,11,4,21
8,Putnam,2015,295,10,6,280,10,6,13,0,0,19
9,Derby,0,1,0,0,1,0,0,0,0,0,2
10,Berlin,2014,6925,290,60,6164,237,48,639,50,10,60


In [26]:
df_ct.to_sql('ct_traffic_stops_split_by_year',engine,if_exists='replace')

## North Carolina

In [27]:
#already pre-processed nc data
#note that these only include those already matched to the survey data
#unlike IL and CT which have not yet been matched
dbname = 'traffic_police_combined' 
username = 'along528'
pswd = 'password'
con_nc = None
con_nc = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

In [28]:
dbname = 'traffic_stops_nc'
username = 'along528'
pswd = 'password'
con_nc2 = None
con_nc2 = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

In [29]:
df_nc_agency = pd.read_sql('SELECT * FROM nc_agency',con_nc2)
df_nc_agency = df_nc_agency.set_index('id',drop=True)
df_nc_agency[df_nc_agency['name'].str.contains('A&T')]
df_nc_agency

Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
1,Aberdeen Police Department
2,Alamance County Sheriff's Office
3,Albemarle Police Department
4,Alexander County Sheriff's Office
5,Alleghany County Sheriff's Office
6,Andrews Police Department
7,Anson County Sheriff's Office
8,Apex Police Department
9,Appalachian State University Police Department
10,Archdale Police Department


In [30]:
frames_new = {}
frames_new_sumyears = {}
for tag in ["stops","hits","searches","force"]: 
    sql_query = "SELECT * FROM %s_by_race;" % (tag)
    frames_new[tag] = pd.read_sql_query(sql_query,con_nc)
    #have same column names for easily building ratios
    #frames_new[tag].rename(columns=lambda x: x.replace(tag+'_',''),inplace=True)
    frames_new[tag].rename(columns=lambda x: x.replace('agency_id','agencyid'),inplace=True)
    frames_new[tag]['agencyname'] =  frames_new[tag]['agencyid'].map(df_nc_agency['name'])

    #sum by year
    frames_new_sumyears[tag] = frames_new[tag].groupby(['agencyid']).sum().drop(['index','year'],1)
    frames_new_sumyears[tag]['agencyname'] =  frames_new[tag]['agencyid'].map(df_nc_agency['name'])
   

In [31]:
#frames_new['stops'][frames_new['stops']['agencyid']==105]
frames_new['searches'][frames_new['searches']['agencyid']==182]


Unnamed: 0,index,year,searches_asian,searches_black,searches_native_american,searches_other,searches_white,agencyid,searches_total,agencyname
759,759,2002,0,1,2,1,48,182,234,Murphy Police Department
760,760,2003,0,1,0,3,40,182,226,Murphy Police Department
761,761,2004,0,1,0,0,16,182,199,Murphy Police Department
762,762,2005,0,0,0,0,27,182,209,Murphy Police Department
763,763,2006,0,3,0,5,35,182,225,Murphy Police Department
764,764,2008,0,0,0,0,1,182,183,Murphy Police Department
765,765,2009,1,4,0,1,22,182,210,Murphy Police Department
766,766,2010,0,1,0,0,33,182,216,Murphy Police Department
767,767,2011,0,1,0,0,20,182,203,Murphy Police Department
768,768,2012,0,1,0,0,16,182,199,Murphy Police Department


In [32]:
tmpjoin_stops_searches = frames_new['stops'].merge(frames_new['searches'],how='right',on=['agencyid','year'])
tmpjoin_stops_searches_hits = tmpjoin_stops_searches.merge(frames_new['hits'],how='right',on=['agencyid','year'])
df_nc = tmpjoin_stops_searches_hits.drop(['agencyname_x','agencyname_y','index_x','index_y','index'],1)
df_nc = df_nc.fillna(0)
df_nc

Unnamed: 0,year,stops_asian,stops_black,stops_native_american,stops_other,stops_white,agencyid,stops_total,searches_asian,searches_black,...,searches_other,searches_white,searches_total,hits_asian,hits_black,hits_native_american,hits_other,hits_white,hits_total,agencyname
0,2002,1,367,1,51,532,2,954,0,95,...,17,154,269,0,18,1,3,25,49,Alamance County Sheriff's Office
1,2003,3,155,1,67,423,2,651,0,23,...,14,87,127,0,4,0,2,15,23,Alamance County Sheriff's Office
2,2004,15,508,5,226,969,2,1725,1,81,...,66,155,305,0,31,0,7,38,78,Alamance County Sheriff's Office
3,2005,1,147,0,28,300,2,478,0,24,...,7,43,76,0,7,0,1,11,21,Alamance County Sheriff's Office
4,2006,2,75,1,33,257,2,370,0,3,...,2,11,18,0,0,0,1,4,7,Alamance County Sheriff's Office
5,2007,3,113,0,28,280,2,426,0,9,...,6,19,36,0,1,0,0,5,8,Alamance County Sheriff's Office
6,2008,4,182,0,11,501,2,700,1,12,...,2,45,62,0,2,0,0,11,15,Alamance County Sheriff's Office
7,2009,16,861,0,12,1870,2,2761,2,139,...,4,300,447,1,43,0,1,77,124,Alamance County Sheriff's Office
8,2010,17,1311,23,11,2988,2,4352,0,209,...,1,398,615,0,76,0,0,129,207,Alamance County Sheriff's Office
9,2011,15,1076,12,8,2585,2,3698,1,167,...,0,316,486,1,50,0,0,92,145,Alamance County Sheriff's Office


In [33]:
df_nc.to_sql('nc_traffic_stops_split_by_year',engine,if_exists='replace')

In [34]:
query = """
SELECT agencyid,
agencyname,
SUM(stops_total) as stops_total,
SUM(searches_total) as searches_total,
SUM(hits_total) as hits_total,
SUM(stops_white) as stops_white,
SUM(searches_white) as searches_white,
SUM(hits_white) as hits_white,
SUM(stops_black) as stops_black,
SUM(searches_black) as searches_black,
SUM(hits_black) as hits_black
FROM nc_traffic_stops_split_by_year_test
GROUP BY agencyid,agencyname;
"""
df_nc_integrate = pd.read_sql(query,con_comb)
df_nc_integrate[df_nc_integrate['agencyid']==182]

Unnamed: 0,agencyid,agencyname,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black
81,182,Murphy Police Department,9931,2702,2514,7222,303,136,193,17,9


In [35]:
df_nc_integrate.to_sql('nc_traffic_stops_integrate_year',engine,if_exists='replace')

## Combine all three

In [36]:
df_il_integrate['agencyid'] = df_il_integrate['agencyid'].map(lambda x: "IL"+str(x))
df_il_integrate['state'] = 'IL'
df_nc_integrate['agencyid'] = df_nc_integrate['agencyid'].map(lambda x: "NC"+str(x))
df_nc_integrate['state'] = 'NC'
df_ct_integrate['agencyid'] = df_ct_integrate['agencyid'].map(lambda x: "CT"+str(x))
df_ct_integrate['state'] = 'CT'

In [37]:
df_comb = pd.concat([df_il_integrate,df_nc_integrate,df_ct_integrate])
df_comb

Unnamed: 0,agencyid,agencyname,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black,state
0,IL13228,Malta Police,2734,13,2,1810,3,1,574,6,0,IL
1,IL13689,Meredosia Police,1971,22,7,1876,21,7,52,0,0,IL
2,IL13981,Cherry Valley Police,11735,590,312,8692,309,166,1430,113,43,IL
3,IL13636,Toluca Police,100,1,0,92,1,0,3,0,0,IL
4,IL13756,Rock Island Police,57484,5873,1040,34975,3082,538,19253,2484,468,IL
5,IL13661,Mercer County Sheriff,6096,1456,100,5406,1276,86,270,79,7,IL
6,IL13843,Cowden Police,590,21,3,581,21,3,5,0,0,IL
7,IL13604,Wilsonville Police,371,17,10,367,17,10,1,0,0,IL
8,IL14015,Sycamore Police,22802,64,21,19188,45,17,1695,8,2,IL
9,IL13717,Monticello Police,6608,253,55,5983,174,41,420,50,12,IL


In [39]:
df_comb.to_sql('combined_traffic_stops_integrate_year',engine,if_exists='replace')