# Combine traffic data in similar format

In [1]:
from glob import glob
import sets
import datetime
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
import numpy as np
from collections import defaultdict

  from ipykernel import kernelapp as app


In [2]:
dbname = 'combined_profiling'
username = 'along528'
pswd = 'password'
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print engine.url

postgresql://along528:password@localhost/combined_profiling


In [3]:
dbname = 'combined_profiling'
username = 'along528'
pswd = 'password'
con_comb = None
con_comb = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

## Illinois traffic data

In [4]:
dbname = 'traffic_stops_il'
username = 'along528'
pswd = 'password'
con_il = None
con_il = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

Even though we have data from 2004 - 2014, the Chicago PD reported 'Yes' for contrabandfound for every stop from 2004-2006. Since this is suspect let's just start from 2007.

In [5]:

il_year_start = 2007 
il_year_stop = 2014
df_race_map = defaultdict(list)
for race in ['black','white']:
    for year in range(il_year_start,il_year_stop+1):
        query = """
        SELECT agencycode,agencyname,count(agencycode) AS stops_<RACE>,
        SUM(CASE WHEN searchconducted = 'Yes' THEN 1 ELSE 0 END) as searches_<RACE>,
        SUM(CASE WHEN contrabandfound = 'Yes' THEN 1 ELSE 0 END) as hits_<RACE>,
        year 
        FROM traffic_stops_<YEAR>
        WHERE race = '<RACE>'
        GROUP BY agencycode,agencyname,year
        """
        query = query.replace('<RACE>',race)
        query = query.replace('<YEAR>',str(year))
        df_race_map[race].append(pd.read_sql(query,con_il))


In [6]:
for year in range(il_year_start,il_year_stop+1):
        query = """
        SELECT agencycode,agencyname,count(agencycode) AS stops_total,
        SUM(CASE WHEN searchconducted = 'Yes' THEN 1 ELSE 0 END) as searches_total,
        SUM(CASE WHEN contrabandfound = 'Yes' THEN 1 ELSE 0 END) as hits_total,
        year 
        FROM traffic_stops_<YEAR>
        GROUP BY agencycode,agencyname,year
        """
        query = query.replace('<YEAR>',str(year))
        df_race_map['total'].append(pd.read_sql(query,con_il))

In [7]:
dfs={}
for key in df_race_map:
    dfs[key] = pd.concat(df_race_map[key])

In [8]:
for key in dfs:
    print key,len(dfs[key])

white 6919
total 6934
black 5972


In [9]:
dfs['total']

Unnamed: 0,agencycode,agencyname,stops_total,searches_total,hits_total,year
0,10011,Dolton Police Department,681,6,3,2007
1,12987,South Suburban College Police,147,0,0,2007
2,12988,South Holland Police,4633,421,63,2007
3,12989,South Chicago Heights Police,793,15,2,2007
4,12990,Skokie Police,17638,883,101,2007
5,12991,Schiller Park Police,1817,47,7,2007
6,12992,Schaumburg Police,8784,58,4,2007
7,12993,Sauk Village Police,1686,149,43,2007
8,12994,Rosemont Police,810,104,10,2007
9,12995,Rolling Meadows Police,6359,588,75,2007


In [10]:
df_il = pd.DataFrame(dfs['total'])
df_il = df_il.merge(dfs['white'],how='left')
df_il = df_il.merge(dfs['black'],how='left')
df_il = df_il.fillna(0)
df_il.rename(columns=lambda x: x.replace('agencycode','agencyid'),inplace=True)
df_il

Unnamed: 0,agencyid,agencyname,stops_total,searches_total,hits_total,year,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black
0,10011,Dolton Police Department,681,6,3,2007,59,1,0,603,5,3
1,12987,South Suburban College Police,147,0,0,2007,24,0,0,111,0,0
2,12988,South Holland Police,4633,421,63,2007,527,22,5,3837,362,53
3,12989,South Chicago Heights Police,793,15,2,2007,227,6,1,458,4,1
4,12990,Skokie Police,17638,883,101,2007,11743,329,52,2086,216,27
5,12991,Schiller Park Police,1817,47,7,2007,1302,19,2,168,3,1
6,12992,Schaumburg Police,8784,58,4,2007,6490,32,2,707,14,1
7,12993,Sauk Village Police,1686,149,43,2007,525,40,15,999,92,25
8,12994,Rosemont Police,810,104,10,2007,496,43,5,90,13,2
9,12995,Rolling Meadows Police,6359,588,75,2007,4350,245,29,501,51,10


In [11]:
df_il.to_sql('il_traffic_stops_split_by_year',engine,if_exists='replace')

In [65]:
query = """
SELECT agencyid,
agencyname,
SUM(stops_total) as stops_total,
SUM(searches_total) as searches_total,
SUM(hits_total) as hits_total,
SUM(stops_white) as stops_white,
SUM(searches_white) as searches_white,
SUM(hits_white) as hits_white,
SUM(stops_black) as stops_black,
SUM(searches_black) as searches_black,
SUM(hits_black) as hits_black
FROM il_traffic_stops_split_by_year
GROUP BY agencyid,agencyname;
"""
df_il_integrate = pd.read_sql(query,con_comb)
df_il_integrate

Unnamed: 0,agencyid,agencyname,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black
0,13228,Malta Police,2734,13,2,1810,3,1,574,6,0
1,13689,Meredosia Police,1971,22,7,1876,21,7,52,0,0
2,13981,Cherry Valley Police,11735,590,312,8692,309,166,1430,113,43
3,13636,Toluca Police,100,1,0,92,1,0,3,0,0
4,13756,Rock Island Police,57484,5873,1040,34975,3082,538,19253,2484,468
5,13661,Mercer County Sheriff,6096,1456,100,5406,1276,86,270,79,7
6,13843,Cowden Police,590,21,3,581,21,3,5,0,0
7,13604,Wilsonville Police,371,17,10,367,17,10,1,0,0
8,14015,Sycamore Police,22802,64,21,19188,45,17,1695,8,2
9,13717,Monticello Police,6608,253,55,5983,174,41,420,50,12


In [13]:
df_il_integrate.to_sql('il_traffic_stops_integrate_year',engine,if_exists='replace')

Should I remove those departments where there are 0 in some field? Maybe not for hits. but for searches and stops, yes

## Connecticut

In [14]:
dbname = 'traffic_stops_ct'
username = 'along528'
pswd = 'password'
con_ct = None
con_ct = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

In [16]:
df_race_map_ct=defaultdict(list)
for race in ['white','black']:
    query = """
        SELECT agencyid,agencyname,year,count(agencyid) AS stops_<RACE>,
        SUM(CASE WHEN searchconducted = 'Yes' THEN 1 ELSE 0 END) as searches_<RACE>,
        SUM(CASE WHEN contrabandfound = 'Yes' THEN 1 ELSE 0 END) as hits_<RACE>
        FROM stops
        WHERE race = '<RACE>'
        GROUP BY agencyid,agencyname,year
        """
    query = query.replace('<RACE>',race)
    df_race_map_ct[race] = pd.read_sql(query,con_ct)

In [17]:
query = """
SELECT agencyid,agencyname,year,count(agencyid) AS stops_total,
SUM(CASE WHEN searchconducted = 'Yes' THEN 1 ELSE 0 END) as searches_total,
SUM(CASE WHEN contrabandfound = 'Yes' THEN 1 ELSE 0 END) as hits_total
FROM stops
GROUP BY agencyid,agencyname,year
"""
df_race_map_ct['total'] = pd.read_sql(query,con_ct)

In [18]:
df_race_map_ct['black']

Unnamed: 0,agencyid,agencyname,year,stops_black,searches_black,hits_black
0,CT0004700,East Windsor,2014,135,7,1
1,CT0013500,Stamford,2013,332,18,2
2,CT0008800,Naugatuck,2014,730,58,18
3,CT0005900,Groton City,2015,181,5,4
4,CT0015201,Waterford,2015,291,15,6
5,CT0015800,Westport,2014,256,20,10
6,CT0005700,Greenwich,2014,694,23,4
7,CT0007700,Manchester,2014,957,53,30
8,CT0007600,Madison,2015,15,0,0
9,CT0011000,Plainville,2013,93,6,0


In [19]:
df_ct = pd.DataFrame(df_race_map_ct['total'])
df_ct = df_ct.merge(df_race_map_ct['white'],how='left')
df_ct = df_ct.merge(df_race_map_ct['black'],how='left')
df_ct = df_ct.fillna(0)
df_ct = df_ct[df_ct['agencyid']!=0]
df_ct

Unnamed: 0,agencyid,agencyname,year,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black
0,CT0000200,Ansonia,2013,733,12,1,616,10,0,112,2,1
1,CT0000200,Ansonia,2014,4634,86,16,3865,67,10,734,19,6
2,CT0000200,Ansonia,2015,1701,67,10,1384,55,9,293,12,1
3,CT0000400,Avon,2014,811,8,8,728,8,8,69,0,0
4,CT0000400,Avon,2015,417,6,5,375,5,5,28,1,0
5,CT0000700,Berlin,2013,826,56,10,745,47,9,79,9,1
6,CT0000700,Berlin,2014,6925,290,60,6164,237,48,639,50,10
7,CT0000700,Berlin,2015,1215,39,19,1056,35,19,132,2,0
8,CT0000900,Bethel,2014,1801,22,15,1675,19,12,88,3,3
9,CT0000900,Bethel,2015,352,6,5,336,6,5,11,0,0


In [20]:
df_ct.to_sql('ct_traffic_stops_split_by_year',engine,if_exists='replace')

In [66]:
query = """
SELECT agencyid,
agencyname,
SUM(stops_total) as stops_total,
SUM(searches_total) as searches_total,
SUM(hits_total) as hits_total,
SUM(stops_white) as stops_white,
SUM(searches_white) as searches_white,
SUM(hits_white) as hits_white,
SUM(stops_black) as stops_black,
SUM(searches_black) as searches_black,
SUM(hits_black) as hits_black
FROM ct_traffic_stops_split_by_year
GROUP BY agencyid,agencyname;
"""
df_ct_integrate = pd.read_sql(query,con_comb)
df_ct_integrate

Unnamed: 0,agencyid,agencyname,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black
0,CTCSP0600,Plymouth,1,0,0,1,0,0,0,0,0
1,CT0010400,Norwich,1,1,1,1,1,1,0,0,0
2,CT0006016,Guilford,203,2,0,195,2,0,6,0,0
3,CT0013700,Stonington,2960,15,8,2788,13,6,118,2,2
4,CT0014001,Thomaston,3,1,0,3,1,0,0,0,0
5,CTCSP0700,State Police,37024,307,123,32497,228,101,3594,79,22
6,CT0011800,Ridgefield,10971,46,12,10294,43,12,406,3,0
7,CT0009301,New Haven,2343,142,48,1182,46,22,1122,96,26
8,CT0011100,Manchester,2,0,0,1,0,0,1,0,0
9,CT0004300,East Hartford,10990,405,210,6712,208,106,4100,196,103


In [25]:
df_ct_integrate.to_sql('ct_traffic_stops_integrate_year',engine,if_exists='replace')

## North Carolina

In [71]:
#already pre-processed nc data
#note that these only include those already matched to the survey data
#unlike IL and CT which have not yet been matched
dbname = 'traffic_police_combined' 
username = 'along528'
pswd = 'password'
con_nc = None
con_nc = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

In [30]:
dbname = 'traffic_stops_nc'
username = 'along528'
pswd = 'password'
con_nc2 = None
con_nc2 = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

In [31]:
df_nc_agency = pd.read_sql('SELECT * FROM nc_agency',con_nc2)
df_nc_agency

Unnamed: 0,id,name
0,1,Aberdeen Police Department
1,2,Alamance County Sheriff's Office
2,3,Albemarle Police Department
3,4,Alexander County Sheriff's Office
4,5,Alleghany County Sheriff's Office
5,6,Andrews Police Department
6,7,Anson County Sheriff's Office
7,8,Apex Police Department
8,9,Appalachian State University Police Department
9,10,Archdale Police Department


In [45]:
frames_new = {}
frames_new_sumyears = {}
for tag in ["stops","hits","searches","force"]: 
    sql_query = "SELECT * FROM %s_by_race;" % (tag)
    frames_new[tag] = pd.read_sql_query(sql_query,con_nc)
    #have same column names for easily building ratios
    #frames_new[tag].rename(columns=lambda x: x.replace(tag+'_',''),inplace=True)
    frames_new[tag].rename(columns=lambda x: x.replace('agency_id','agencyid'),inplace=True)
    frames_new[tag]['agencyname'] =  frames_new[tag]['agencyid'].map(df_nc_agency['name'])
    #sum by year
    frames_new_sumyears[tag] = frames_new[tag].groupby(['agencyid']).sum().drop(['index','year'],1)
    frames_new_sumyears[tag]['agencyname'] =  frames_new[tag]['agencyid'].map(df_nc_agency['name'])



In [48]:
df_nc = pd.concat([frames_new['stops'][['agencyid','agencyname','year',
                                        'stops_total','stops_white','stops_black']],
                   frames_new['searches'][['searches_total','searches_white','searches_black']],
                   frames_new['hits'][['hits_total','hits_white','hits_black']]],axis=1)
df_nc = df_nc.fillna(0)
df_nc

Unnamed: 0,agencyid,agencyname,year,stops_total,stops_white,stops_black,searches_total,searches_white,searches_black,hits_total,hits_white,hits_black
0,2,Albemarle Police Department,2002,954,532,367,269,154,95,49,25,18
1,2,Albemarle Police Department,2003,651,423,155,127,87,23,23,15,4
2,2,Albemarle Police Department,2004,1725,969,508,305,155,81,78,38,31
3,2,Albemarle Police Department,2005,478,300,147,76,43,24,21,11,7
4,2,Albemarle Police Department,2006,370,257,75,18,11,3,7,4,0
5,2,Albemarle Police Department,2007,426,280,113,36,19,9,8,5,1
6,2,Albemarle Police Department,2008,700,501,182,62,45,12,15,11,2
7,2,Albemarle Police Department,2009,2761,1870,861,447,300,139,124,77,43
8,2,Albemarle Police Department,2010,4352,2988,1311,615,398,209,207,129,76
9,2,Albemarle Police Department,2011,3698,2585,1076,486,316,167,145,92,50


In [49]:
df_nc.to_sql('nc_traffic_stops_split_by_year',engine,if_exists='replace')

In [41]:
frames_new['hits']

Unnamed: 0,index,year,asian,black,native_american,other,white,agencyid,total,agencyname
0,0,2002,0,18,1,3,25,2,49,Albemarle Police Department
1,1,2003,0,4,0,2,15,2,23,Albemarle Police Department
2,2,2004,0,31,0,7,38,2,78,Albemarle Police Department
3,3,2005,0,7,0,1,11,2,21,Albemarle Police Department
4,4,2006,0,0,0,1,4,2,7,Albemarle Police Department
5,5,2007,0,1,0,0,5,2,8,Albemarle Police Department
6,6,2008,0,2,0,0,11,2,15,Albemarle Police Department
7,7,2009,1,43,0,1,77,2,124,Albemarle Police Department
8,8,2010,0,76,0,0,129,2,207,Albemarle Police Department
9,9,2011,1,50,0,0,92,2,145,Albemarle Police Department


In [67]:
query = """
SELECT agencyid,
agencyname,
SUM(stops_total) as stops_total,
SUM(searches_total) as searches_total,
SUM(hits_total) as hits_total,
SUM(stops_white) as stops_white,
SUM(searches_white) as searches_white,
SUM(hits_white) as hits_white,
SUM(stops_black) as stops_black,
SUM(searches_black) as searches_black,
SUM(hits_black) as hits_black
FROM nc_traffic_stops_split_by_year
GROUP BY agencyid,agencyname;
"""
df_nc_integrate = pd.read_sql(query,con_comb)
df_nc_integrate

Unnamed: 0,agencyid,agencyname,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black
0,235,Rolesville Police Department,74673,4502,4423,18286,373,283,51730,482,382
1,147,Lenoir Police Department,6940,3280,2916,2786,531,401,2263,473,368
2,180,Murfreesboro Police Department,13280,35165,1980,8230,18858,143,2796,9270,50
3,69,DHHS Police - Black Mountain,45814,5469,1747,39882,2921,353,4009,885,248
4,169,Mitchell County Sheriff's Office,25249,3570,2735,15285,628,168,5304,448,82
5,68,Currituck County Sheriff's Office,59640,4588,2356,28990,2699,986,25926,732,262
6,51,Chatham County Sheriff's Office,1601601,34654,1352,722231,9192,373,804263,24027,179
7,123,Hope Mills Police Department,44196,3510,2119,30216,1241,531,10375,811,178
8,294,Washington Police Department,6339,10222,0,1485,2853,0,1213,3251,0
9,49,Chapel Hill Police Department,13218,3161,3434,10322,1593,1044,1339,755,1543


In [51]:
df_nc_integrate.to_sql('nc_traffic_stops_integrate_year',engine,if_exists='replace')

## Combine all three

In [68]:
df_il_integrate['agencyid'] = df_il_integrate['agencyid'].map(lambda x: "IL"+str(x))
df_il_integrate['state'] = 'IL'
df_nc_integrate['agencyid'] = df_nc_integrate['agencyid'].map(lambda x: "NC"+str(x))
df_nc_integrate['state'] = 'NC'
#already has CT tag
#df_ct_integrate['agencyid'] = df_ct_integrate['agencyid'].map(lambda x: "CT"+str(x))
df_ct_integrate['state'] = 'CT'

In [69]:
df_comb = pd.concat([df_il_integrate,df_nc_integrate,df_ct_integrate])
df_comb

Unnamed: 0,agencyid,agencyname,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black,state
0,IL13228,Malta Police,2734,13,2,1810,3,1,574,6,0,IL
1,IL13689,Meredosia Police,1971,22,7,1876,21,7,52,0,0,IL
2,IL13981,Cherry Valley Police,11735,590,312,8692,309,166,1430,113,43,IL
3,IL13636,Toluca Police,100,1,0,92,1,0,3,0,0,IL
4,IL13756,Rock Island Police,57484,5873,1040,34975,3082,538,19253,2484,468,IL
5,IL13661,Mercer County Sheriff,6096,1456,100,5406,1276,86,270,79,7,IL
6,IL13843,Cowden Police,590,21,3,581,21,3,5,0,0,IL
7,IL13604,Wilsonville Police,371,17,10,367,17,10,1,0,0,IL
8,IL14015,Sycamore Police,22802,64,21,19188,45,17,1695,8,2,IL
9,IL13717,Monticello Police,6608,253,55,5983,174,41,420,50,12,IL


In [70]:
df_comb.to_sql('combined_traffic_stops_integrate_year',engine,if_exists='replace')