# Combine traffic data in similar format

In [27]:
from glob import glob
import sets
import datetime
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
import numpy as np
from collections import defaultdict

In [56]:
dbname = 'combined_profiling'
username = 'along528'
pswd = 'password'
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print engine.url

postgresql://along528:password@localhost/combined_profiling


In [61]:
dbname = 'combined_profiling'
username = 'along528'
pswd = 'password'
con_comb = None
con_comb = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

## Illinois traffic data

In [3]:
dbname = 'traffic_stops_il'
username = 'along528'
pswd = 'password'
con_il = None
con_il = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

In [32]:
df_race_map = defaultdict(list)
for race in ['black','white']:
    for year in range(2004,2015):
        query = """
        SELECT agencycode,agencyname,count(agencycode) AS stops_<RACE>,
        SUM(CASE WHEN searchconducted = 'Yes' THEN 1 ELSE 0 END) as searches_<RACE>,
        SUM(CASE WHEN contrabandfound = 'Yes' THEN 1 ELSE 0 END) as hits_<RACE>,
        year 
        FROM traffic_stops_<YEAR>
        WHERE race = '<RACE>'
        GROUP BY agencycode,agencyname,year
        """
        query = query.replace('<RACE>',race)
        query = query.replace('<YEAR>',str(year))
        df_race_map[race].append(pd.read_sql(query,con_il))


In [34]:
for year in range(2004,2015):
        query = """
        SELECT agencycode,agencyname,count(agencycode) AS stops_total,
        SUM(CASE WHEN searchconducted = 'Yes' THEN 1 ELSE 0 END) as searches_total,
        SUM(CASE WHEN contrabandfound = 'Yes' THEN 1 ELSE 0 END) as hits_total,
        year 
        FROM traffic_stops_<YEAR>
        GROUP BY agencycode,agencyname,year
        """
        query = query.replace('<YEAR>',str(year))
        df_race_map['total'].append(pd.read_sql(query,con_il))

In [35]:
dfs={}
for key in df_race_map:
    dfs[key] = pd.concat(df_race_map[key])


In [45]:
for key in dfs:
    print key,len(dfs[key])

white 9835
total 9854
black 8483


In [52]:
dfs['total']

Unnamed: 0,agencycode,agencyname,stops_total,searches_total,hits_total,year
0,10011,Dolton Police Department,1025,16,2,2004
1,12987,South Suburban College Police,49,2,0,2004
2,12988,South Holland Police,4671,368,58,2004
3,12989,South Chicago Heights Police,1850,84,0,2004
4,12990,Skokie Police,10376,560,0,2004
5,12991,Schiller Park Police,3116,126,21,2004
6,12992,Schaumburg Police,14799,449,29,2004
7,12993,Sauk Village Police,1990,80,20,2004
8,12994,Rosemont Police,3806,256,19,2004
9,12995,Rolling Meadows Police,7555,482,16,2004


In [55]:
df_il = pd.DataFrame(dfs['total'])
df_il = df_il.merge(dfs['white'],how='left')
df_il = df_il.merge(dfs['black'],how='left')
df_il

Unnamed: 0,agencycode,agencyname,stops_total,searches_total,hits_total,year,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black
0,10011,Dolton Police Department,1025,16,2,2004,101,0,0,891,16,2
1,12987,South Suburban College Police,49,2,0,2004,4,0,0,40,1,0
2,12988,South Holland Police,4671,368,58,2004,646,47,5,3666,287,51
3,12989,South Chicago Heights Police,1850,84,0,2004,788,25,0,841,36,0
4,12990,Skokie Police,10376,560,0,2004,6618,233,0,1249,114,0
5,12991,Schiller Park Police,3116,126,21,2004,2019,58,11,284,10,1
6,12992,Schaumburg Police,14799,449,29,2004,11241,276,19,1057,60,4
7,12993,Sauk Village Police,1990,80,20,2004,754,20,9,1043,49,8
8,12994,Rosemont Police,3806,256,19,2004,2486,110,11,455,34,0
9,12995,Rolling Meadows Police,7555,482,16,2004,5221,230,8,478,41,3


In [58]:
df_il.to_sql('il_traffic_stops_split_by_year',engine,if_exists='replace')

In [69]:
query = """
SELECT agencycode,
agencyname,
SUM(stops_total) as stops_total,
SUM(searches_total) as searches_total,
SUM(hits_total) as hits_total,
SUM(stops_white) as stops_white,
SUM(searches_white) as searches_white,
SUM(hits_white) as hits_white,
SUM(stops_black) as stops_black,
SUM(searches_black) as searches_black,
SUM(hits_black) as hits_black
FROM il_traffic_stops_split_by_year
GROUP BY agencycode,agencyname,stops_total,stops_total,stops_total;
"""
df_il_integrate = pd.read_sql(query,con_comb)
df_il_integrate

Unnamed: 0,agencycode,agencyname,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black
0,13242,Arthur Police,268,8,4,259,7,4,2,1,0
1,13806,Springfield Park District Police,645,51,11,465,23,5,170,28,6
2,13561,Harvard Police,1542,63,21,1104,15,3,37,2,0
3,13876,Deer Creek Police,888,0,0,870,0,0,11,0,0
4,13194,Chicago Police,163895,10552,3193,51133,723,276,68371,6970,2193
5,13796,Alorton Police,2753,1,0,703,0,0,2041,1,0
6,13090,Lanark Police,228,1,0,222,1,0,2,0,0
7,13569,Mclean Police,193,7,2,158,4,2,23,2,0
8,13405,Elizabeth Police,293,0,0,263,0,0,9,0,0
9,14035,Golf Police,826,19,2,496,2,0,108,5,0


In [70]:
df_il_integrate.to_sql('il_traffic_stops_integrate_year',engine,if_exists='replace')