# Combine traffic-stop data from three states into a common format

In [1]:
import datetime
import os
import sets
from collections import defaultdict
from glob import glob

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

  from ipykernel import kernelapp as app


In [2]:
# Connection settings for the combined-output database.  User and password are
# read from the libpq environment variables PGUSER / PGPASSWORD when they are
# set; the literals are only the original local-development fallbacks.
dbname = 'combined_profiling'
username = os.environ.get('PGUSER', 'along528')
pswd = os.environ.get('PGPASSWORD', 'password')
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
# Show the target without writing the password into the saved notebook output.
# The parenthesised single-argument form prints the same text on Python 2 and 3.
print('postgresql://%s:***@localhost/%s' % (username, dbname))

postgresql://along528:password@localhost/combined_profiling


In [3]:
# Raw psycopg2 connection to the combined-output database; the pd.read_sql
# calls that aggregate the per-year tables use this connection.
# Credentials come from PGUSER / PGPASSWORD when set (literals are the
# original local fallbacks).
dbname = 'combined_profiling'
username = os.environ.get('PGUSER', 'along528')
pswd = os.environ.get('PGPASSWORD', 'password')
con_comb = psycopg2.connect(database=dbname, user=username, host='localhost', password=pswd)

## Illinois traffic data

In [None]:
# psycopg2 connection to the Illinois source database (one stops table per year).
# Credentials come from PGUSER / PGPASSWORD when set (literals are the
# original local fallbacks).
dbname = 'traffic_stops_il'
username = os.environ.get('PGUSER', 'along528')
pswd = os.environ.get('PGPASSWORD', 'password')
con_il = psycopg2.connect(database=dbname, user=username, host='localhost', password=pswd)

Even though we have data from 2004–2014, the Chicago PD reported 'Yes' for `contrabandfound` for every stop from 2004 to 2006. Since this is suspect, let's start from 2007.

In [None]:

# Illinois is stored as one table per year.  Only 2007-2014 are read (see the
# note above this cell about the 2004-2006 contraband data).
il_year_start, il_year_stop = 2007, 2014

# Per-agency stop, search and contraband-hit counts for one race within one
# yearly table; <RACE> and <YEAR> are filled in per iteration.
il_race_template = """
        SELECT agencycode,agencyname,count(agencycode) AS stops_<RACE>,
        SUM(CASE WHEN searchconducted = 'Yes' THEN 1 ELSE 0 END) as searches_<RACE>,
        SUM(CASE WHEN contrabandfound = 'Yes' THEN 1 ELSE 0 END) as hits_<RACE>,
        year 
        FROM traffic_stops_<YEAR>
        WHERE race = '<RACE>'
        GROUP BY agencycode,agencyname,year
        """

# race -> list of per-year result frames
df_race_map = defaultdict(list)
for race_label in ['black', 'white']:
    for stop_year in range(il_year_start, il_year_stop + 1):
        filled_sql = (il_race_template
                      .replace('<RACE>', race_label)
                      .replace('<YEAR>', str(stop_year)))
        yearly_counts = pd.read_sql(filled_sql, con_il)
        df_race_map[race_label].append(yearly_counts)


In [None]:
# Same per-agency counts as the per-race query, but over all stops (no race
# filter); one result frame per yearly table.
# Start from an empty list so that re-running this cell does not append a
# second copy of every year, which would double every total downstream.
df_race_map['total'] = []
for year in range(il_year_start,il_year_stop+1):
    query = """
        SELECT agencycode,agencyname,count(agencycode) AS stops_total,
        SUM(CASE WHEN searchconducted = 'Yes' THEN 1 ELSE 0 END) as searches_total,
        SUM(CASE WHEN contrabandfound = 'Yes' THEN 1 ELSE 0 END) as hits_total,
        year 
        FROM traffic_stops_<YEAR>
        GROUP BY agencycode,agencyname,year
        """
    query = query.replace('<YEAR>',str(year))
    df_race_map['total'].append(pd.read_sql(query,con_il))

In [None]:
# Stack the per-year frames of each group ('black', 'white', 'total') into a
# single frame per group.
dfs = {group: pd.concat(yearly_frames)
       for group, yearly_frames in df_race_map.items()}

In [None]:
# Number of rows (agency/year combinations) in each group's frame.
# Formatting the line first keeps the output identical on Python 2 and 3
# ("<group> <count>"), unlike the Python-2-only print statement.
for key in dfs:
    print('%s %s' % (key, len(dfs[key])))

In [None]:
# Preview only the first rows instead of dumping the whole frame into the notebook.
dfs['total'].head(10)

In [None]:
# Put total, white and black counts side by side for every Illinois agency and
# year.  The merges join on the columns the frames share; agencies with no
# white/black rows for a year get 0 instead of NaN.
df_il = (
    pd.DataFrame(dfs['total'])
    .merge(dfs['white'], how='left')
    .merge(dfs['black'], how='left')
    .fillna(0)
)
# Use the same id column name as the other states.
df_il = df_il.rename(columns=lambda col: col.replace('agencycode', 'agencyid'))
df_il

In [None]:
# Store the per-year Illinois table in the combined database, replacing any
# earlier copy.
df_il.to_sql(
    name='il_traffic_stops_split_by_year',
    con=engine,
    if_exists='replace',
)

In [None]:
# Collapse the per-year Illinois table to one row per agency by summing every
# count column over all years.
il_integrate_sql = """
SELECT agencyid,
agencyname,
SUM(stops_total) as stops_total,
SUM(searches_total) as searches_total,
SUM(hits_total) as hits_total,
SUM(stops_white) as stops_white,
SUM(searches_white) as searches_white,
SUM(hits_white) as hits_white,
SUM(stops_black) as stops_black,
SUM(searches_black) as searches_black,
SUM(hits_black) as hits_black
FROM il_traffic_stops_split_by_year
GROUP BY agencyid,agencyname;
"""
df_il_integrate = pd.read_sql(sql=il_integrate_sql, con=con_comb)
df_il_integrate

In [None]:
# Store the year-integrated Illinois table, replacing any earlier copy.
df_il_integrate.to_sql(
    name='il_traffic_stops_integrate_year',
    con=engine,
    if_exists='replace',
)

**TODO:** Should I remove departments that have 0 in some field? Maybe not for hits, but for searches and stops, yes.

## Connecticut

In [4]:
# psycopg2 connection to the Connecticut source database.
# Credentials come from PGUSER / PGPASSWORD when set (literals are the
# original local fallbacks).
dbname = 'traffic_stops_ct'
username = os.environ.get('PGUSER', 'along528')
pswd = os.environ.get('PGPASSWORD', 'password')
con_ct = psycopg2.connect(database=dbname, user=username, host='localhost', password=pswd)

In [5]:
# Per-agency, per-year stop / search / hit counts for one race; Connecticut
# keeps all years in a single `stops` table.  <RACE> is filled in below.
ct_race_template = """
        SELECT agencyname,year,count(agencyid) AS stops_<RACE>,
        SUM(CASE WHEN searchconducted = 'Yes' THEN 1 ELSE 0 END) as searches_<RACE>,
        SUM(CASE WHEN contrabandfound = 'Yes' THEN 1 ELSE 0 END) as hits_<RACE>
        FROM stops
        WHERE race = '<RACE>'
        GROUP BY agencyname,year
        """

# race -> result frame
df_race_map_ct = defaultdict(list)
for race_label in ('white', 'black'):
    race_sql = ct_race_template.replace('<RACE>', race_label)
    df_race_map_ct[race_label] = pd.read_sql(race_sql, con_ct)

In [6]:
# Same counts over all Connecticut stops (no race filter).
ct_total_sql = """
SELECT agencyname,year,count(agencyid) AS stops_total,
SUM(CASE WHEN searchconducted = 'Yes' THEN 1 ELSE 0 END) as searches_total,
SUM(CASE WHEN contrabandfound = 'Yes' THEN 1 ELSE 0 END) as hits_total
FROM stops
GROUP BY agencyname,year
"""
df_race_map_ct['total'] = pd.read_sql(sql=ct_total_sql, con=con_ct)

In [7]:
# Preview only the first rows instead of dumping the whole frame into the notebook.
df_race_map_ct['black'].head(10)

Unnamed: 0,agencyname,year,stops_black,searches_black,hits_black
0,Plymouth,2015,44,3,0
1,Darien,2014,427,27,17
2,New Britain,2013,162,9,6
3,Weston,2015,11,0,0
4,Milford,2014,548,112,27
5,Groton Long Point,2015,3,0,0
6,Granby,2014,81,1,1
7,Portland,2014,11,1,2
8,Naugatuck,2015,150,11,4
9,Manchester,2014,958,53,30


In [8]:
# Put total, white and black counts side by side for every Connecticut agency
# and year; missing values become 0 (this also turns a missing year into 0).
ct_counts = (
    pd.DataFrame(df_race_map_ct['total'])
    .merge(df_race_map_ct['white'], how='left')
    .merge(df_race_map_ct['black'], how='left')
    .fillna(0)
)
# After fillna(0) a missing agency name is the number 0; drop those rows.
has_agency_name = ct_counts['agencyname'] != 0
df_ct = ct_counts[has_agency_name]
df_ct

Unnamed: 0,agencyname,year,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black
1,Plymouth,2015,845,24,6,794,21,6,44,3,0
2,New Britain,2013,1029,43,25,861,34,19,162,9,6
3,Weston,2015,143,1,1,131,1,1,11,0,0
4,Milford,2014,4358,422,142,3711,309,115,548,112,27
5,Granby,2014,1348,27,26,1258,26,25,81,1,1
6,Portland,2014,162,2,3,150,1,1,11,1,2
7,Naugatuck,2015,1555,96,36,1382,85,32,150,11,4
8,Putnam,2015,295,10,6,280,10,6,13,0,0
9,Derby,0,1,0,0,1,0,0,0,0,0
10,Berlin,2014,6925,290,60,6164,237,48,639,50,10


In [9]:
# Store the per-year Connecticut table (before agency ids are attached),
# replacing any earlier copy.
df_ct.to_sql(
    name='ct_traffic_stops_split_by_year_noagencyid',
    con=engine,
    if_exists='replace',
)

In [10]:
# Collapse the per-year Connecticut table to one row per agency by summing
# every count column over all years.
# Read from the '..._noagencyid' table written just above.  The table
# 'ct_traffic_stops_split_by_year' is only created further down, after the
# agency ids built here have been mapped back, so querying it at this point
# fails on a fresh database or silently reuses a previous run's data.
query = """
SELECT 
agencyname,
SUM(stops_total) as stops_total,
SUM(searches_total) as searches_total,
SUM(hits_total) as hits_total,
SUM(stops_white) as stops_white,
SUM(searches_white) as searches_white,
SUM(hits_white) as hits_white,
SUM(stops_black) as stops_black,
SUM(searches_black) as searches_black,
SUM(hits_black) as hits_black
FROM ct_traffic_stops_split_by_year_noagencyid
GROUP BY agencyname;
"""
df_ct_integrate = pd.read_sql(query,con_comb)
# Build a unique agency id from the row number of each agency in this result.
# NOTE(review): the query has no ORDER BY, so the ids depend on the row order
# the database happens to return.
df_ct_integrate = df_ct_integrate.reset_index().rename(columns={'index': 'agencyid'})
df_ct_integrate

Unnamed: 0,agencyid,agencyname,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black
0,0,Manchester,5474,203,125,3937,126,78,1357,76,43
1,1,Portland,203,4,4,191,3,2,11,1,2
2,2,Granby,1978,38,34,1864,35,33,105,3,1
3,3,Milford,5200,523,207,4410,381,161,668,139,45
4,4,Darien,4944,144,62,4242,110,44,564,33,18
5,5,Derby,5000,437,18,4228,346,17,709,89,1
6,6,Easton,672,8,0,640,8,0,25,0,0
7,7,Norwich,9817,625,184,7339,433,127,2008,184,56
8,8,West Haven,4086,149,18,2997,102,11,1031,47,7
9,9,Groton Town,8100,151,83,6843,115,64,1065,35,18


In [11]:
# Store the year-integrated Connecticut table, replacing any earlier copy.
df_ct_integrate.to_sql(
    name='ct_traffic_stops_integrate_year',
    con=engine,
    if_exists='replace',
)

In [12]:
# Attach the agency id built for the integrated table to every per-year row,
# matching on agency name.
ct_id_map = df_ct_integrate[['agencyname','agencyid']].set_index('agencyname')
# Build a new frame with assign() rather than setting a column on df_ct:
# df_ct was produced by a boolean filter, so in-place column assignment
# triggers pandas' SettingWithCopyWarning.
df_ct = df_ct.assign(agencyid=df_ct['agencyname'].map(ct_id_map['agencyid']))
df_ct

Unnamed: 0,agencyname,year,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,hits_black,agencyid
1,Plymouth,2015,845,24,6,794,21,6,44,3,0,50
2,New Britain,2013,1029,43,25,861,34,19,162,9,6,72
3,Weston,2015,143,1,1,131,1,1,11,0,0,49
4,Milford,2014,4358,422,142,3711,309,115,548,112,27,3
5,Granby,2014,1348,27,26,1258,26,25,81,1,1,2
6,Portland,2014,162,2,3,150,1,1,11,1,2,1
7,Naugatuck,2015,1555,96,36,1382,85,32,150,11,4,46
8,Putnam,2015,295,10,6,280,10,6,13,0,0,44
9,Derby,0,1,0,0,1,0,0,0,0,0,5
10,Berlin,2014,6925,290,60,6164,237,48,639,50,10,12


In [None]:
# Store the per-year Connecticut table, now including agencyid, replacing any
# earlier copy.
df_ct.to_sql(
    name='ct_traffic_stops_split_by_year',
    con=engine,
    if_exists='replace',
)

## North Carolina

In [None]:
#already pre-processed nc data
#note that these only include those already matched to the survey data
#unlike IL and CT which have not yet been matched
# Credentials come from PGUSER / PGPASSWORD when set (literals are the
# original local fallbacks).
dbname = 'traffic_police_combined'
username = os.environ.get('PGUSER', 'along528')
pswd = os.environ.get('PGPASSWORD', 'password')
con_nc = psycopg2.connect(database=dbname, user=username, host='localhost', password=pswd)

In [None]:
# psycopg2 connection to the North Carolina source database (read below for
# the nc_agency table).
# Credentials come from PGUSER / PGPASSWORD when set (literals are the
# original local fallbacks).
dbname = 'traffic_stops_nc'
username = os.environ.get('PGUSER', 'along528')
pswd = os.environ.get('PGPASSWORD', 'password')
con_nc2 = psycopg2.connect(database=dbname, user=username, host='localhost', password=pswd)

In [None]:
# North Carolina agency table; its `name` column is used below to label agencies.
df_nc_agency = pd.read_sql(sql='SELECT * FROM nc_agency', con=con_nc2)
df_nc_agency

In [None]:
# Load the pre-processed North Carolina per-race tables (one per measure),
# normalise the agency id column name and attach agency names.
#   frames_new[tag]          -- one row per agency and year
#   frames_new_sumyears[tag] -- summed over years, indexed by agencyid
frames_new = {}
frames_new_sumyears = {}
# NOTE(review): this lookup is keyed by df_nc_agency's row index, not by an id
# column, so it is only right if row position equals agency id -- confirm
# against the nc_agency schema.
nc_agency_names = df_nc_agency['name']
for tag in ["stops","hits","searches","force"]:
    sql_query = "SELECT * FROM %s_by_race;" % (tag)
    by_year = pd.read_sql_query(sql_query,con_nc)
    by_year = by_year.rename(columns=lambda x: x.replace('agency_id','agencyid'))
    by_year['agencyname'] = by_year['agencyid'].map(nc_agency_names)
    frames_new[tag] = by_year
    # sum by year; the stored row index, the year and the name are not counts
    summed = by_year.drop(['index','year','agencyname'],axis=1).groupby('agencyid').sum()
    # Look the names up from the agencyid index of the summed frame.  Assigning
    # the per-year name column here would align on row number instead of
    # agency id and attach the wrong names.
    summed['agencyname'] = summed.index.to_series().map(nc_agency_names)
    frames_new_sumyears[tag] = summed


In [None]:
# One row per North Carolina agency and year with stop, search and hit counts
# side by side.  The three frames come from separate queries without ORDER BY,
# so join them on (agencyid, year) instead of relying on matching row positions.
nc_keys = ['agencyid', 'year']
df_nc = (
    frames_new['stops'][['agencyid','agencyname','year',
                         'stops_total','stops_white','stops_black']]
    .merge(frames_new['searches'][nc_keys + ['searches_total','searches_white','searches_black']],
           on=nc_keys, how='left')
    .merge(frames_new['hits'][nc_keys + ['hits_total','hits_white','hits_black']],
           on=nc_keys, how='left')
)
# A left join must not add rows; if it does, (agencyid, year) is not unique in
# one of the inputs and counts would be duplicated.
assert len(df_nc) == len(frames_new['stops']), "duplicate (agencyid, year) keys in NC tables"
df_nc = df_nc.fillna(0)
df_nc

In [None]:
# Store the per-year North Carolina table, replacing any earlier copy.
df_nc.to_sql(
    name='nc_traffic_stops_split_by_year',
    con=engine,
    if_exists='replace',
)

In [None]:
# Preview only the first rows instead of dumping the whole frame into the notebook.
frames_new['hits'].head(10)

In [None]:
# Collapse the per-year North Carolina table to one row per agency by summing
# every count column over all years.
nc_integrate_sql = """
SELECT agencyid,
agencyname,
SUM(stops_total) as stops_total,
SUM(searches_total) as searches_total,
SUM(hits_total) as hits_total,
SUM(stops_white) as stops_white,
SUM(searches_white) as searches_white,
SUM(hits_white) as hits_white,
SUM(stops_black) as stops_black,
SUM(searches_black) as searches_black,
SUM(hits_black) as hits_black
FROM nc_traffic_stops_split_by_year
GROUP BY agencyid,agencyname;
"""
df_nc_integrate = pd.read_sql(sql=nc_integrate_sql, con=con_comb)
df_nc_integrate

In [None]:
# Store the year-integrated North Carolina table, replacing any earlier copy.
df_nc_integrate.to_sql(
    name='nc_traffic_stops_integrate_year',
    con=engine,
    if_exists='replace',
)

## Combine all three

In [None]:
def _tag_agencyid(frame, state):
    """Prefix every agencyid in `frame` with `state` and add a `state` column.

    Ids are converted with str().  An id that already starts with the state
    code is left alone, so re-running this cell does not stack prefixes
    (e.g. 'ILIL123').  `frame` is modified in place.
    """
    def with_prefix(agencyid):
        text = str(agencyid)
        return text if text.startswith(state) else state + text

    frame['agencyid'] = frame['agencyid'].map(with_prefix)
    frame['state'] = state


# Make agency ids unique across states before the three frames are stacked.
# Connecticut is tagged as well: its ids are plain row numbers (0, 1, 2, ...)
# from reset_index(), so without the prefix the combined table would mix bare
# integers with 'IL...' / 'NC...' strings.
_tag_agencyid(df_il_integrate, 'IL')
_tag_agencyid(df_nc_integrate, 'NC')
_tag_agencyid(df_ct_integrate, 'CT')

In [None]:
# Stack the three per-agency state tables into one frame.
state_frames = [df_il_integrate, df_nc_integrate, df_ct_integrate]
df_comb = pd.concat(state_frames)
df_comb

In [None]:
# Store the combined three-state table, replacing any earlier copy.
df_comb.to_sql(
    name='combined_traffic_stops_integrate_year',
    con=engine,
    if_exists='replace',
)