# Produce combined table with features and labels

In [1]:
from glob import glob
import sets
import datetime
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
import numpy as np
from collections import defaultdict

  from ipykernel import kernelapp as app


In [2]:
dbname = 'combined_profiling'
username = 'along528'
pswd = 'password'
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print engine.url

postgresql://along528:password@localhost/combined_profiling


In [3]:
dbname = 'combined_profiling'
username = 'along528'
pswd = 'password'
con_comb = None
con_comb = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

In [4]:
query="""
SELECT traffic.*, census.*,survey.*
FROM combined_traffic_stops_integrate_year_surveyid traffic
JOIN census_population_sum census ON traffic.surveyid = census.surveyid
JOIN police_surveys survey ON census.surveyid = survey.surveyid;
"""
joined_db = pd.read_sql(query,con_comb)
#get rid of duplicate surveyid columns, but leave one
surveyid_series = joined_db['surveyid'].ix[:,0]
joined_db = joined_db.drop('surveyid',axis=1)
joined_db['surveyid'] = surveyid_series
#get rid of duplicate state columns, but leave one
state_series = joined_db['state'].ix[:,0]
joined_db = joined_db.drop('state',axis=1)
joined_db['state'] = state_series
#drop index
joined_db = joined_db.drop('index',axis=1)
#grab string features we want
string_features = joined_db[['surveyid','agency','city','state','zipcode']]
#drop those plust some we don't want
joined_db = joined_db.drop(['agencyname','state',
                             'resptype','formtype','city','agency',
                             'interpdesc','othpatdesc','othcpdesc',
                             'terrprepdesc','secgundesc','othimpdesc',
                             'othchemdesc','othactdesc','othaccdesc','ori',
                             'surveyid','zipcode'],axis=1)
#convert everything else to numeric features
joined_db = joined_db.apply(lambda x: pd.to_numeric(x, errors='ignore'))
#then put the numeric and desired string features back
joined_db = pd.concat([string_features,joined_db],axis=1)
#prettify the agency name
joined_db['agency'] = joined_db['agency'].map(str.title)
joined_db

Unnamed: 0,surveyid,agency,city,state,zipcode,stops_total,searches_total,hits_total,stops_white,searches_white,...,impdrugtask,impgangtask,imphumntask,impterrtask,impcell,csllea04_id,population,lpdsampgrp,finalwt_page1,finalwt_page2on
0,167,Bridgeport Police Department,Bridgeport,CT,6604,6553,700,69,3881,335,...,0,0,0,0,3,10032119,137655,100,1.049730,0.000000
1,1212,New London Police Department,New London,CT,6320,2188,197,68,1755,150,...,0,0,0,0,9,10032166,25890,1,2.207370,2.217590
2,1222,Westport Police Department,Westport,CT,6880,9648,336,128,8463,242,...,0,0,0,0,9,10032208,26704,1,2.207370,2.217590
3,184,Waterbury Police Department,Waterbury,CT,6702,2673,676,131,1832,417,...,0,0,0,0,3,10032202,107241,100,1.049730,1.059140
4,175,Hartford Police Department,Hartford,CT,6120,10980,182,64,6526,85,...,0,0,0,0,3,10032148,124558,100,1.049730,1.059140
5,1223,Wethersfield Police Department,Wethersfield,CT,6109,7707,467,186,6117,352,...,0,0,0,0,11,10032209,25977,2,3.490480,3.539640
6,1214,Plainfield Police Department,Plainfield,CT,6374,1764,34,10,1708,30,...,0,0,0,0,14,10032177,15538,4,7.480850,7.545060
7,170,Danbury Police Department,Danbury,CT,6810,7757,497,34,6992,445,...,0,0,0,0,5,10032128,79893,100,1.049730,1.059140
8,174,Hamden Police Department,Hamden,CT,6518,7772,149,20,4796,61,...,0,0,0,0,5,10032147,57982,100,1.049730,1.059140
9,179,New Britain Police Department,New Britain,CT,6051,10035,452,231,8020,330,...,0,0,0,0,5,10032161,70630,100,1.049730,1.059140


In [5]:
def clean_zipcode(zipcode):
    zipcode = str(int(zipcode))
    #should be five digits
    length = len(zipcode)
    missing = 5 - length
    zipcode = missing*"0" + zipcode
    return zipcode
joined_db['zipcode'] = joined_db['zipcode'].map(clean_zipcode)    

It seems there are a few duplicate departments for some reason. Just drop them for now

In [6]:
duplicates = []
for agency in joined_db['agency'].unique():
    departments = joined_db[joined_db['agency']==agency]
    if len(departments)>1:
        print agency,len(departments)
        duplicates.append(agency)

Avon Police Department 2
East St Louis Police Department 2
Quincy Police Department 2
East Hazel Crest Police Dept 3
Dekalb Police Dept 2
Dupage County Sheriffs Office 2
Union County Sheriffs Office 2
Cook County Sheriffs Office 3


In [7]:
joined_db = joined_db[joined_db['agency'].isin(duplicates)==False]
joined_db

Unnamed: 0,surveyid,agency,city,state,zipcode,stops_total,searches_total,hits_total,stops_white,searches_white,...,impdrugtask,impgangtask,imphumntask,impterrtask,impcell,csllea04_id,population,lpdsampgrp,finalwt_page1,finalwt_page2on
0,167,Bridgeport Police Department,Bridgeport,CT,06604,6553,700,69,3881,335,...,0,0,0,0,3,10032119,137655,100,1.049730,0.000000
1,1212,New London Police Department,New London,CT,06320,2188,197,68,1755,150,...,0,0,0,0,9,10032166,25890,1,2.207370,2.217590
2,1222,Westport Police Department,Westport,CT,06880,9648,336,128,8463,242,...,0,0,0,0,9,10032208,26704,1,2.207370,2.217590
3,184,Waterbury Police Department,Waterbury,CT,06702,2673,676,131,1832,417,...,0,0,0,0,3,10032202,107241,100,1.049730,1.059140
4,175,Hartford Police Department,Hartford,CT,06120,10980,182,64,6526,85,...,0,0,0,0,3,10032148,124558,100,1.049730,1.059140
5,1223,Wethersfield Police Department,Wethersfield,CT,06109,7707,467,186,6117,352,...,0,0,0,0,11,10032209,25977,2,3.490480,3.539640
6,1214,Plainfield Police Department,Plainfield,CT,06374,1764,34,10,1708,30,...,0,0,0,0,14,10032177,15538,4,7.480850,7.545060
7,170,Danbury Police Department,Danbury,CT,06810,7757,497,34,6992,445,...,0,0,0,0,5,10032128,79893,100,1.049730,1.059140
8,174,Hamden Police Department,Hamden,CT,06518,7772,149,20,4796,61,...,0,0,0,0,5,10032147,57982,100,1.049730,1.059140
9,179,New Britain Police Department,New Britain,CT,06051,10035,452,231,8020,330,...,0,0,0,0,5,10032161,70630,100,1.049730,1.059140


In [8]:
joined_db.to_sql('traffic_joined_with_features',engine,if_exists='replace')

## Also create joined dataset not including traffic

In [9]:
query="""
SELECT survey.*,census.*
FROM census_population_sum census
JOIN police_surveys survey ON census.surveyid = survey.surveyid;
"""
joined_no_traffic_db = pd.read_sql(query,con_comb)
#get rid of duplicate surveyid columns, but leave one
surveyid_series = joined_no_traffic_db['surveyid'].ix[:,0]
joined_no_traffic_db = joined_no_traffic_db.drop('surveyid',axis=1)
joined_no_traffic_db['surveyid'] = surveyid_series

#drop index
joined_no_traffic_db = joined_no_traffic_db.drop('index',axis=1)
#grab string features we want
string_features = joined_no_traffic_db[['surveyid','agency','city','state','zipcode']]
#drop those plust some we don't want
joined_no_traffic_db = joined_no_traffic_db.drop(['state',
                             'resptype','formtype','city','agency',
                             'interpdesc','othpatdesc','othcpdesc',
                             'terrprepdesc','secgundesc','othimpdesc',
                             'othchemdesc','othactdesc','othaccdesc','ori',
                             'surveyid','zipcode'],axis=1)
#convert everything else to numeric features
joined_no_traffic_db = joined_no_traffic_db.apply(lambda x: pd.to_numeric(x, errors='ignore'))
#then put the numeric and desired string features back
joined_no_traffic_db = pd.concat([string_features,joined_no_traffic_db],axis=1)
#prettify the agency name
joined_no_traffic_db['agency'] = joined_no_traffic_db['agency'].map(str.title)
joined_no_traffic_db

Unnamed: 0,surveyid,agency,city,state,zipcode,agcytype,swnauthemp,swnftemp,swnptemp,civftemp,...,population_white,population_black,population_native_american,population_asian,population_nathaw,population_other_race,population_mult_race,total_income_estimate_all,total_income_estimate_white,total_income_estimate_black
0,2,Anchorage Police Department,Anchorage,AK,99507,3,399,385,0,174,...,151441,14039,21549,22442,5724,6154,20688,88288,63011,5501
1,1001,Ketchikan Police Department,Ketchikan,AK,99901,3,23,23,0,10,...,9204,79,1912,943,27,93,1250,5305,4025,23
2,1003,Valdez Police Department,Valdez,AK,99686,3,11,11,0,9,...,3270,24,326,76,32,26,251,1301,1000,0
3,167,Bridgeport Police Department,Bridgeport,CT,6604,3,521,423,0,102,...,57059,49829,789,4917,151,25194,6256,50034,23757,17337
4,16,Tuscaloosa Police Department,Tuscaloosa,AL,35401,3,269,264,0,75,...,62416,45082,262,1883,23,1767,1221,39364,22646,15603
5,13,Mobile Police Department,Mobile,AL,36606,3,532,526,0,254,...,136887,121541,948,4901,141,2453,3841,104285,57283,43754
6,1004,Anniston Police Department,Anniston,AL,36202,3,92,81,0,33,...,48199,18934,282,618,28,1618,1139,27258,18677,7732
7,5,Birmingham Police Department,Birmingham,AL,35203,3,914,768,0,400,...,248229,216662,1109,10220,194,11592,5800,200127,105638,85888
8,15,Montgomery Police Department,Montgomery,AL,36101,3,510,491,0,97,...,81206,120864,535,4643,166,4545,2744,82722,34348,45439
9,7,Dothan Police Department,Dothan,AL,36303,3,157,148,0,100,...,53580,23271,297,764,45,1071,1418,31759,22364,8524


In [10]:
joined_no_traffic_db.to_sql('all_pd_joined_features',engine,if_exists='replace')