# Produce combined table with features and labels

In [1]:
from glob import glob
import sets
import datetime
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
import numpy as np
from collections import defaultdict

  from ipykernel import kernelapp as app


In [2]:
dbname = 'combined_profiling'
username = 'along528'
pswd = 'password'
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print engine.url

postgresql://along528:password@localhost/combined_profiling


In [3]:
dbname = 'combined_profiling'
username = 'along528'
pswd = 'password'
con_comb = None
con_comb = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

In [6]:
query="""
SELECT traffic.*, census.*,survey.*,crimes.*
FROM combined_traffic_stops_integrate_year_surveyid traffic
JOIN census_population_sum census ON traffic.surveyid = census.surveyid
JOIN police_surveys survey ON census.surveyid = survey.surveyid
JOIN crimes ON survey.surveyid = crimes.surveyid;
"""
joined_db = pd.read_sql(query,con_comb)
joined_db

Unnamed: 0,index,agencyname,stops_total,searches_total,hits_total,stops_white,searches_white,hits_white,stops_black,searches_black,...,violent_crime_total,murder_and_nonnegligent_manslaughter,forcible_rape,robbery,aggravated_assault,property_crime_total,burglary,larceny_theft,motor_vehicle_theft,surveyid
0,1144,Alamance County Sheriff's Office,32934,4489,1421,22173,2721,803,10007,1596,...,116,1,7,8,100,1033,429,526,78,562
1,1110,Asheboro Police Department,52566,2261,760,43914,1614,458,7476,441,...,77,1,13,27,36,1798,404,1342,52,2095
2,1132,Asheville Police Department,114953,9278,2801,91816,5973,1759,20944,2939,...,423,6,33,141,243,4910,742,3832,336,563
3,1172,Brunswick County Sheriff's Office,18665,2324,1012,13128,1264,405,3850,528,...,108,1,20,17,70,1825,684,1052,89,2103
4,1133,Burlington Police Department,110905,7487,2295,66999,3656,844,40326,3092,...,406,2,17,105,282,2792,671,1998,123,565
5,1156,Cabarrus County Sheriff's Office,27334,1653,783,20355,909,267,5196,227,...,39,3,7,9,20,910,343,536,31,566
6,1194,Carteret County Sheriff's Office,1806,543,477,1191,89,34,137,12,...,87,3,20,6,58,774,246,480,48,2106
7,1159,Cary Police Department,237210,4655,1636,170267,2845,658,43728,1005,...,115,0,11,29,75,1909,370,1476,63,567
8,1113,Catawba County Sheriff's Office,13218,1712,958,10322,781,187,1339,140,...,132,9,7,22,94,1803,737,991,75,568
9,1189,Chapel Hill Police Department,82922,3723,1538,56579,1705,420,20138,1251,...,87,2,10,30,45,1493,452,969,72,569


In [9]:
joined_db['agency']

Unnamed: 0,agency,agency.1
0,ALAMANCE COUNTY SHERIFF'S OFFICE,Alamance County Sheriff Department
1,ASHEBORO POLICE DEPARTMENT,Asheboro Police Dept
2,ASHEVILLE POLICE DEPARTMENT,Asheville Police Dept
3,BRUNSWICK COUNTY SHERIFFS OFFICE,Brunswick County Sheriff Department
4,BURLINGTON POLICE DEPARTMENT,Burlington Police Dept
5,CABARRUS COUNTY SHERIFFS OFFICE,Cabarrus County Sheriff Office
6,CARTERET COUNTY SHERIFF,Carteret County Sheriff Office
7,CARY POLICE DEPARTMENT,Cary Police Dept
8,CATAWBA COUNTY SHERIFF OFFICE,Catawba County Sheriff Department
9,CHAPEL HILL POLICE DEPARTMENT,Chapel Hill Police Dept


In [10]:
query="""
SELECT traffic.*, census.*,survey.*,crimes.*
FROM combined_traffic_stops_integrate_year_surveyid traffic
JOIN census_population_sum census ON traffic.surveyid = census.surveyid
JOIN police_surveys survey ON census.surveyid = survey.surveyid
JOIN crimes ON survey.surveyid = crimes.surveyid;
"""
joined_db = pd.read_sql(query,con_comb)
#get rid of duplicate agency columns, but leave one
agency_series = joined_db['agency'].ix[:,0]
joined_db = joined_db.drop('agency',axis=1)
joined_db['agency'] = agency_series
#get rid of duplicate surveyid columns, but leave one
surveyid_series = joined_db['surveyid'].ix[:,0]
joined_db = joined_db.drop('surveyid',axis=1)
joined_db['surveyid'] = surveyid_series
#get rid of duplicate state columns, but leave one
state_series = joined_db['state'].ix[:,0]
joined_db = joined_db.drop('state',axis=1)
joined_db['state'] = state_series
#drop index
joined_db = joined_db.drop('index',axis=1)
#grab string features we want
string_features = joined_db[['surveyid','agency','city','state','zipcode']]
#drop those plust some we don't want
joined_db = joined_db.drop(['agencyname','state',
                             'resptype','formtype','city','agency',
                             'interpdesc','othpatdesc','othcpdesc',
                             'terrprepdesc','secgundesc','othimpdesc',
                             'othchemdesc','othactdesc','othaccdesc','ori',
                             'surveyid','zipcode'],axis=1)
#convert everything else to numeric features
joined_db = joined_db.apply(lambda x: pd.to_numeric(x, errors='ignore'))
#then put the numeric and desired string features back
joined_db = pd.concat([string_features,joined_db],axis=1)
#prettify the agency name
joined_db['agency'] = joined_db['agency'].map(str.title)
joined_db

Unnamed: 0,surveyid,agency,city,state,zipcode,stops_total,searches_total,hits_total,stops_white,searches_white,...,finalwt_page2on,violent_crime_total,murder_and_nonnegligent_manslaughter,forcible_rape,robbery,aggravated_assault,property_crime_total,burglary,larceny_theft,motor_vehicle_theft
0,562,Alamance County Sheriff'S Office,Graham,NC,27253,32934,4489,1421,22173,2721,...,1.123188,116,1,7,8,100,1033,429,526,78
1,2095,Asheboro Police Department,Asheboro,NC,27204,52566,2261,760,43914,1614,...,2.217590,77,1,13,27,36,1798,404,1342,52
2,563,Asheville Police Department,Asheville,NC,28802,114953,9278,2801,91816,5973,...,1.059140,423,6,33,141,243,4910,742,3832,336
3,2103,Brunswick County Sheriffs Office,Bolivia,NC,28422,18665,2324,1012,13128,1264,...,4.903811,108,1,20,17,70,1825,684,1052,89
4,565,Burlington Police Department,Burlington,NC,27216,110905,7487,2295,66999,3656,...,1.059140,406,2,17,105,282,2792,671,1998,123
5,566,Cabarrus County Sheriffs Office,Concord,NC,28026,27334,1653,783,20355,909,...,1.123188,39,3,7,9,20,910,343,536,31
6,2106,Carteret County Sheriff,Beaufort,NC,28516,1806,543,477,1191,89,...,4.903811,87,3,20,6,58,774,246,480,48
7,567,Cary Police Department,Cary,NC,27512,237210,4655,1636,170267,2845,...,1.059140,115,0,11,29,75,1909,370,1476,63
8,568,Catawba County Sheriff Office,Newton,NC,28658,13218,1712,958,10322,781,...,1.123188,132,9,7,22,94,1803,737,991,75
9,569,Chapel Hill Police Department,Chapel Hill,NC,27514,82922,3723,1538,56579,1705,...,1.059140,87,2,10,30,45,1493,452,969,72


In [11]:
def clean_zipcode(zipcode):
    zipcode = str(int(zipcode))
    #should be five digits
    length = len(zipcode)
    missing = 5 - length
    zipcode = missing*"0" + zipcode
    return zipcode
joined_db['zipcode'] = joined_db['zipcode'].map(clean_zipcode)    

It seems there are a few duplicate departments for some reason. Just drop them for now

In [12]:
duplicates = []
for agency in joined_db['agency'].unique():
    departments = joined_db[joined_db['agency']==agency]
    if len(departments)>1:
        print agency,len(departments)
        duplicates.append(agency)

Cook County Sheriffs Office 3
Dekalb Police Dept 2
Dupage County Sheriffs Office 2
East St Louis Police Department 2
Quincy Police Department 2


In [13]:
joined_db = joined_db[joined_db['agency'].isin(duplicates)==False]
joined_db

Unnamed: 0,surveyid,agency,city,state,zipcode,stops_total,searches_total,hits_total,stops_white,searches_white,...,finalwt_page2on,violent_crime_total,murder_and_nonnegligent_manslaughter,forcible_rape,robbery,aggravated_assault,property_crime_total,burglary,larceny_theft,motor_vehicle_theft
0,562,Alamance County Sheriff'S Office,Graham,NC,27253,32934,4489,1421,22173,2721,...,1.123188,116,1,7,8,100,1033,429,526,78
1,2095,Asheboro Police Department,Asheboro,NC,27204,52566,2261,760,43914,1614,...,2.217590,77,1,13,27,36,1798,404,1342,52
2,563,Asheville Police Department,Asheville,NC,28802,114953,9278,2801,91816,5973,...,1.059140,423,6,33,141,243,4910,742,3832,336
3,2103,Brunswick County Sheriffs Office,Bolivia,NC,28422,18665,2324,1012,13128,1264,...,4.903811,108,1,20,17,70,1825,684,1052,89
4,565,Burlington Police Department,Burlington,NC,27216,110905,7487,2295,66999,3656,...,1.059140,406,2,17,105,282,2792,671,1998,123
5,566,Cabarrus County Sheriffs Office,Concord,NC,28026,27334,1653,783,20355,909,...,1.123188,39,3,7,9,20,910,343,536,31
6,2106,Carteret County Sheriff,Beaufort,NC,28516,1806,543,477,1191,89,...,4.903811,87,3,20,6,58,774,246,480,48
7,567,Cary Police Department,Cary,NC,27512,237210,4655,1636,170267,2845,...,1.059140,115,0,11,29,75,1909,370,1476,63
8,568,Catawba County Sheriff Office,Newton,NC,28658,13218,1712,958,10322,781,...,1.123188,132,9,7,22,94,1803,737,991,75
9,569,Chapel Hill Police Department,Chapel Hill,NC,27514,82922,3723,1538,56579,1705,...,1.059140,87,2,10,30,45,1493,452,969,72


In [14]:
joined_db.to_sql('traffic_joined_with_features_plus_crimes',engine,if_exists='replace')

## Also create joined dataset not including traffic
can't do this currently since crime data is only for traffic states

In [None]:
query="""
SELECT survey.*,census.*
FROM census_population_sum census
JOIN police_surveys survey ON census.surveyid = survey.surveyid;
"""
joined_no_traffic_db = pd.read_sql(query,con_comb)
#get rid of duplicate surveyid columns, but leave one
surveyid_series = joined_no_traffic_db['surveyid'].ix[:,0]
joined_no_traffic_db = joined_no_traffic_db.drop('surveyid',axis=1)
joined_no_traffic_db['surveyid'] = surveyid_series

#drop index
joined_no_traffic_db = joined_no_traffic_db.drop('index',axis=1)
#grab string features we want
string_features = joined_no_traffic_db[['surveyid','agency','city','state','zipcode']]
#drop those plust some we don't want
joined_no_traffic_db = joined_no_traffic_db.drop(['state',
                             'resptype','formtype','city','agency',
                             'interpdesc','othpatdesc','othcpdesc',
                             'terrprepdesc','secgundesc','othimpdesc',
                             'othchemdesc','othactdesc','othaccdesc','ori',
                             'surveyid','zipcode'],axis=1)
#convert everything else to numeric features
joined_no_traffic_db = joined_no_traffic_db.apply(lambda x: pd.to_numeric(x, errors='ignore'))
#then put the numeric and desired string features back
joined_no_traffic_db = pd.concat([string_features,joined_no_traffic_db],axis=1)
#prettify the agency name
joined_no_traffic_db['agency'] = joined_no_traffic_db['agency'].map(str.title)
joined_no_traffic_db

In [None]:
joined_no_traffic_db.to_sql('all_pd_joined_features',engine,if_exists='replace')