# Produce combined table with features and labels

In [None]:
from glob import glob
import sets
import datetime
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
import numpy as np
from collections import defaultdict

In [None]:
# Settings for the local postgres database that holds the combined tables.
# NOTE(review): the credentials are hard-coded in this cell.
dbname = 'combined_profiling'
username = 'along528'
pswd = 'password'
# 'engine' is the sqlalchemy connection handle for that database.
# sqlalchemy can talk to other backends too; postgres is the one used here.
engine = create_engine(
    'postgresql://%s:%s@localhost/%s' % (username, pswd, dbname)
)
# Show the connection URL that was built.
print(engine.url)

In [None]:
# Open a raw psycopg2 connection to the same local postgres database.
dbname = 'combined_profiling'
username = 'along528'
pswd = 'password'
# Pre-bind the name so it exists (as None) even if connecting raises.
con_comb = None
con_comb = psycopg2.connect(
    host='localhost',
    database=dbname,
    user=username,
    password=pswd,
)

In [None]:
# Build the combined table: traffic-stop rows joined with the census and
# police-survey rows that share the same surveyid.
query="""
SELECT traffic.*, census.*,survey.*
FROM combined_traffic_stops_integrate_year_surveyid traffic
JOIN census_population_sum census ON traffic.surveyid = census.surveyid
JOIN police_surveys survey ON census.surveyid = survey.surveyid;
"""
joined_db = pd.read_sql(query,con_comb)
# The SELECT pulls every column of each joined table, so 'surveyid' comes back
# more than once and joined_db['surveyid'] is a DataFrame. Keep the first copy.
# .iloc does the positional pick; .ix is deprecated and was removed in
# pandas 1.0.
surveyid_series = joined_db['surveyid'].iloc[:,0]
joined_db = joined_db.drop('surveyid',axis=1)
joined_db['surveyid'] = surveyid_series
# Same treatment for the repeated 'state' columns: keep the first copy only.
state_series = joined_db['state'].iloc[:,0]
joined_db = joined_db.drop('state',axis=1)
joined_db['state'] = state_series
# Drop the 'index' column(s) carried over from the source tables.
joined_db = joined_db.drop('index',axis=1)
# Set aside the string features we want to keep as strings.
string_features = joined_db[['surveyid','agency','city','state','zipcode']]
# Drop those plus some columns we don't want at all.
joined_db = joined_db.drop(['agencyname','state',
                             'resptype','formtype','city','agency',
                             'interpdesc','othpatdesc','othcpdesc',
                             'terrprepdesc','secgundesc','othimpdesc',
                             'othchemdesc','othactdesc','othaccdesc','ori',
                             'surveyid','zipcode'],axis=1)
# Convert every remaining column to numeric where possible; with
# errors='ignore' a column that cannot be parsed is returned unchanged.
joined_db = joined_db.apply(lambda x: pd.to_numeric(x, errors='ignore'))
# Put the string features back in front of the numeric ones.
joined_db = pd.concat([string_features,joined_db],axis=1)
# Prettify the agency name (title case).
joined_db['agency'] = joined_db['agency'].map(str.title)
joined_db

In [None]:
def clean_zipcode(zipcode):
    """Return ``zipcode`` as a string left-padded with zeros to five characters.

    The value is first passed through ``int``, so ``2138``, ``2138.0`` and
    ``"2138"`` all give ``"02138"``. Values whose integer form is already
    five or more characters long are returned without padding.
    """
    return str(int(zipcode)).rjust(5, "0")
# Normalise every zipcode in the table with clean_zipcode.
joined_db['zipcode'] = joined_db['zipcode'].apply(clean_zipcode)

It seems a few departments appear more than once for some reason. For now, drop every row that belongs to a duplicated agency.

In [None]:
# Collect every agency name that occurs on more than one row of joined_db,
# printing each such name together with its row count.
duplicates = []
agency_column = joined_db['agency']
for name in agency_column.unique():
    n_rows = len(joined_db[agency_column == name])
    if n_rows <= 1:
        continue
    print('%s %s' % (name, n_rows))
    duplicates.append(name)

In [None]:
# Keep only the rows whose agency is not listed in `duplicates`.
is_duplicate = joined_db['agency'].isin(duplicates)
joined_db = joined_db[~is_duplicate]
joined_db

In [None]:
# Write the combined table to the database, replacing the table if it exists.
joined_db.to_sql(
    name='traffic_joined_with_features',
    con=engine,
    if_exists='replace',
)

## Also create a joined dataset that does not include the traffic data

In [None]:
# Build the table without traffic data: census rows joined with the
# police-survey rows that share the same surveyid.
query="""
SELECT survey.*,census.*
FROM census_population_sum census
JOIN police_surveys survey ON census.surveyid = survey.surveyid;
"""
joined_no_traffic_db = pd.read_sql(query,con_comb)
# Both tables contribute a 'surveyid' column, so selecting it yields a
# DataFrame. Keep the first copy only.
# .iloc does the positional pick; .ix is deprecated and was removed in
# pandas 1.0.
surveyid_series = joined_no_traffic_db['surveyid'].iloc[:,0]
joined_no_traffic_db = joined_no_traffic_db.drop('surveyid',axis=1)
joined_no_traffic_db['surveyid'] = surveyid_series

# Drop the 'index' column(s) carried over from the source tables.
joined_no_traffic_db = joined_no_traffic_db.drop('index',axis=1)
# Set aside the string features we want to keep as strings.
string_features = joined_no_traffic_db[['surveyid','agency','city','state','zipcode']]
# Drop those plus some columns we don't want at all.
joined_no_traffic_db = joined_no_traffic_db.drop(['state',
                             'resptype','formtype','city','agency',
                             'interpdesc','othpatdesc','othcpdesc',
                             'terrprepdesc','secgundesc','othimpdesc',
                             'othchemdesc','othactdesc','othaccdesc','ori',
                             'surveyid','zipcode'],axis=1)
# Convert every remaining column to numeric where possible; with
# errors='ignore' a column that cannot be parsed is returned unchanged.
joined_no_traffic_db = joined_no_traffic_db.apply(lambda x: pd.to_numeric(x, errors='ignore'))
# Put the string features back in front of the numeric ones.
joined_no_traffic_db = pd.concat([string_features,joined_no_traffic_db],axis=1)
# Prettify the agency name (title case).
joined_no_traffic_db['agency'] = joined_no_traffic_db['agency'].map(str.title)
joined_no_traffic_db

In [None]:
# Write the no-traffic table to the database, replacing the table if it exists.
joined_no_traffic_db.to_sql(
    name='all_pd_joined_features',
    con=engine,
    if_exists='replace',
)

In [None]:
# Check whether the string '999999' occurs among the 'chiefmin' values.
# NOTE(review): presumably '999999' is a missing-value placeholder in the
# survey data - confirm. The columns were passed through pd.to_numeric
# earlier, so verify that 'chiefmin' still holds strings at this point.
'999999'  in joined_no_traffic_db['chiefmin'].values
