# Crime data prep **Part 1**:
---
- Cleaning records from source
- Refactoring
- Standardizing county names
- Generating new columns based on date (day of week, quarter, year)
- Combining the 97-15 data and 16-19 data

In [1]:
import crime as cr
import pandas as pd, numpy as np

In [2]:
display(cr.load('crime_16_19', full=True))
display(cr.load('crime_97_15', full=True))

Unnamed: 0,pub_agency_name,county_name,incident_date,incident_hour,offense_name,crime_against,offense_category_name,offense_group,age_num
0,Westminster,JEFFERSON; ADAMS,2017-08-26,17.0,Aggravated Assault,Person,Assault Offenses,A,52.0
1,Westminster,JEFFERSON; ADAMS,2017-11-22,20.0,Aggravated Assault,Person,Assault Offenses,A,29.0
2,Westminster,JEFFERSON; ADAMS,2017-12-28,21.0,Motor Vehicle Theft,Property,Motor Vehicle Theft,A,
3,Del Norte,RIO GRANDE,2017-01-03,11.0,Destruction/Damage/Vandalism of Property,Property,Destruction/Damage/Vandalism of Property,A,
4,Thornton,ADAMS,2017-01-16,20.0,Destruction/Damage/Vandalism of Property,Property,Destruction/Damage/Vandalism of Property,A,24.0
...,...,...,...,...,...,...,...,...,...
1851991,Boulder,BOULDER,2019-10-16,21.0,All Other Larceny,Property,Larceny/Theft Offenses,A,
1851992,Boulder,BOULDER,2019-10-16,21.0,All Other Larceny,Property,Larceny/Theft Offenses,A,
1851993,Arapahoe,ARAPAHOE,2019-06-01,18.0,Shoplifting,Property,Larceny/Theft Offenses,A,20.0
1851994,Thornton,ADAMS,2019-01-21,12.0,Destruction/Damage/Vandalism of Property,Property,Destruction/Damage/Vandalism of Property,A,15.0


Unnamed: 0,agency_name,agency_type_name,city_name,primary_county,offense_name,crime_against,offense_category_name,age_num,incident_date,incident_hour
0,Lyons Police Department,City,Lyons,Boulder,,,,,NaT,
1,Kremmling Police Department,City,Kremmling,Grand,,,,,NaT,
2,Oak Creek Police Department,City,Oak Creek,Routt,,,,,NaT,
3,Ault Police Department,City,Ault,Weld,,,,,NaT,
4,Romeo Police Department,City,Romeo,Conejos,,,,,NaT,
...,...,...,...,...,...,...,...,...,...,...
4952277,Gypsum Police Department,City,Gypsum,Eagle,Simple Assault,Person,Assault Offenses,44.0,2015-11-15,11.0
4952278,Gypsum Police Department,City,Gypsum,Eagle,Weapon Law Violations,Society,Weapon Law Violations,15.0,2015-11-22,15.0
4952279,All Crimes Enforcement Team,Other,Viola,Moffat,Drug/Narcotic Violations,Society,Drug/Narcotic Offenses,17.0,2015-11-03,21.0
4952280,All Crimes Enforcement Team,Other,Viola,Moffat,Drug/Narcotic Violations,Society,Drug/Narcotic Offenses,20.0,2015-11-03,21.0


### Crime 16-19

In [3]:
df1 = cr.df('crime_16_19')

# remove State Patrol and CBI crimes
df1 = df1.loc[ ~ df1.pub_agency_name.isin(['State Patrol', 'Colorado Bureau of Investigation'])]

# Drop rows where pub agency name is null
df1 = df1[ ~ df1.pub_agency_name.isna()]

# Since we're focused on county and not police department, replace dual county
# police department county values (Ex: "JEFFERSON; ADAMS") with just the primary (Ex: "JEFFERSON")
df1.county_name = df1.county_name.str.split('; ').str[0]

# Change county name from uppercase to title case
df1.county_name = df1.county_name.str.title()

# Rename county column
df1 = df1.rename(columns={
        'county_name':      'county', 
        'pub_agency_name':  'police_dept',
        'incident_date':    'date',
        'incident_hour':    'hour',
        'age_num':          'age',
        'offense_category_name': 'offense_category',
    })

df1.county = df1.county.str.upper()
df1['year'] = pd.DatetimeIndex(df1.date).year
df1['day_of_week'] = pd.DatetimeIndex(df1.date).day_of_week
df1['month'] = pd.DatetimeIndex(df1.date).month
df1['quarter'] = pd.DatetimeIndex(df1.date).quarter

# Select only needed columns
df1 = df1[[
    'year', 'county', 'police_dept', 'date', 'quarter', 'month', 'day_of_week', 'hour',
    'age', 'crime_against', 'offense_name', 'offense_category'
    ]]

df1

Unnamed: 0,year,county,police_dept,date,quarter,month,day_of_week,hour,age,crime_against,offense_name,offense_category
0,2017,JEFFERSON,Westminster,2017-08-26,3,8,5,17.0,52.0,Person,Aggravated Assault,Assault Offenses
1,2017,JEFFERSON,Westminster,2017-11-22,4,11,2,20.0,29.0,Person,Aggravated Assault,Assault Offenses
2,2017,JEFFERSON,Westminster,2017-12-28,4,12,3,21.0,,Property,Motor Vehicle Theft,Motor Vehicle Theft
3,2017,RIO GRANDE,Del Norte,2017-01-03,1,1,1,11.0,,Property,Destruction/Damage/Vandalism of Property,Destruction/Damage/Vandalism of Property
4,2017,ADAMS,Thornton,2017-01-16,1,1,0,20.0,24.0,Property,Destruction/Damage/Vandalism of Property,Destruction/Damage/Vandalism of Property
...,...,...,...,...,...,...,...,...,...,...,...,...
1851991,2019,BOULDER,Boulder,2019-10-16,4,10,2,21.0,,Property,All Other Larceny,Larceny/Theft Offenses
1851992,2019,BOULDER,Boulder,2019-10-16,4,10,2,21.0,,Property,All Other Larceny,Larceny/Theft Offenses
1851993,2019,ARAPAHOE,Arapahoe,2019-06-01,2,6,5,18.0,20.0,Property,Shoplifting,Larceny/Theft Offenses
1851994,2019,ADAMS,Thornton,2019-01-21,1,1,0,12.0,15.0,Property,Destruction/Damage/Vandalism of Property,Destruction/Damage/Vandalism of Property


### Crime 97-15

In [4]:
df2 = cr.df('crime_97_15')

# All the rows with null dates appear to be bullshit records and can be removed
df2 = df2[ ~ df2.incident_date.isna()]

# Remove State Patrol and CBI crimes
df2 = df2.loc[ ~ df2.agency_name.isin(['State Patrol', 'Colorado Bureau of Investigation'])]

# Rename county column
df2 = df2.rename(columns={
        'primary_county':   'county',
        'agency_name':      'police_dept',
        'incident_date':    'date',
        'incident_hour':    'hour',
        'age_num':          'age',
        'offense_category_name': 'offense_category',
    })


df2.county = df2.county.str.upper()
df2['year'] = pd.DatetimeIndex(df2.date).year
df2['day_of_week'] = pd.DatetimeIndex(df2.date).day_of_week
df2['month'] = pd.DatetimeIndex(df2.date).month
df2['quarter'] = pd.DatetimeIndex(df2.date).quarter

# Select only needed columns
df2 = df2[[
    'year', 'county', 'police_dept', 'date', 'quarter', 'month', 'day_of_week', 'hour',
    'age', 'crime_against', 'offense_name', 'offense_category']]

df2

Unnamed: 0,year,county,police_dept,date,quarter,month,day_of_week,hour,age,crime_against,offense_name,offense_category
24,1997,BOULDER,Longmont Police Department,1997-03-14,1,3,4,,15.0,Person,Fondling,Sex Offenses
25,1997,BOULDER,Longmont Police Department,1997-07-02,3,7,2,21.0,14.0,Property,Arson,Arson
26,1997,KIT CARSON,Kit Carson County Sheriff's Office,1997-01-20,1,1,0,22.0,58.0,Person,Simple Assault,Assault Offenses
27,1997,KIT CARSON,Kit Carson County Sheriff's Office,1997-01-18,1,1,5,,21.0,Property,All Other Larceny,Larceny/Theft Offenses
28,1997,KIT CARSON,Kit Carson County Sheriff's Office,1997-03-31,1,3,0,,,Property,Destruction/Damage/Vandalism of Property,Destruction/Damage/Vandalism of Property
...,...,...,...,...,...,...,...,...,...,...,...,...
4952277,2015,EAGLE,Gypsum Police Department,2015-11-15,4,11,6,11.0,44.0,Person,Simple Assault,Assault Offenses
4952278,2015,EAGLE,Gypsum Police Department,2015-11-22,4,11,6,15.0,15.0,Society,Weapon Law Violations,Weapon Law Violations
4952279,2015,MOFFAT,All Crimes Enforcement Team,2015-11-03,4,11,1,21.0,17.0,Society,Drug/Narcotic Violations,Drug/Narcotic Offenses
4952280,2015,MOFFAT,All Crimes Enforcement Team,2015-11-03,4,11,1,21.0,20.0,Society,Drug/Narcotic Violations,Drug/Narcotic Offenses


### Save

In [5]:
# Save em
df1.to_csv('output/16_19.csv', index=False)
df2.to_csv('output/97_15.csv', index=False)

## Combine into single dataset
---

In [6]:
df = pd.concat([df2, df1])
df = df.drop(columns='police_dept')

In [7]:
df

Unnamed: 0,year,county,date,quarter,month,day_of_week,hour,age,crime_against,offense_name,offense_category
24,1997,BOULDER,1997-03-14,1,3,4,,15.0,Person,Fondling,Sex Offenses
25,1997,BOULDER,1997-07-02,3,7,2,21.0,14.0,Property,Arson,Arson
26,1997,KIT CARSON,1997-01-20,1,1,0,22.0,58.0,Person,Simple Assault,Assault Offenses
27,1997,KIT CARSON,1997-01-18,1,1,5,,21.0,Property,All Other Larceny,Larceny/Theft Offenses
28,1997,KIT CARSON,1997-03-31,1,3,0,,,Property,Destruction/Damage/Vandalism of Property,Destruction/Damage/Vandalism of Property
...,...,...,...,...,...,...,...,...,...,...,...
1851991,2019,BOULDER,2019-10-16,4,10,2,21.0,,Property,All Other Larceny,Larceny/Theft Offenses
1851992,2019,BOULDER,2019-10-16,4,10,2,21.0,,Property,All Other Larceny,Larceny/Theft Offenses
1851993,2019,ARAPAHOE,2019-06-01,2,6,5,18.0,20.0,Property,Shoplifting,Larceny/Theft Offenses
1851994,2019,ADAMS,2019-01-21,1,1,0,12.0,15.0,Property,Destruction/Damage/Vandalism of Property,Destruction/Damage/Vandalism of Property


In [8]:
df.to_csv('output/all.csv', index=False)