In [1]:
import pandas as pd
# Load the raw Vancouver crime records. The path is relative, so the CSV must
# sit in the directory the notebook kernel is running from.
vancouver_data = pd.read_csv("./vancouver-parent_data.csv")

# Quick sanity check of the load: show the first five rows.
vancouver_data.head()

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y
0,Break and Enter Commercial,2019,3,7,2,6,10XX SITKA SQ,Fairview,490612.964805,5457110.0
1,Break and Enter Commercial,2019,8,27,4,12,10XX ALBERNI ST,West End,491007.779775,5459174.0
2,Break and Enter Commercial,2017,11,14,20,0,10XX ALBERNI ST,West End,491051.085574,5459144.0
3,Break and Enter Commercial,2018,3,2,6,17,10XX ALBERNI ST,West End,491058.816893,5459123.0
4,Break and Enter Commercial,2015,2,4,20,53,10XX ALBERNI ST,West End,491067.645985,5459114.0


In [2]:
# Discard the date and location fields; only TYPE, HOUR and MINUTE are kept.
vancouver_data = vancouver_data.drop(
    columns=['YEAR', 'MONTH', 'DAY', 'HUNDRED_BLOCK', 'NEIGHBOURHOOD', 'X', 'Y']
)
vancouver_data.head()

Unnamed: 0,TYPE,HOUR,MINUTE
0,Break and Enter Commercial,2,6
1,Break and Enter Commercial,4,12
2,Break and Enter Commercial,20,0
3,Break and Enter Commercial,6,17
4,Break and Enter Commercial,20,53


In [3]:
# Lookup table that brings Denver and Vancouver crime labels to the same level
# of granularity: each raw label (key) is mapped to a shared bucket (value).
# Denver's richer categories are folded into buckets that also work for
# Vancouver.
#
# Bucket names must be spelled identically everywhere they appear, otherwise
# a typo silently creates an extra category.
crime_category_dict = {
    # Title-case keys: the labels found in the Vancouver TYPE column.
    'Break and Enter Commercial': 'Burglary',
    'Break and Enter Residential/Other': 'Burglary',
    'Homicide': 'Homicide',
    'Mischief': 'Mischief',
    'Offence Against a Person': 'Offence Against a Person',
    'Other Theft': 'Theft',
    'Theft from Vehicle': 'Theft from Vehicle',
    'Theft of Bicycle': 'Theft',  # assumption: bicycles are not expensive enough for their own bucket
    'Theft of Vehicle': 'Theft of Vehicle',
    'Vehicle Collision or Pedestrian Struck (with Fatality)': 'Traffic Accident',
    'Vehicle Collision or Pedestrian Struck (with Injury)': 'Traffic Accident',
    # Lower-case hyphenated keys: presumably the Denver categories (the Denver
    # data is not loaded in this notebook -- confirm against that dataset).
    'all-other-crimes': 'Other Crimes',
    'larceny': 'Theft',
    'theft-from-motor-vehicle': 'Theft from Vehicle',
    'traffic-accident': 'Traffic Accident',
    'drug-alcohol': 'Other Crimes',  # NOTE: not sure about this mapping -- revisit
    'auto-theft': 'Theft of Vehicle',
    'white-collar-crime': 'Other Crimes',  # e.g. Ponzi schemes, wage fraud
    'burglary': 'Burglary',
    'public-disorder': 'Mischief',
    'aggravated-assault': 'Offence Against a Person',
    'other-crimes-against-persons': 'Offence Against a Person',
    'robbery': 'Theft',
    'sexual-assault': 'Offence Against a Person',
    'murder': 'Homicide',
    'arson': 'Mischief',  # setting things on fire
}

In [4]:
# Translate each raw TYPE label into its shared bucket via crime_category_dict.
# A label that is missing from the dictionary raises KeyError instead of being
# silently skipped.
vancouver_data['NEW_TYPE'] = [
    crime_category_dict[raw_label] for raw_label in vancouver_data['TYPE']
]
vancouver_data.head()

Unnamed: 0,TYPE,HOUR,MINUTE,NEW_TYPE
0,Break and Enter Commercial,2,6,Burglary
1,Break and Enter Commercial,4,12,Burglary
2,Break and Enter Commercial,20,0,Burglary
3,Break and Enter Commercial,6,17,Burglary
4,Break and Enter Commercial,20,53,Burglary


In [5]:
import datetime

# Combine the HOUR and MINUTE columns into a single datetime.time per row.
# The two columns are walked in lockstep rather than with iterrows(), which
# constructs a full Series for every row and is far slower on large frames.
# int() is applied because datetime.time only accepts plain integers.
vancouver_data['Recorded_Time'] = [
    datetime.time(int(hour), int(minute))
    for hour, minute in zip(vancouver_data['HOUR'], vancouver_data['MINUTE'])
]

vancouver_data.head()

Unnamed: 0,TYPE,HOUR,MINUTE,NEW_TYPE,Recorded_Time
0,Break and Enter Commercial,2,6,Burglary,02:06:00
1,Break and Enter Commercial,4,12,Burglary,04:12:00
2,Break and Enter Commercial,20,0,Burglary,20:00:00
3,Break and Enter Commercial,6,17,Burglary,06:17:00
4,Break and Enter Commercial,20,53,Burglary,20:53:00


In [6]:
# HOUR and MINUTE are already captured in Recorded_Time, so remove them.
vancouver_data = vancouver_data.drop(['HOUR', 'MINUTE'], axis=1)

In [7]:
# Add a placeholder column that holds None for every row (presumably because
# this dataset has no separate start time -- confirm).
vancouver_data['Crime-start-time'] = None


In [8]:
# Add a second placeholder column, again None for every row.
vancouver_data['Crime-end-time'] = None

# Preview ten rows to check the new, empty columns.
vancouver_data.head(n=10)

Unnamed: 0,TYPE,NEW_TYPE,Recorded_Time,Crime-start-time,Crime-end-time
0,Break and Enter Commercial,Burglary,02:06:00,,
1,Break and Enter Commercial,Burglary,04:12:00,,
2,Break and Enter Commercial,Burglary,20:00:00,,
3,Break and Enter Commercial,Burglary,06:17:00,,
4,Break and Enter Commercial,Burglary,20:53:00,,
5,Break and Enter Commercial,Burglary,04:41:00,,
6,Break and Enter Commercial,Burglary,21:00:00,,
7,Break and Enter Commercial,Burglary,02:00:00,,
8,Break and Enter Commercial,Burglary,18:00:00,,
9,Break and Enter Commercial,Burglary,03:00:00,,


In [9]:
# List the distinct raw labels present in the TYPE column.
raw_type_labels = vancouver_data['TYPE'].unique()
print(raw_type_labels)

['Break and Enter Commercial' 'Break and Enter Residential/Other'
 'Homicide' 'Mischief' 'Offence Against a Person' 'Other Theft'
 'Theft from Vehicle' 'Theft of Bicycle' 'Theft of Vehicle'
 'Vehicle Collision or Pedestrian Struck (with Fatality)'
 'Vehicle Collision or Pedestrian Struck (with Injury)']


In [10]:
# Relabel Recorded_Time as Crime_report_time.
vancouver_data = vancouver_data.rename(columns={'Recorded_Time': 'Crime_report_time'})

In [11]:
# Remove the raw TYPE label (NEW_TYPE carries the bucketed value) and switch
# the remaining headers to lower snake_case.
final_column_names = {
    'NEW_TYPE': 'crime_type',
    'Crime_report_time': 'crime_report_time',
    'Crime-start-time': 'crime_start_time',
    'Crime-end-time': 'crime_end_time',
}
vancouver_data = vancouver_data.drop('TYPE', axis=1).rename(columns=final_column_names)

In [12]:
# Preview the first five rows of the frame.
vancouver_data.head(5)

Unnamed: 0,crime_type,crime_report_time,crime_start_time,crime_end_time
0,Burglary,02:06:00,,
1,Burglary,04:12:00,,
2,Burglary,20:00:00,,
3,Burglary,06:17:00,,
4,Burglary,20:53:00,,


In [13]:
# Give every row its own random (version 4) UUID, stored in the crime_key column.
import uuid

# One UUID per row. The throwaway loop variable is `_` so that no notebook-level
# name hides the built-in id() for later cells.
crime_key = [uuid.uuid4() for _ in range(len(vancouver_data))]

vancouver_data['crime_key'] = crime_key
vancouver_data.head()

Unnamed: 0,crime_type,crime_report_time,crime_start_time,crime_end_time,crime_key
0,Burglary,02:06:00,,,9473fc10-b616-4d33-ac4c-713de667df22
1,Burglary,04:12:00,,,c1f9bf44-cf15-4d74-87c5-9a4b407e5ad4
2,Burglary,20:00:00,,,3bbd5df5-623d-450f-bf5f-bc94bd2c788e
3,Burglary,06:17:00,,,fd376b38-38b0-4113-9543-1ebca04f39b2
4,Burglary,20:53:00,,,8b3562ca-f19e-4d75-8a2f-75501912f695


In [14]:
# Build the output column order: crime_key first, then every other column in
# its existing order. The key is picked by name rather than by position, so
# the ordering stays correct if the number of columns changes.
cols = vancouver_data.columns.tolist()
new_cols = ['crime_key'] + [col for col in cols if col != 'crime_key']
print(new_cols)

['crime_key', 'crime_type', 'crime_report_time', 'crime_start_time', 'crime_end_time']


In [15]:
# Rearrange the frame so its columns follow the order held in new_cols.
vancouver_data = vancouver_data.loc[:, new_cols]

vancouver_data.head(5)

Unnamed: 0,crime_key,crime_type,crime_report_time,crime_start_time,crime_end_time
0,9473fc10-b616-4d33-ac4c-713de667df22,Burglary,02:06:00,,
1,c1f9bf44-cf15-4d74-87c5-9a4b407e5ad4,Burglary,04:12:00,,
2,3bbd5df5-623d-450f-bf5f-bc94bd2c788e,Burglary,20:00:00,,
3,fd376b38-38b0-4113-9543-1ebca04f39b2,Burglary,06:17:00,,
4,8b3562ca-f19e-4d75-8a2f-75501912f695,Burglary,20:53:00,,


In [16]:
# Write the cleaned table to disk. index=False (the documented boolean form)
# leaves the row index out of the file; header=True writes the column names
# as the first line.
vancouver_data.to_csv(r'./vancouver-crime.csv', index=False, header=True)