In [1]:
import pandas as pd
# Load the Vancouver crime records from the CSV file.
# The relative path is resolved against the working directory of the
# Python process running this notebook.
vancouver_data = pd.read_csv(filepath_or_buffer="./vancouver-parent_data.csv")

# Display the first five rows to check that the file was parsed as expected.
vancouver_data.head(n=5)

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y
0,Break and Enter Commercial,2012,12,14,8,52,,Oakridge,491285.0,5453433.0
1,Break and Enter Commercial,2019,3,7,2,6,10XX SITKA SQ,Fairview,490612.964805,5457110.0
2,Break and Enter Commercial,2019,8,27,4,12,10XX ALBERNI ST,West End,491007.779775,5459174.0
3,Break and Enter Commercial,2014,8,8,5,13,10XX ALBERNI ST,West End,491015.943352,5459166.0
4,Break and Enter Commercial,2005,11,14,3,9,10XX ALBERNI ST,West End,491021.385727,5459161.0


In [2]:
# Drop the columns that are not needed from the dataset: the date parts,
# the address/neighbourhood fields and the X/Y coordinates.
vancouver_data = vancouver_data.drop(
    columns=['YEAR', 'MONTH', 'DAY', 'HUNDRED_BLOCK', 'NEIGHBOURHOOD', 'X', 'Y']
)
vancouver_data.head(n=5)

Unnamed: 0,TYPE,HOUR,MINUTE
0,Break and Enter Commercial,8,52
1,Break and Enter Commercial,2,6
2,Break and Enter Commercial,4,12
3,Break and Enter Commercial,5,13
4,Break and Enter Commercial,3,9


In [3]:
# Lookup table that brings the Denver and Vancouver crime labels to the same
# level of granularity: each raw label (key) maps to a shared bucket (value).
# The title-case keys are the Vancouver TYPE values; the lower-case hyphenated
# keys are presumably the richer Denver categories (confirm against the Denver
# notebook), folded into buckets that also work for Vancouver.
# Bucket names must match exactly (including whitespace), otherwise the same
# logical category is split into two distinct values.
crime_category_dict = {
                       'Break and Enter Commercial': 'Burglary',
                       'Break and Enter Residential/Other': 'Burglary',
                       'Homicide': 'Homicide',
                       'Mischief': 'Mischief',
                       'Offence Against a Person': 'Offence Against a Person',
                       'Other Theft': 'Theft',
                       'Theft from Vehicle': 'Theft from Vehicle',
                       'Theft of Bicycle': 'Theft',  # assumption: bicycles are not expensive enough
                       'Theft of Vehicle': 'Theft of Vehicle',
                       'Vehicle Collision or Pedestrian Struck (with Fatality)': 'Traffic Accident',
                       'Vehicle Collision or Pedestrian Struck (with Injury)': 'Traffic Accident',
                       'all-other-crimes': 'Other Crimes',
                       'larceny': 'Theft',
                       'theft-from-motor-vehicle': 'Theft from Vehicle',
                       'traffic-accident': 'Traffic Accident',
                       'drug-alcohol': 'Other Crimes',  # not sure about this one
                       'auto-theft': 'Theft of Vehicle',
                       'white-collar-crime': 'Other Crimes',  # e.g. Ponzi scheme, wage fraud
                       'burglary': 'Burglary',
                       'public-disorder': 'Mischief',
                       'aggravated-assault': 'Offence Against a Person',
                       'other-crimes-against-persons': 'Offence Against a Person',
                       'robbery': 'Theft',
                       'sexual-assault': 'Offence Against a Person',
                       'murder': 'Homicide',
                       'arson': 'Mischief'  # setting things on fire
                      }

In [4]:
# Translate each raw TYPE label into its bucket from crime_category_dict.
# Plain dict indexing is used, so a label without an entry raises KeyError.
new_type = [crime_category_dict[raw_label] for raw_label in vancouver_data['TYPE']]
vancouver_data['NEW_TYPE'] = new_type
vancouver_data.head(n=5)

Unnamed: 0,TYPE,HOUR,MINUTE,NEW_TYPE
0,Break and Enter Commercial,8,52,Burglary
1,Break and Enter Commercial,2,6,Burglary
2,Break and Enter Commercial,4,12,Burglary
3,Break and Enter Commercial,5,13,Burglary
4,Break and Enter Commercial,3,9,Burglary


In [5]:
import datetime

# Combine the HOUR and MINUTE columns into one datetime.time value per row.
# The two columns are zipped directly instead of walking the frame with
# iterrows(): the resulting values are the same, but no per-row Series has to
# be constructed, which matters on a large dataset.
# int() is applied because datetime.time requires integer arguments; a missing
# (NaN) hour or minute therefore raises ValueError here.
new_time_values = [
    datetime.time(int(hour), int(minute))
    for hour, minute in zip(vancouver_data['HOUR'], vancouver_data['MINUTE'])
]

vancouver_data['Recorded_Time'] = new_time_values

vancouver_data.head()

Unnamed: 0,TYPE,HOUR,MINUTE,NEW_TYPE,Recorded_Time
0,Break and Enter Commercial,8,52,Burglary,08:52:00
1,Break and Enter Commercial,2,6,Burglary,02:06:00
2,Break and Enter Commercial,4,12,Burglary,04:12:00
3,Break and Enter Commercial,5,13,Burglary,05:13:00
4,Break and Enter Commercial,3,9,Burglary,03:09:00


In [6]:
# HOUR and MINUTE have been combined into Recorded_Time, so remove them.
vancouver_data = vancouver_data.drop(['HOUR', 'MINUTE'], axis=1)

In [7]:
# Add a placeholder column for the crime start time; every row is set to None.
vancouver_data = vancouver_data.assign(**{"Crime-start-time": None})


In [8]:
# Add a placeholder column for the crime end time; every row is set to None.
vancouver_data = vancouver_data.assign(**{"Crime-end-time": None})

# Show the first ten rows with the two placeholder columns in place.
vancouver_data.head(n=10)

Unnamed: 0,TYPE,NEW_TYPE,Recorded_Time,Crime-start-time,Crime-end-time
0,Break and Enter Commercial,Burglary,08:52:00,,
1,Break and Enter Commercial,Burglary,02:06:00,,
2,Break and Enter Commercial,Burglary,04:12:00,,
3,Break and Enter Commercial,Burglary,05:13:00,,
4,Break and Enter Commercial,Burglary,03:09:00,,
5,Break and Enter Commercial,Burglary,04:50:00,,
6,Break and Enter Commercial,Burglary,00:35:00,,
7,Break and Enter Commercial,Burglary,20:00:00,,
8,Break and Enter Commercial,Burglary,05:50:00,,
9,Break and Enter Commercial,Burglary,14:20:00,,


In [9]:
# Print the distinct raw crime labels found in the TYPE column.
print(vancouver_data.loc[:, 'TYPE'].unique())

['Break and Enter Commercial' 'Break and Enter Residential/Other'
 'Homicide' 'Mischief' 'Offence Against a Person' 'Other Theft'
 'Theft from Vehicle' 'Theft of Bicycle' 'Theft of Vehicle'
 'Vehicle Collision or Pedestrian Struck (with Fatality)'
 'Vehicle Collision or Pedestrian Struck (with Injury)']


In [10]:
# Rename Recorded_Time to Crime_report_time.
vancouver_data = vancouver_data.rename(
    columns={'Recorded_Time': 'Crime_report_time'}
)

In [11]:
# Remove the raw TYPE column (NEW_TYPE replaces it) and switch every remaining
# column to its final snake_case name.
vancouver_data = (
    vancouver_data
    .drop(['TYPE'], axis=1)
    .rename(columns={
        'NEW_TYPE': 'crime_type',
        'Crime_report_time': 'crime_report_time',
        'Crime-start-time': 'crime_start_time',
        'Crime-end-time': 'crime_end_time',
    })
)

In [12]:
# Preview the first five rows after the renaming.
vancouver_data.head(n=5)

Unnamed: 0,crime_type,crime_report_time,crime_start_time,crime_end_time
0,Burglary,08:52:00,,
1,Burglary,02:06:00,,
2,Burglary,04:12:00,,
3,Burglary,05:13:00,,
4,Burglary,03:09:00,,


In [13]:
# Create a unique id per row of the dataset, stored in the crime_key column.
import uuid

# One random (version 4) UUID object per row. The list is built with a
# comprehension so that no notebook-level name shadows the builtin `id`.
# The values are generated fresh on every run, so they are not reproducible.
crime_key = [uuid.uuid4() for _ in range(vancouver_data.shape[0])]

vancouver_data['crime_key'] = crime_key
vancouver_data.head()

Unnamed: 0,crime_type,crime_report_time,crime_start_time,crime_end_time,crime_key
0,Burglary,08:52:00,,,3d2ba038-e1a2-4975-bd15-82b3b93c0895
1,Burglary,02:06:00,,,c92f172b-145c-4df3-a651-709991a6713d
2,Burglary,04:12:00,,,2c564e35-85c4-456f-b978-ba3d519eb9b1
3,Burglary,05:13:00,,,d806442c-8dfb-496a-ab2d-30f7867152b4
4,Burglary,03:09:00,,,fb92f9b6-7e24-419f-8946-d193914b27ad


In [14]:
# Build the final column order: crime_key first, then every other column in
# its current order. The columns are selected by name rather than by position,
# so the result stays correct (and no column is silently left out) if the
# number or order of columns changes.
cols = vancouver_data.columns.tolist()
new_cols = ['crime_key'] + [col for col in cols if col != 'crime_key']
print(new_cols)

['crime_key', 'crime_type', 'crime_report_time', 'crime_start_time', 'crime_end_time']


In [15]:
# Apply the column order held in new_cols to the frame.
vancouver_data = vancouver_data.loc[:, new_cols]

# Preview the first five rows in the final layout.
vancouver_data.head(n=5)

Unnamed: 0,crime_key,crime_type,crime_report_time,crime_start_time,crime_end_time
0,3d2ba038-e1a2-4975-bd15-82b3b93c0895,Burglary,08:52:00,,
1,c92f172b-145c-4df3-a651-709991a6713d,Burglary,02:06:00,,
2,2c564e35-85c4-456f-b978-ba3d519eb9b1,Burglary,04:12:00,,
3,d806442c-8dfb-496a-ab2d-30f7867152b4,Burglary,05:13:00,,
4,fb92f9b6-7e24-419f-8946-d193914b27ad,Burglary,03:09:00,,


In [17]:
# Write the cleaned dataset to CSV with a header row and without the row index.
# `index` is documented as a bool, so False is passed explicitly instead of
# relying on None being falsy.
vancouver_data.to_csv(r'./vancouver-crime.csv', index=False, header=True)