In [2]:
import pandas as pd 
# Load the Denver incident dataset. The path is relative, so it is resolved
# against the working directory of the notebook's kernel.
source_csv = "../datasets/denver-data.csv"
denver_data = pd.read_csv(source_csv)
# Sanity-check the load by displaying the first five rows.
denver_data.head()

Unnamed: 0,INCIDENT_ID,OFFENSE_ID,OFFENSE_CODE,OFFENSE_CODE_EXTENSION,OFFENSE_TYPE_ID,OFFENSE_CATEGORY_ID,FIRST_OCCURRENCE_DATE,LAST_OCCURRENCE_DATE,REPORTED_DATE,INCIDENT_ADDRESS,GEO_X,GEO_Y,GEO_LON,GEO_LAT,DISTRICT_ID,PRECINCT_ID,NEIGHBORHOOD_ID,IS_CRIME,IS_TRAFFIC
0,2016377000.0,2016380000000000.0,5213,0,weapon-unlawful-discharge-of,all-other-crimes,6/15/2016 11:31:00 PM,,6/15/2016 11:31:00 PM,,3193983.0,1707251.0,-104.809881,39.773188,5,521,montbello,1,0
1,20186000000.0,2.0186e+16,2399,0,theft-other,larceny,10/11/2017 12:30:00 PM,10/11/2017 4:55:00 PM,1/29/2018 5:53:00 PM,,3201943.0,1711852.0,-104.781434,39.785649,5,522,gateway-green-valley-ranch,1,0
2,20166000000.0,2.0166e+16,2305,0,theft-items-from-vehicle,theft-from-motor-vehicle,3/4/2016 8:00:00 PM,4/25/2016 8:00:00 AM,4/26/2016 9:02:00 PM,2932 S JOSEPHINE ST,3152762.0,1667011.0,-104.957381,39.66349,3,314,wellshire,1,0
3,201872300.0,201872000000000.0,2399,0,theft-other,larceny,1/30/2018 7:20:00 PM,,1/30/2018 10:29:00 PM,705 S COLORADO BLVD,3157162.0,1681320.0,-104.94144,39.702698,3,312,belcaro,1,0
4,2017411000.0,2017410000000000.0,2303,0,theft-shoplift,larceny,6/22/2017 8:53:00 PM,,6/23/2017 4:09:00 PM,2810 E 1ST AVE,3153211.0,1686545.0,-104.95537,39.717107,3,311,cherry-creek,1,0


In [3]:
# Drop the columns that are not needed for the address table; after this only
# the address, longitude/latitude and neighborhood columns are left.
unused_columns = [
    'INCIDENT_ID',
    'OFFENSE_ID',
    'OFFENSE_CODE',
    'OFFENSE_CODE_EXTENSION',
    'OFFENSE_TYPE_ID',
    'OFFENSE_CATEGORY_ID',
    'FIRST_OCCURRENCE_DATE',
    'LAST_OCCURRENCE_DATE',
    'REPORTED_DATE',
    'GEO_X',
    'GEO_Y',
    'DISTRICT_ID',
    'PRECINCT_ID',
    'IS_CRIME',
    'IS_TRAFFIC',
]
new_denver_data = denver_data.drop(columns=unused_columns)
new_denver_data.head()

Unnamed: 0,INCIDENT_ADDRESS,GEO_LON,GEO_LAT,NEIGHBORHOOD_ID
0,,-104.809881,39.773188,montbello
1,,-104.781434,39.785649,gateway-green-valley-ranch
2,2932 S JOSEPHINE ST,-104.957381,39.66349,wellshire
3,705 S COLORADO BLVD,-104.94144,39.702698,belcaro
4,2810 E 1ST AVE,-104.95537,39.717107,cherry-creek


In [18]:
# Give the remaining columns the short lowercase names used from here on.
# errors="raise" makes the rename fail if any of the source columns is absent.
column_renames = {
    "INCIDENT_ADDRESS": "location_name",
    "GEO_LON": "longitude",
    "GEO_LAT": "latitude",
    "NEIGHBORHOOD_ID": "neighborhood",
}
data = new_denver_data.rename(columns=column_renames, errors="raise")
data.head(20)

Unnamed: 0,location_name,longitude,latitude,neighborhood
0,,-104.809881,39.773188,montbello
1,,-104.781434,39.785649,gateway-green-valley-ranch
2,2932 S JOSEPHINE ST,-104.957381,39.66349,wellshire
3,705 S COLORADO BLVD,-104.94144,39.702698,belcaro
4,2810 E 1ST AVE,-104.95537,39.717107,cherry-creek
5,2100 BLOCK E 17TH AVE,-104.961928,39.743149,city-park-west
6,995 N FEDERAL BLVD,-105.025543,39.73279,villa-park
7,E SPEER BLVD / N GRANT ST,-104.983794,39.723423,speer
8,W 13TH AVE / N CHEROKEE ST,-104.99165,39.736863,civic-center
9,2828 N ZUNI ST,-105.015451,39.757627,highland


In [19]:
# Row count before any de-duplication, for comparison with the count afterwards.
row_total = len(data)
print('Total number of rows, including duplicates:', row_total)

Total number of rows, including duplicates: 466840


In [20]:
# Keep only the first row for each distinct location_name, so every address
# appears once. The original index labels are kept (no reset_index).
# NOTE(review): rows with a missing location_name are collapsed into a single
# surviving row as well (index 0 in the preview that follows) -- confirm whether
# an address-less row belongs in the final table.
data = data.drop_duplicates(subset="location_name")
print('Total number of rows, without duplicates:', data.shape[0])

Total number of rows, without duplicates: 92432


In [21]:
# Preview the de-duplicated frame; gaps in the index mark the removed rows.
data.head(n=5)

Unnamed: 0,location_name,longitude,latitude,neighborhood
0,,-104.809881,39.773188,montbello
2,2932 S JOSEPHINE ST,-104.957381,39.66349,wellshire
3,705 S COLORADO BLVD,-104.94144,39.702698,belcaro
4,2810 E 1ST AVE,-104.95537,39.717107,cherry-creek
5,2100 BLOCK E 17TH AVE,-104.961928,39.743149,city-park-west


In [23]:
# Every row in this dataset is in Denver, so add a constant 'city' column.
# Assigning a scalar broadcasts it to all rows; there is no need to build a
# Python list with one entry per row.
data['city'] = 'Denver'
data.head(5)

Unnamed: 0,location_name,longitude,latitude,neighborhood,city
0,,-104.809881,39.773188,montbello,Denver
2,2932 S JOSEPHINE ST,-104.957381,39.66349,wellshire,Denver
3,705 S COLORADO BLVD,-104.94144,39.702698,belcaro,Denver
4,2810 E 1ST AVE,-104.95537,39.717107,cherry-creek,Denver
5,2100 BLOCK E 17TH AVE,-104.961928,39.743149,city-park-west,Denver


In [25]:
# Create a unique id per row in the database for location_key.
# uuid4() is random, so the keys differ every time this cell is run.
import uuid

# One UUID per remaining row. A comprehension with a throwaway loop variable
# avoids rebinding the builtin `id` for the rest of the kernel session.
location_key = [uuid.uuid4() for _ in range(data.shape[0])]
print(len(location_key))

92432


In [27]:
# Attach the generated keys as a new 'location_key' column. The list is matched
# to the rows by position, so it must contain exactly one key per row.
data = data.assign(location_key=location_key)
data.head()

Unnamed: 0,location_name,longitude,latitude,neighborhood,city,location_key
0,,-104.809881,39.773188,montbello,Denver,dcd36c09-2095-4935-bac2-62bb9ba8738e
2,2932 S JOSEPHINE ST,-104.957381,39.66349,wellshire,Denver,beb00ff6-50f9-47d4-b6ce-fccced9416df
3,705 S COLORADO BLVD,-104.94144,39.702698,belcaro,Denver,94bb4127-7768-4ac9-bac2-9bf22d883ea4
4,2810 E 1ST AVE,-104.95537,39.717107,cherry-creek,Denver,63c820a1-a440-47ba-a3b9-c6005182d47a
5,2100 BLOCK E 17TH AVE,-104.961928,39.743149,city-park-west,Denver,22a9e68a-1afd-4f22-ad22-9a499647ce48


In [28]:
# Capture the current column order as a plain Python list and show it.
cols = list(data.columns)
print(cols)

['location_name', 'longitude', 'latitude', 'neighborhood', 'city', 'location_key']


In [29]:
# Reorder the columns to id, name, neighborhood, city, long, lat - to match sql table.
# The columns are listed by name rather than by position in `cols`, so this cell
# does not depend on the order earlier cells happened to produce and raises a
# KeyError if an expected column is missing.
new_cols = [
    'location_key',
    'location_name',
    'neighborhood',
    'city',
    'longitude',
    'latitude',
]
data = data[new_cols]
data.head(5)

Unnamed: 0,location_key,location_name,neighborhood,city,longitude,latitude
0,dcd36c09-2095-4935-bac2-62bb9ba8738e,,montbello,Denver,-104.809881,39.773188
2,beb00ff6-50f9-47d4-b6ce-fccced9416df,2932 S JOSEPHINE ST,wellshire,Denver,-104.957381,39.66349
3,94bb4127-7768-4ac9-bac2-9bf22d883ea4,705 S COLORADO BLVD,belcaro,Denver,-104.94144,39.702698
4,63c820a1-a440-47ba-a3b9-c6005182d47a,2810 E 1ST AVE,cherry-creek,Denver,-104.95537,39.717107
5,22a9e68a-1afd-4f22-ad22-9a499647ce48,2100 BLOCK E 17TH AVE,city-park-west,Denver,-104.961928,39.743149


In [30]:
# Finally, write the dataframe to a CSV file (relative to the working directory)
# to store in our repo. The header row is written; the DataFrame index is not.
# to_csv returns None when it is given a path, so the result is not assigned.
data.to_csv('./denver-address.csv', index=False, header=True)