In [1]:
import pandas as pd 
import math
# Read data from file 'filename.csv' 
# (in the same directory that your python process is based)
# Control delimiters, rows, column names with read_csv (see later) 
vancover_data = pd.read_csv("../datasets/vancouver-data.csv") 
# Preview the first 5 lines of the loaded data 
vancover_data.head(5)

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y
0,Break and Enter Commercial,2012,12,14,8,52,,Oakridge,491285.0,5453433.0
1,Break and Enter Commercial,2019,3,7,2,6,10XX SITKA SQ,Fairview,490612.964805,5457110.0
2,Break and Enter Commercial,2019,8,27,4,12,10XX ALBERNI ST,West End,491007.779775,5459174.0
3,Break and Enter Commercial,2014,8,8,5,13,10XX ALBERNI ST,West End,491015.943352,5459166.0
4,Break and Enter Commercial,2005,11,14,3,9,10XX ALBERNI ST,West End,491021.385727,5459161.0


In [18]:
# Drop uneeded columns from dataset
new_van_data = vancover_data.drop(['TYPE', 'YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE'], axis=1)
new_van_data.head(5)

Unnamed: 0,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y
0,,Oakridge,491285.0,5453433.0
1,10XX SITKA SQ,Fairview,490612.964805,5457110.0
2,10XX ALBERNI ST,West End,491007.779775,5459174.0
3,10XX ALBERNI ST,West End,491015.943352,5459166.0
4,10XX ALBERNI ST,West End,491021.385727,5459161.0


In [20]:
# This is a function that converts a x,y into long and lat values and then returns it

def utmToLatLng(zone, easting, northing, northernHemisphere=True):
    if not northernHemisphere:
        northing = 10000000 - northing

    a = 6378137
    e = 0.081819191
    e1sq = 0.006739497
    k0 = 0.9996

    arc = northing / k0
    mu = arc / (a * (1 - math.pow(e, 2) / 4.0 - 3 * math.pow(e, 4) / 64.0 - 5 * math.pow(e, 6) / 256.0))

    ei = (1 - math.pow((1 - e * e), (1 / 2.0))) / (1 + math.pow((1 - e * e), (1 / 2.0)))

    ca = 3 * ei / 2 - 27 * math.pow(ei, 3) / 32.0

    cb = 21 * math.pow(ei, 2) / 16 - 55 * math.pow(ei, 4) / 32
    cc = 151 * math.pow(ei, 3) / 96
    cd = 1097 * math.pow(ei, 4) / 512
    phi1 = mu + ca * math.sin(2 * mu) + cb * math.sin(4 * mu) + cc * math.sin(6 * mu) + cd * math.sin(8 * mu)

    n0 = a / math.pow((1 - math.pow((e * math.sin(phi1)), 2)), (1 / 2.0))

    r0 = a * (1 - e * e) / math.pow((1 - math.pow((e * math.sin(phi1)), 2)), (3 / 2.0))
    fact1 = n0 * math.tan(phi1) / r0

    _a1 = 500000 - easting
    dd0 = _a1 / (n0 * k0)
    fact2 = dd0 * dd0 / 2

    t0 = math.pow(math.tan(phi1), 2)
    Q0 = e1sq * math.pow(math.cos(phi1), 2)
    fact3 = (5 + 3 * t0 + 10 * Q0 - 4 * Q0 * Q0 - 9 * e1sq) * math.pow(dd0, 4) / 24

    fact4 = (61 + 90 * t0 + 298 * Q0 + 45 * t0 * t0 - 252 * e1sq - 3 * Q0 * Q0) * math.pow(dd0, 6) / 720

    lof1 = _a1 / (n0 * k0)
    lof2 = (1 + 2 * t0 + Q0) * math.pow(dd0, 3) / 6.0
    lof3 = (5 - 2 * Q0 + 28 * t0 - 3 * math.pow(Q0, 2) + 8 * e1sq + 24 * math.pow(t0, 2)) * math.pow(dd0, 5) / 120
    _a2 = (lof1 - lof2 + lof3) / math.cos(phi1)
    _a3 = _a2 * 180 / math.pi

    latitude = 180 * (phi1 - fact1 * (fact2 + fact3 + fact4)) / math.pi

    if not northernHemisphere:
        latitude = -latitude

    longitude = ((zone > 0) and (6 * zone - 183.0) or 3.0) - _a3

    return (latitude, longitude)

In [4]:
# Here we are sending all our x,y to get a list of corresponding lat and long
latitudes = []
longitudes = []
#longitude is the first result and latitude is the second
for i in range(0, len(new_van_data['Y'].values), 1000):
    latits = new_van_data['X'].values[i:i+1000]
    longits = new_van_data['Y'].values[i:i+1000]
    for x, y in zip(latits, longits):
        result = utmToLatLng(10, x, y)
        latitudes.append(result[0])
        longitudes.append(result[1])    
    

In [7]:
# We set our long and lat values in our dataset
new_van_data['longitude'] = longitudes
new_van_data['latitude'] = latitudes
new_van_data.head(5)

Unnamed: 0,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,longitude,latitude
0,,Oakridge,491285.0,5453433.0,-123.119712,49.233614
1,10XX SITKA SQ,Fairview,490612.964805,5457110.0,-123.129029,49.266678
2,10XX ALBERNI ST,West End,491007.779775,5459174.0,-123.123649,49.285255
3,10XX ALBERNI ST,West End,491015.943352,5459166.0,-123.123536,49.285181
4,10XX ALBERNI ST,West End,491021.385727,5459161.0,-123.123461,49.285132


In [6]:
cols = new_van_data.columns.tolist()
print(cols)

['HUNDRED_BLOCK', 'NEIGHBOURHOOD', 'X', 'Y', 'longitude', 'latitude']


In [8]:
# We get rid of X,Y columns
new_cols = [cols[0], cols[1], cols[4], cols[5]]
new_data = new_van_data[new_cols]
new_data.head(5)

Unnamed: 0,HUNDRED_BLOCK,NEIGHBOURHOOD,longitude,latitude
0,,Oakridge,-123.119712,49.233614
1,10XX SITKA SQ,Fairview,-123.129029,49.266678
2,10XX ALBERNI ST,West End,-123.123649,49.285255
3,10XX ALBERNI ST,West End,-123.123536,49.285181
4,10XX ALBERNI ST,West End,-123.123461,49.285132


In [9]:
# Renaming the columns here
data = new_data.rename(columns={"HUNDRED_BLOCK": "location_name", "NEIGHBOURHOOD": "neighborhood"}, errors="raise")
data.head(5)

Unnamed: 0,location_name,neighborhood,longitude,latitude
0,,Oakridge,-123.119712,49.233614
1,10XX SITKA SQ,Fairview,-123.129029,49.266678
2,10XX ALBERNI ST,West End,-123.123649,49.285255
3,10XX ALBERNI ST,West End,-123.123536,49.285181
4,10XX ALBERNI ST,West End,-123.123461,49.285132


In [10]:
print('Total number of rows, including duplicates:', data.size)

Total number of rows, including duplicates: 2537112


In [21]:
# Dropping our duplicate location_name rows here
data.drop_duplicates(subset ="location_name", inplace = True) 
print('Total number of rows, without duplicates:', data.size)

Total number of rows, without duplicates: 133530


In [12]:
data.head(5)

Unnamed: 0,location_name,neighborhood,longitude,latitude
0,,Oakridge,-123.119712,49.233614
1,10XX SITKA SQ,Fairview,-123.129029,49.266678
2,10XX ALBERNI ST,West End,-123.123649,49.285255
61,10XX BARCLAY ST,West End,-123.126558,49.282901
79,10XX BEACH AVE,West End,-123.134768,49.276833


In [13]:
# Here we are adding vancouver to city for the entire dataset as well as a unique id which is
data['city'] = ['Vancouver' for i in range(len(data.values))]
data['location_key'] = [i for i in range(len(data.values))]
data.head(5)

Unnamed: 0,location_name,neighborhood,longitude,latitude,city,location_key
0,,Oakridge,-123.119712,49.233614,Vancouver,0
1,10XX SITKA SQ,Fairview,-123.129029,49.266678,Vancouver,1
2,10XX ALBERNI ST,West End,-123.123649,49.285255,Vancouver,2
61,10XX BARCLAY ST,West End,-123.126558,49.282901,Vancouver,3
79,10XX BEACH AVE,West End,-123.134768,49.276833,Vancouver,4


In [14]:
print('Total number of rows:', data.size)

Total number of rows: 133530


In [15]:
cols = data.columns.tolist()
print(cols)

['location_name', 'neighborhood', 'longitude', 'latitude', 'city', 'location_key']


In [22]:
# Here we shift the order of the columns so it goes id, name, neighborhood, city, long, lat - to match sql table.
new_cols = [cols[5], cols[0], cols[1], cols[4], cols[2], cols[3]]
data = data[new_cols]
data.head(5)

Unnamed: 0,location_key,location_name,neighborhood,city,longitude,latitude
0,0,,Oakridge,Vancouver,-123.119712,49.233614
1,1,10XX SITKA SQ,Fairview,Vancouver,-123.129029,49.266678
2,2,10XX ALBERNI ST,West End,Vancouver,-123.123649,49.285255
61,3,10XX BARCLAY ST,West End,Vancouver,-123.126558,49.282901
79,4,10XX BEACH AVE,West End,Vancouver,-123.134768,49.276833


In [17]:
# Finally here we convert the dataframe to a csv file to store in our repo
vancouver_address_csv = data.to_csv(r'./vancouver-address.csv', index = None, header=True)