In [1]:
import os
import pandas as pd
import numpy as np
import sklearn.neighbors as neighbors

from math import *
from collections import defaultdict

### Loading dataset

In [2]:
# Resolve the home-relative location of the raw sample file before reading it.
sample_path = os.path.expanduser('~/data/DataSample.csv')

# Raw, uncleaned request samples exactly as they appear in the CSV.
df = pd.read_csv(sample_path)
df

Unnamed: 0,_ID,TimeSt,Country,Province,City,Latitude,Longitude
0,4516516,2017-06-21 00:00:00.143,CA,ON,Waterloo,43.49347,-80.49123
1,4516547,2017-06-21 18:00:00.193,CA,ON,London,42.93990,-81.27090
2,4516550,2017-06-21 15:00:00.287,CA,ON,Guelph,43.57760,-80.22010
3,4516600,2017-06-21 15:00:00.307,CA,ON,Stratford,43.37160,-80.97730
4,4516613,2017-06-21 15:00:00.497,CA,ON,Stratford,43.37160,-80.97730
...,...,...,...,...,...,...,...
22020,5614801,2017-06-21 12:23:07.880,CA,ON,Saint Catharines,43.16440,-79.24560
22021,5614888,2017-06-21 08:23:01.793,CA,AB,Calgary,51.02093,-114.10621
22022,5614909,2017-06-21 00:23:07.903,CA,ON,Whitby,43.88730,-78.94220
22023,5614912,2017-06-21 11:23:07.953,CA,ON,Oakville,43.49340,-79.71260


#### There is an issue with the column labels

In [3]:
# Inspect the raw header labels; the output shows ' TimeSt' carrying a leading space.
df.columns

Index(['_ID', ' TimeSt', 'Country', 'Province', 'City', 'Latitude',
       'Longitude'],
      dtype='object')

**ASSUMPTION:** I cannot modify the datafiles

Assuming I cannot modify the file itself, I will clean up the column names after loading it.
All that is needed here is to strip the excess whitespace.
It would be preferable to fix this in the code that generates the file.

In [4]:
# Drop leading/trailing whitespace from every header label (e.g. ' TimeSt' -> 'TimeSt').
df.columns = df.columns.map(str.strip)

### Removal of duplicate data.

**ASSUMPTION:** One row in each set of duplicates is still correct, so I keep the first occurrence and remove only the remaining duplicates.

In [5]:
# Two rows count as duplicates when they share timestamp and coordinates.
dedup_keys = ['TimeSt', 'Latitude', 'Longitude']

# Keep the first row of every duplicate group; reset_index() (without drop)
# preserves the original row label in a new 'index' column.
df_clean = df.drop_duplicates(subset=dedup_keys, keep='first').reset_index()
df_clean

Unnamed: 0,index,_ID,TimeSt,Country,Province,City,Latitude,Longitude
0,0,4516516,2017-06-21 00:00:00.143,CA,ON,Waterloo,43.49347,-80.49123
1,1,4516547,2017-06-21 18:00:00.193,CA,ON,London,42.93990,-81.27090
2,2,4516550,2017-06-21 15:00:00.287,CA,ON,Guelph,43.57760,-80.22010
3,3,4516600,2017-06-21 15:00:00.307,CA,ON,Stratford,43.37160,-80.97730
4,4,4516613,2017-06-21 15:00:00.497,CA,ON,Stratford,43.37160,-80.97730
...,...,...,...,...,...,...,...,...
19994,22019,5614760,2017-06-21 08:23:01.793,CA,AB,Calgary,51.02093,-114.10621
19995,22020,5614801,2017-06-21 12:23:07.880,CA,ON,Saint Catharines,43.16440,-79.24560
19996,22022,5614909,2017-06-21 00:23:07.903,CA,ON,Whitby,43.88730,-78.94220
19997,22023,5614912,2017-06-21 11:23:07.953,CA,ON,Oakville,43.49340,-79.71260


### Loading POI list

Here the column headers have a similar issue so I applied the same cleaning step

In [6]:
# Load the POI list and strip stray whitespace from its header labels in one pass.
poi_path = os.path.expanduser('~/data/POIList.csv')
df_poi = pd.read_csv(poi_path).rename(columns=str.strip)
df_poi

Unnamed: 0,POIID,Latitude,Longitude
0,POI1,53.546167,-113.485734
1,POI2,53.546167,-113.485734
2,POI3,45.521629,-73.566024
3,POI4,45.22483,-63.232729


**ASSUMPTION:** I don't have to clean the POI list.

POI1 and POI2 have identical coordinates. Since the only cleaning mentioned in the instructions refers specifically to `DataSample.csv`, I will leave the POI list as is.

In [16]:
class Geoinfo:
    '''
    A geographic point (latitude/longitude) with helper functions for
    calculating great-circle distance.

    Attributes
    ----------
    lat : float
        latitude
    long : float
        longitude

    The values are stored exactly as given. ``dist`` treats them as degrees
    and converts to radians internally via ``to_rads``.
    '''
    # Default sphere radius used by ``dist``.
    # NOTE(review): this looks like Earth's radius in kilometres (the commonly
    # quoted mean is about 6371 km); kept at 6373.0 so existing results do not change.
    EARTH_RADIUS_KM = 6373.0

    def __init__(self, lat, long):
        '''
        Constructs a Geoinfo object from the latitude and longitude.

            Parameters:
                lat (float):   Latitude
                long (float):  Longitude
        '''
        self.lat = lat
        self.long = long

    def to_rads(self):
        '''
        Returns the same point but converted to radians.

            Returns:
                geoinfo_rads (Geoinfo): new object whose lat/long are
                                        radians(self.lat) / radians(self.long)
        '''
        return Geoinfo(radians(self.lat), radians(self.long))

    def dist(self, other, radius=EARTH_RADIUS_KM):
        '''
        Returns the haversine (great-circle) distance from this point to another point.

            Parameters:
                other (Geoinfo):           the second point
                radius (float, optional):  sphere radius; the result is in the
                                           same unit (defaults to EARTH_RADIUS_KM)

            Returns:
                dist (float): distance between the two points
        '''
        p1 = self.to_rads()
        p2 = other.to_rads()
        delta_lat = p1.lat - p2.lat
        delta_long = p1.long - p2.long

        a = sin(delta_lat / 2) ** 2 + cos(p1.lat) * cos(p2.lat) * sin(delta_long / 2) ** 2
        # Floating-point rounding (e.g. for near-antipodal points) can push `a`
        # marginally outside [0, 1], which would make sqrt() raise a domain error.
        a = min(1.0, max(0.0, a))
        c = 2 * atan2(sqrt(a), sqrt(1 - a))

        return radius * c

### Calculating Nearest POI

**ASSUMPTION:** I can use other libraries. sk-learn was included in the docker image.

I used the nearest-neighbor search from scikit-learn to find the closest POI. With only 4 POIs this is unnecessary, but it will scale to bigger datasets.

In [17]:
def metric(x, y):
    '''
    Distance callable handed to the nearest-neighbour search.

        Parameters:
            x (sequence): first point; x[0] is used as latitude, x[1] as longitude
            y (sequence): second point, same layout as x

        Returns:
            dist (float): Geoinfo.dist between the two points
    '''
    origin = Geoinfo(x[0], x[1])
    target = Geoinfo(y[0], y[1])
    return origin.dist(target)

# (latitude, longitude) pairs, one per POI, in df_poi row order.
pois = list(zip(df_poi['Latitude'], df_poi['Longitude']))

# Nearest-neighbour index over the POIs, using the custom distance callable.
nn = neighbors.NearestNeighbors(metric=metric)
nn.fit(pois)

In [18]:
# Coordinates of every cleaned sample, one [Latitude, Longitude] row each.
X = df_clean[['Latitude', 'Longitude']].to_numpy()

# kneighbors returns one row per sample; with n_neighbors=1 the only column
# holds the index of the closest fitted POI.
nearest_idx = nn.kneighbors(X, n_neighbors=1, return_distance=False)[:, 0]

# Look the POI ids up by index label and attach them to the samples.
df_clean['Nearest POI'] = df_poi.POIID.loc[nearest_idx].tolist()
df_clean

Unnamed: 0,index,_ID,TimeSt,Country,Province,City,Latitude,Longitude,Nearest POI
0,0,4516516,2017-06-21 00:00:00.143,CA,ON,Waterloo,43.49347,-80.49123,POI3
1,1,4516547,2017-06-21 18:00:00.193,CA,ON,London,42.93990,-81.27090,POI3
2,2,4516550,2017-06-21 15:00:00.287,CA,ON,Guelph,43.57760,-80.22010,POI3
3,3,4516600,2017-06-21 15:00:00.307,CA,ON,Stratford,43.37160,-80.97730,POI3
4,4,4516613,2017-06-21 15:00:00.497,CA,ON,Stratford,43.37160,-80.97730,POI3
...,...,...,...,...,...,...,...,...,...
19994,22019,5614760,2017-06-21 08:23:01.793,CA,AB,Calgary,51.02093,-114.10621,POI1
19995,22020,5614801,2017-06-21 12:23:07.880,CA,ON,Saint Catharines,43.16440,-79.24560,POI3
19996,22022,5614909,2017-06-21 00:23:07.903,CA,ON,Whitby,43.88730,-78.94220,POI3
19997,22023,5614912,2017-06-21 11:23:07.953,CA,ON,Oakville,43.49340,-79.71260,POI3
