### Sense360 Coding Challenge
Submitted by: Aditya Shirode (avshirod@ncsu.edu)  
Created on: 03/11/2018

In [1]:
import time
import random
import logging
import datetime

import pandas as pd

In [18]:
# strptime/strftime pattern used to parse timestamp strings such as "5/30/2015 10:12:35"
DATEFORMAT = "%m/%d/%Y %H:%M:%S"

In [2]:
def generate_visit_id():
    """ Endless generator of visit ids: yields 1, 2, 3, ... """
    next_id = 1
    while True:
        yield next_id
        next_id += 1

In [3]:
# Single shared id generator; Visit.__init__ calls next() on it to number each visit
new_visit_id = generate_visit_id()

In [22]:
class Visit:
    """
    A "visit" is any place a user travels to.

    Constructor arguments:
    lat: float latitude (e.g. 45.12345)
    lon: float longitude (e.g. -118.12345)
    arrival_time: datetime, or a string in DATEFORMAT (e.g. 5/30/2015 10:12:35)
    departure_time: datetime, or a string in DATEFORMAT (e.g. 5/30/2015 18:12:35)

    Instance attributes: id, lat, lon, arrival_time, departure_time and
    time_spent (a timedelta: departure_time - arrival_time).
    """
    def __init__(self, lat, lon, arrival_time, departure_time):
        # Ids come from the module-level `new_visit_id` generator
        self.id = next(new_visit_id)
        self.lat = lat
        self.lon = lon
        self.arrival_time = self.parse_time(arrival_time)
        self.departure_time = self.parse_time(departure_time)
        # parse_time() returns None for a string that does not match DATEFORMAT;
        # in that case this subtraction raises TypeError.
        self.time_spent = self.departure_time - self.arrival_time

    def parse_time(self, t):
        """ Return `t` as a datetime.

        datetime instances (including subclasses) are returned unchanged;
        anything else is parsed with DATEFORMAT. Returns None when the
        string does not match DATEFORMAT.
        """
        if isinstance(t, datetime.datetime):
            return t
        try:
            return datetime.datetime.strptime(t, DATEFORMAT)
        except ValueError:
            # Try other possible formats, generic parsing, etc.
            return None

    def get_time_spent(self):
        """ Return the full length of the visit in hours (float) """
        # total_seconds() includes whole days; `.seconds` alone would drop them
        # and under-report any visit longer than 24 hours.
        return self.time_spent.total_seconds() / 3600

    def is_visit_within_valid_period(self):
        """ Check if given visit overlaps with a time period between 8pm and 8am

        Returns False only when the visit lies entirely inside the 8am -- 8pm
        window of its arrival date; every other visit returns True. Because the
        comparison requires arrival < departure, a zero-length visit also
        returns True.
        """
        date_visit_start = self.arrival_time.date()
        # Daytime window [8am, 8pm] on the day of arrival
        daytime_start = datetime.datetime.combine(date_visit_start, datetime.time(8))
        daytime_end = daytime_start + datetime.timedelta(hours=12)
        return not (daytime_start <= self.arrival_time < self.departure_time <= daytime_end)

    def __repr__(self):
        return "Was at ({lat}, {lon}) between [{start} -- {end}]".format(
            lat=self.lat, lon=self.lon, start=self.arrival_time, end=self.departure_time
        )

In [5]:
# Fixed pool of (latitude, longitude) pairs; the random visit generators
# pick one of these with random.choice for every generated visit.
possible_coordinates = (
    (45.12345, -118.12345),
    (40.12345, -110.12345),
    (40.12345, 110.12345),
    (45.12345, 118.12345),
)

In [6]:
def random_date(start, end):
    """
    This function will return a random datetime between two dates

    :param start: lower bound (inclusive); a datetime or a string in DATEFORMAT
    :param end: upper bound (exclusive); a datetime or a string in DATEFORMAT
    :return: `start` plus a random whole number of seconds, strictly before `end`
    :raises ValueError: if a string bound does not match DATEFORMAT, or if
        `end` is not at least one whole second after `start`
    """
    if not isinstance(start, datetime.datetime):
        start = datetime.datetime.strptime(start, DATEFORMAT)
    if not isinstance(end, datetime.datetime):
        end = datetime.datetime.strptime(end, DATEFORMAT)

    # Whole seconds between start and end (any microseconds are dropped)
    span_seconds = (end - start) // datetime.timedelta(seconds=1)
    if span_seconds <= 0:
        # Fail loudly instead of printing and then tripping over an unbound local
        raise ValueError(
            "end ({}) must be at least one second after start ({})".format(end, start)
        )

    # Pick a random second offset in [0, span_seconds)
    return start + datetime.timedelta(seconds=random.randrange(span_seconds))

In [7]:
def generate_random_date():
    """ Endless generator of random datetimes drawn from a fixed sample window """
    window = ("1/1/2017 00:00:00", "1/8/2017 11:59:59")
    while True:
        # Each draw is independent; random_date parses the string bounds itself
        yield random_date(*window)

# Shared generator instance; generate_random_visit() draws its arrival/departure datetimes from it
get_random_date = generate_random_date()

In [8]:
def generate_random_visit():
    """ Build a Visit at a random known coordinate with random arrival/departure times.

    Date pairs are drawn from `get_random_date` until the arrival is not
    later than the departure.
    """
    lat, lon = random.choice(possible_coordinates)
    while True:
        arrival = next(get_random_date)
        departure = next(get_random_date)
        if arrival <= departure:
            break
    return Visit(lat, lon, arrival, departure)

def generate_random_visit_between(start_period, end_period):
    """ Build a Visit at a random known coordinate, inside the given period.

    Arrival and departure are both drawn with random_date(start_period, end_period);
    pairs are re-drawn until the arrival is strictly earlier than the departure.
    """
    lat, lon = random.choice(possible_coordinates)
    while True:
        arrival = random_date(start_period, end_period)
        departure = random_date(start_period, end_period)
        if arrival < departure:
            return Visit(lat, lon, arrival, departure)

In [9]:
def generate_consecutive_random_visits(start, end, n_visits=10):
    """ Generate `n_visits` random visits that follow one another in time.

    The [start, end] range is cut into `n_visits` equal slots. Visit k is
    drawn between the departure of visit k-1 (or `start` for the first one)
    and the end of slot k, so consecutive visits never overlap.

    :param start: beginning of the range; a datetime or a string in DATEFORMAT
    :param end: end of the range; a datetime or a string in DATEFORMAT
    :param n_visits: number of visits to generate; zero or less yields []
    :return: list of Visit objects in chronological order
    """
    if not isinstance(start, datetime.datetime):
        start = datetime.datetime.strptime(start, DATEFORMAT)
    if not isinstance(end, datetime.datetime):
        end = datetime.datetime.strptime(end, DATEFORMAT)

    # Nothing to generate; also avoids dividing by zero below
    if n_visits <= 0:
        return []

    slot = datetime.timedelta(seconds=(end - start).total_seconds() / n_visits)
    visits = []
    window_start = start
    window_end = start + slot
    for _ in range(n_visits):
        visits.append(generate_random_visit_between(window_start, window_end))
        # The next visit may begin as soon as this one ends ...
        window_start = visits[-1].departure_time
        # ... and must finish before the end of the next slot
        window_end = window_end + slot
    return visits

In [10]:
# Small sample data set: consecutive visits (default n_visits=10) between 1 Jan and 15 Jan 2017
start = "1/1/2017 00:00:00"
end = "1/15/2017 00:00:00"
# random_visits = [generate_random_visit() for _ in range(10)]
random_visits = generate_consecutive_random_visits(start, end)

Assumptions for identifying home:  
- all visit windows are non-overlapping
- all values in the fields are valid and well-formed

Based on these, to identify possible home location, we will -  
1. Sort values based on arrival time at a location
2. Identify locations visited between 8pm and 8am and count number of minutes spent there
3. Filter this list for values greater than 30 hours, sort and return top answer if exists

In [11]:
def identify_home(visits):
    """ Takes in a list of "visits" and returns the possible home location

    Only visits for which is_visit_within_valid_period() is true are counted.
    Their full durations are summed per location (coordinates rounded to 3
    decimal places), and the location with the largest total above 30 hours wins.

    :type visits: list of Visit objects
    :rtype: (lat, lon) for predicted home location, each formatted as a string
            with 3 decimal places; (-1, -1) when there are no qualifying visits
            or no location exceeds the threshold
    """
    # The threshold is set to 30 hours; visits by location above this threshold should be considered
    time_spent_threshold = 30 * 60 * 60  # seconds

    # Get visits within our period in question (8pm -- 8am)
    valid_visits = [vars(v) for v in visits if v.is_visit_within_valid_period()]

    # No qualifying visits at all: an empty DataFrame would have no columns to work on
    if not valid_visits:
        logging.info("No possible home location found")
        return (-1, -1)

    # Convert to a dataframe for easy calculations
    df = pd.DataFrame(valid_visits)

    # Calculate the time spent for visit in seconds (vectorised)
    df['time_spent_seconds'] = pd.to_timedelta(df['time_spent']).dt.total_seconds()

    # Consider precise location co-ordinates up to 3 decimal points
    for column in ('lat', 'lon'):
        df[column] = df[column].map('{:.3f}'.format)

    # Group visits by location and calculate total time spent
    time_spent_by_location = df.groupby(['lat', 'lon'])['time_spent_seconds'].sum()

    # Get locations that cross the threshold
    possible_locations = time_spent_by_location[time_spent_by_location > time_spent_threshold]

    # If no location with more than 30 hours spent during the requisite period, return -1
    if possible_locations.empty:
        logging.info("No possible home location found")
        return (-1, -1)

    # Get the location most time was spent at; the index label is the (lat, lon) pair
    return possible_locations.idxmax()

In [12]:
# Benchmark data set: one million consecutive visits between 1 Jan 2017 and 31 Dec 2017 11:59:59
start = "1/1/2017 00:00:00"
end = "12/31/2017 11:59:59"
random_visits = generate_consecutive_random_visits(start, end, 1000000)

In [13]:
%time identify_home(random_visits)

Wall time: 10.2 s


(45.12345, -118.12345)

In [14]:
%timeit identify_home(random_visits)

8.98 s ± 127 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
# %prun identify_home(random_visits)

---------------

`identify_home()` is the main function for this assignment  

It takes input of a list of 'Visit' class objects  
e.g. A visit with following details:  
- latitude: float (e.g. 45.12345)  
- longitude: float (e.g. -118.12345)  
- arrival_time_local: datetime (e.g. 5/30/2015 10:12:35)  
- departure_time_local: datetime (e.g. 5/30/2015 18:12:35)  

would be created as `Visit(lat=45.12345, lon=-118.12345, arrival_time="5/30/2015 10:12:35", departure_time="5/30/2015 18:12:35")`

Assumptions for identifying home:  
- all visit windows are non-overlapping
- all values in the fields are valid and well-formed

Based on these, to identify possible home location, we will -  
1. Filter visits between 8pm and 8am
2. Sum the time spent (in seconds) at each location
3. Filter this list for values greater than 30 hours, sort and return top answer if exists (otherwise return `(-1, -1)`)

This function was benchmarked against random, non-overlapping, consecutive visit data generated by `generate_consecutive_random_visits()` function above in the notebook  

We generated about **1 million** records, randomly assigning 4 co-ordinates to them, and tried to identify the Home location  
It took **~9 sec** per call for the function to return the answer (mean over *7 runs*, as reported by `%timeit`)

This function can be optimized to get better results as follows:
- We can precompute additional fields when the visit objects are created (i.e. when the data is parsed and saved in real life), such as an `is_within_valid_home_period` flag, which would reduce the filtering time. The same applies to rounding the lat-lon precision.  
- Given more time, the pandas operations can be vectorized to reduce the run time.