# Import Modules

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import dgl
import torch
import scipy.sparse as sp
import torch.nn as nn
from node2vec import Node2Vec

### We will try to extract the following features
- avg distance travelled per day, or avg distance between hops
- average frequence travelled per day (num locations)

In [9]:
# Import the data
df = pd.read_csv('../../RVF_ATX_PID_HZ-2020-07.tsv', sep='\t')
# filter out certain columns
df = df.drop(columns=['local_date', 'local_hour', 'gender', 'age', 'full_panel_reweighted_sag_score', 'home_zip'])

places_df = pd.read_csv('../../RVF_ATX_PID_HZ_Places_Lookup.tsv', sep='\t')
places_df = places_df[["venueid", "geolat", "geolong"]]

dwell_time_threshold = 60 # in minutes
start_date = pd.to_datetime('2020-07-01')
end_date = pd.to_datetime('2020-07-06')

filtered_df = df[(pd.to_datetime(df['utc_date']) >= start_date) & (pd.to_datetime(df['utc_date']) <= end_date) & (df['dwell'] >= dwell_time_threshold)]
df.head()

Unnamed: 0,persistentid,venueid,utc_date,utc_hour,dwell
0,5903d26cdcecbd13590c8fe594de785f19b16004e19156...,40b52f80f964a52051001fe3,2020-07-18,22,60
1,cc371dcb888f9ec0ab9a4ecabc4d49e51288313fb17a47...,40b52f80f964a52051001fe3,2020-07-18,22,1011
2,21e09ed692d56697e3c26b777a53a411cd21bed5527c5b...,40b52f80f964a52051001fe3,2020-07-25,0,1058
3,e170f93db0ea4930ea2c0d2167feddb1b4fe2b5412d826...,40b52f80f964a52051001fe3,2020-07-25,18,30
4,63164c43b459b4260a338979948ead113855ddaafb9d52...,40b52f80f964a52051001fe3,2020-07-29,18,763


In [10]:
places_data_map = dict()
for row in places_df.itertuples():
    venueId = 1
    lat = 2
    long = 3
    if row[venueId] not in places_data_map:
        places_data_map[row[venueId]] = (row[lat], row[long])

In [11]:
data_map = dict()
for row in filtered_df.itertuples():
    persistentid = 1
    venueId = 2
    utc_date = 3
    utc_hour = 4
    data = (row[utc_date], row[utc_hour], row[venueId])
    if row[persistentid] not in data_map:
        data_map[row[persistentid]] = []
    data_map[row[persistentid]].append(data)

In [12]:
for device in data_map:
    data_map[device].sort(key=lambda x: x[1])
    data_map[device].sort(key=lambda x: x[0])

In [13]:
import math
def distance_between_coordinates(lat1, long1, lat2, long2):
    earth_radius = 6371
    lat1 = math.radians(lat1)
    long1 = math.radians(long1)
    lat2 = math.radians(lat2)
    long2 = math.radians(long2)

    # dist = math.acos(math.sin(lat1)*math.sin(lat2)+math.cos(lat1)*math.cos(lat2)*math.cos(long2-long1)) * earth_radius
    # print(dist)

    # Differences in coordinates
    dlat = lat2 - lat1
    dlon = long2 - long1

    # Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = earth_radius * c

    return distance

In [14]:
feature_map = dict()
for device in data_map:
    feature_map[device] = dict()
    avg_locations_per_day = len(data_map[device]) / ((end_date - start_date).days + 1)
    feature_map[device]["avg_locations_per_day"] = avg_locations_per_day

    avg_distance_per_day = 0
    distance = 0
    places = data_map[device]
    if len(places) > 1:
        for i in range(1, len(places)):
            p1 = places[i - 1][2]
            p2 = places[i][2]
            lat1 = places_data_map[p1][0]
            long1 = places_data_map[p1][1]
            lat2 = places_data_map[p2][0]
            long2 = places_data_map[p2][1]
            distance += distance_between_coordinates(lat1, long1, lat2, long2)
    avg_distance_per_day = distance / ((end_date - start_date).days + 1)
    feature_map[device]["avg_distance_per_day"] = avg_distance_per_day

print(feature_map)

{'ad9adea8e7d63428e9372e0d670244e5033d4d2988e5546f8801692bf9d40646': {'avg_locations_per_day': 3.8333333333333335, 'avg_distance_per_day': 50.64015685771528}, 'b8ea7697c88dc1f4a1f9b9bbe60d07c304ffc1539ee80632b12d07b1e181815d': {'avg_locations_per_day': 3.0, 'avg_distance_per_day': 9.743457756452807}, 'd7891bc5540d1352f5149f902922ffa923fe1d27e06be8353fd14da80c807612': {'avg_locations_per_day': 6.833333333333333, 'avg_distance_per_day': 27.37898288577948}, 'fccee91660f332ee4b571403661ecc3aed3d29fbc8979275dde2a96c259ef2f9': {'avg_locations_per_day': 2.3333333333333335, 'avg_distance_per_day': 13.528569995725745}, 'ff8ff5da9fbb2a65d408a333fa1359616c899634f246d463b2b04ebf6dfd7b34': {'avg_locations_per_day': 2.6666666666666665, 'avg_distance_per_day': 36.05160041792916}, '4e96c76fdedb1bf0768252f1a5dec70dd503ef781e4794d0b198cbc179cf1db1': {'avg_locations_per_day': 2.0, 'avg_distance_per_day': 5.311668474504386}, '86be5ba82f0648a3a930d8370ea61c36c43a0382aabf44624c3fd63d9563b0e3': {'avg_locatio

In [15]:
import pickle
with open('node_travel_data.pkl', 'wb') as fp:
    pickle.dump(feature_map, fp)
    print('dictionary saved successfully to file')


dictionary saved successfully to file
