# Import Modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import dgl
import torch
import scipy.sparse as sp
import torch.nn as nn
from node2vec import Node2Vec

  from .autonotebook import tqdm as notebook_tqdm


### We will try to extract the following features
- avg distance travelled per day, or avg distance between hops
- average frequence travelled per day (num locations)

In [21]:
# Import the data
df1 = pd.read_csv('../../RVF_ATX_PID_HZ-2020-07.tsv', sep='\t')
df2 = pd.read_csv('../../RVF_ATX_PID_HZ-2020-08.tsv', sep='\t')
merged_df = pd.concat([df1, df2])
# filter out certain columns
df = merged_df.drop(columns=['local_date', 'local_hour', 'gender', 'age', 'full_panel_reweighted_sag_score', 'home_zip'])

places_df = pd.read_csv('../../RVF_ATX_PID_HZ_Places_Lookup.tsv', sep='\t')
places_df = places_df[["venueid", "geolat", "geolong"]]

dwell_time_threshold = 60 # in minutes

filtered_df = df[(df['dwell'] >= dwell_time_threshold)]
print(filtered_df.head())
print(filtered_df.shape)

                                        persistentid  \
0  5903d26cdcecbd13590c8fe594de785f19b16004e19156...   
1  cc371dcb888f9ec0ab9a4ecabc4d49e51288313fb17a47...   
2  21e09ed692d56697e3c26b777a53a411cd21bed5527c5b...   
4  63164c43b459b4260a338979948ead113855ddaafb9d52...   
5  ad9adea8e7d63428e9372e0d670244e5033d4d2988e554...   

                    venueid    utc_date  utc_hour  dwell  
0  40b52f80f964a52051001fe3  2020-07-18        22     60  
1  40b52f80f964a52051001fe3  2020-07-18        22   1011  
2  40b52f80f964a52051001fe3  2020-07-25         0   1058  
4  40b52f80f964a52051001fe3  2020-07-29        18    763  
5  40b52f80f964a52051001fe3  2020-07-03        20   9208  
(1492858, 5)


In [22]:
places_data_map = dict()
for row in places_df.itertuples():
    venueId = 1
    lat = 2
    long = 3
    if row[venueId] not in places_data_map:
        places_data_map[row[venueId]] = (row[lat], row[long])

In [23]:
data_map = dict()
for row in filtered_df.itertuples():
    persistentid = 1
    venueId = 2
    utc_date = 3
    utc_hour = 4
    data = (row[utc_date], row[utc_hour], row[venueId])
    if row[persistentid] not in data_map:
        data_map[row[persistentid]] = []
    data_map[row[persistentid]].append(data)

In [24]:
for device in data_map:
    data_map[device].sort(key=lambda x: x[1])
    data_map[device].sort(key=lambda x: x[0])

In [25]:
c = 0
for device in data_map:
    c += len(data_map[device])

In [26]:
import math
def distance_between_coordinates(lat1, long1, lat2, long2):
    earth_radius = 6371
    lat1 = math.radians(lat1)
    long1 = math.radians(long1)
    lat2 = math.radians(lat2)
    long2 = math.radians(long2)

    # dist = math.acos(math.sin(lat1)*math.sin(lat2)+math.cos(lat1)*math.cos(lat2)*math.cos(long2-long1)) * earth_radius
    # print(dist)

    # Differences in coordinates
    dlat = lat2 - lat1
    dlon = long2 - long1

    # Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = earth_radius * c

    return distance

In [27]:
feature_map = dict()
for device in data_map:
    feature_map[device] = dict()
    avg_locations_per_day = len(data_map[device]) / 62
    feature_map[device]["avg_locations_per_day"] = avg_locations_per_day

    avg_distance_per_day = 0
    distance = 0
    places = data_map[device]
    if len(places) > 1:
        for i in range(1, len(places)):
            p1 = places[i - 1][2]
            p2 = places[i][2]
            lat1 = places_data_map[p1][0]
            long1 = places_data_map[p1][1]
            lat2 = places_data_map[p2][0]
            long2 = places_data_map[p2][1]
            distance += distance_between_coordinates(lat1, long1, lat2, long2)
    avg_distance_per_day = distance / 62
    feature_map[device]["avg_distance_per_day"] = avg_distance_per_day

print(feature_map)

{'5903d26cdcecbd13590c8fe594de785f19b16004e19156c6ec0422816251efd0': {'avg_locations_per_day': 0.46774193548387094, 'avg_distance_per_day': 1.4298607432545232}, 'cc371dcb888f9ec0ab9a4ecabc4d49e51288313fb17a47a0d9ba5d8fceb0f2e0': {'avg_locations_per_day': 0.1935483870967742, 'avg_distance_per_day': 0.9527355027825547}, '21e09ed692d56697e3c26b777a53a411cd21bed5527c5b55e76164556acdf337': {'avg_locations_per_day': 3.1129032258064515, 'avg_distance_per_day': 20.704075237185844}, '63164c43b459b4260a338979948ead113855ddaafb9d521c3e613bcbce771a49': {'avg_locations_per_day': 2.7419354838709675, 'avg_distance_per_day': 16.069862520719745}, 'ad9adea8e7d63428e9372e0d670244e5033d4d2988e5546f8801692bf9d40646': {'avg_locations_per_day': 3.338709677419355, 'avg_distance_per_day': 22.481026675206778}, '8543abe51c64e59c9c6848bbcfa557f435f81494812ce9902ef793310f546bc7': {'avg_locations_per_day': 2.9838709677419355, 'avg_distance_per_day': 8.629778124040323}, '1b7981f70fab5732bacc103b7f9fce44a18ece3d03bb9

In [28]:
import pickle
with open('node_travel_data.pkl', 'wb') as fp:
    pickle.dump(feature_map, fp)
    print('dictionary saved successfully to file')


dictionary saved successfully to file
