# Import Modules

In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

In [80]:
# Import the data
df = pd.read_csv('../RVF_ATX_PID_HZ-2020-07.tsv', sep='\t')
# filter out certain columns
df = df.drop(columns=['local_date', 'local_hour', 'gender', 'age', 'full_panel_reweighted_sag_score', 'home_zip'])
df.head()

Unnamed: 0,persistentid,venueid,utc_date,utc_hour,dwell
0,5903d26cdcecbd13590c8fe594de785f19b16004e19156...,40b52f80f964a52051001fe3,2020-07-18,22,60
1,cc371dcb888f9ec0ab9a4ecabc4d49e51288313fb17a47...,40b52f80f964a52051001fe3,2020-07-18,22,1011
2,21e09ed692d56697e3c26b777a53a411cd21bed5527c5b...,40b52f80f964a52051001fe3,2020-07-25,0,1058
3,e170f93db0ea4930ea2c0d2167feddb1b4fe2b5412d826...,40b52f80f964a52051001fe3,2020-07-25,18,30
4,63164c43b459b4260a338979948ead113855ddaafb9d52...,40b52f80f964a52051001fe3,2020-07-29,18,763


In [81]:
# filter out rows that are not within a range

# Define the date range
start_date = pd.to_datetime('2020-07-01')
end_date = pd.to_datetime('2020-07-05')
dwell_time_threshold = 60 # in minutes

# Filter rows within the date range
filtered_df = df[(pd.to_datetime(df['utc_date']) >= start_date) & (pd.to_datetime(df['utc_date']) <= end_date) & (df['dwell'] >= dwell_time_threshold)]
# add a new column to dataframe that represents minutes
filtered_df['formatted_utc_hour'] = filtered_df['utc_hour'].astype(int)
filtered_df['arrival_time'] = pd.to_datetime(filtered_df['utc_date']) + pd.to_timedelta(filtered_df['formatted_utc_hour'], unit='h')

filtered_df['departure_time'] = filtered_df['arrival_time'] + pd.to_timedelta(filtered_df['dwell'], unit='m')
print(filtered_df.head())

                                         persistentid  \
5   ad9adea8e7d63428e9372e0d670244e5033d4d2988e554...   
10  b8ea7697c88dc1f4a1f9b9bbe60d07c304ffc1539ee806...   
14  d7891bc5540d1352f5149f902922ffa923fe1d27e06be8...   
17  fccee91660f332ee4b571403661ecc3aed3d29fbc89792...   
18  ff8ff5da9fbb2a65d408a333fa1359616c899634f246d4...   

                     venueid    utc_date  utc_hour  dwell  formatted_utc_hour  \
5   40b52f80f964a52051001fe3  2020-07-03        20   9208                  20   
10  40b52f80f964a52051001fe3  2020-07-04         0    112                   0   
14  440fec14f964a520aa301fe3  2020-07-03        23   7085                  23   
17  506b1adae4b0bdc21bb2ab40  2020-07-02        14   2246                  14   
18  506b1adae4b0bdc21bb2ab40  2020-07-03        19   2289                  19   

          arrival_time      departure_time  
5  2020-07-03 20:00:00 2020-07-10 05:28:00  
10 2020-07-04 00:00:00 2020-07-04 01:52:00  
14 2020-07-03 23:00:00 2020-07-08 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['formatted_utc_hour'] = filtered_df['utc_hour'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['arrival_time'] = pd.to_datetime(filtered_df['utc_date']) + pd.to_timedelta(filtered_df['formatted_utc_hour'], unit='h')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [82]:
# add all devices as nodes to Graph
# get all devices
devices = filtered_df['persistentid'].unique()
G = nx.Graph()
for device in devices:
    # add device as a node to G
    G.add_node(device)
    

In [84]:
# sort the dataframe by arrival time
filtered_df = filtered_df.sort_values(by=['arrival_time'])
for i, row_i in filtered_df.iterrows():
    for j, row_j in filtered_df.iterrows():
        if i != j:
            # check if the two rows are within 30 minutes of each other
            first = (row_i['arrival_time'], row_i['departure_time'])
            second = (row_j['arrival_time'], row_j['departure_time'])
            # check if the two time intervals overlap, by atleast 60 minutes
            difference = second[0] - first[1]
            if difference.seconds >= 3600:
                # add edge between the two nodes
                G.add_edge(row_i['persistentid'], row_j['persistentid'])
nx.write_gpickle(G, "2020-07-01__to__2020-07-05.gpickle")

KeyboardInterrupt: 

In [None]:
nx.draw(G, with_labels=True)

In [78]:
# # connecting edges between every device if they were at the same location for a specified amount of dwell time
# groups = dict()
# # df['combined_data'] = df['venueid'].astype(str) + df['utc_date'].astype(str) + df['utc_hour'].astype(str)
# filtered_df['index'] = filtered_df.index
# grouped_df = filtered_df.groupby(['venueid', 'utc_date', 'utc_hour'])['index'].apply(list).reset_index(name='group')
# grouped_df.head()

In [77]:
# for e in grouped_df['group']:
#     if len(e) > 1:
#         for n1 in e:
#             for n2 in e:
#                 if n1 == n2: continue
#                 G.add_edge(n1, n2)

In [28]:
# nx.draw(G, with_labels=True)