In [1]:
%load_ext autoreload
%autoreload 2
import sqlite3
import operator
import pandas as pd
import collections
import networkx as nx
import json
import math
import numpy as np
from multiprocessing import Pool
import time
utils = __import__('utils.utils', fromlist=['object'])
train = __import__('utils.train', fromlist=['object'])

In [2]:
cnx = sqlite3.connect('../corona-sniffer/backend/data/database_100-_test.db')

df_walks = pd.read_sql_query("SELECT * FROM walks JOIN walkers ON walkers.id = walks.walker_id", cnx)
df_walkers = pd.read_sql_query("SELECT * FROM walkers", cnx)
df_agents = pd.read_sql_query("SELECT * FROM agents", cnx)
df_contacts_tmp = pd.read_sql_query("SELECT * FROM contacts", cnx)
idx = df_contacts_tmp.groupby(['walker_id', 'time'])['distance'].transform(max) == df_contacts_tmp['distance']
df_contacts = df_contacts_tmp[idx]

INTERVAL = 60
TIME_PERIOD = 1
SCALE_METERS = 15000
scaleMeters = SCALE_METERS * 0.2
SIGMA_V = scaleMeters / 100
IMPOSSIBLE = 1e-10
MINUTE = 10
UPDATE_TIME = MINUTE * INTERVAL

In [3]:
id_to_pos, pos_to_id = utils.get_position(df_agents)

In [4]:
print(df_contacts_tmp.shape[0], df_contacts.shape[0])

53752 15400


In [5]:
prob_agent_id, prob_dir, prob_move = utils.get_agent_next_prob(df_walkers, df_contacts)
prob_pos = utils.map_prob_to_pos(prob_agent_id, id_to_pos)

In [6]:
"""
# agent_id -> next_agent_id
sparse data
"""
# print(collections.Counter([i for dic in prob_agent_id.values() for _, i in dic.items()]))

'\n# agent_id -> next_agent_id\nsparse data\n'

In [7]:
len(set(df_contacts['walker_id']))

1927

In [8]:
DEBUG = False
"""
Map agent id to its most possible next agent.
"""
link_list = {}
visited_walker_ids = set()

def train(walker_id1):
    posX, posY, negX, negY = [], [], [], []
    time1, time2 = utils.get_last_2_time(df_contacts, walker_id1)

    if not time1 or not time2:
        return #continue
    
    start_time = time.time()
    # in case there're multiple agents tracking the walker
    last_rows = df_contacts.loc[(df_contacts['walker_id'] == walker_id1) & (df_contacts['time'] == time1)]
    pre_rows = df_contacts.loc[(df_contacts['walker_id'] == walker_id1) & (df_contacts['time'] == time2)]
    if last_rows.shape[0] == 0 or pre_rows.shape[0] == 0:
        return #continue
        
    if DEBUG: print('walker_id1: {}, # last_rows: {}, # pre_rows: {}'.format(walker_id1, last_rows.shape, pre_rows.shape))
        
    # among all the tracked records, consider mean velocity as real velocity
    vx, vy = utils.get_mean_v(last_rows, pre_rows)
    avg_dis = math.sqrt(vx ** 2 + vy ** 2)
    if DEBUG: print('velocity: ', time.time() - start_time)
    
    start_time = time.time()
    # get candidate points by time
    candidates = df_contacts.loc[(pd.to_numeric(df_contacts["time"]) < (int(time1) + 100)) & 
                                 (pd.to_numeric(df_contacts["time"]) > (int(time1))) & 
                                 (df_contacts['walker_id'] != walker_id1)]
#     next_time = candidates['time'].min()
#     candidates = candidates.loc[candidates['time'] == next_time]
    # idx = groupby(['walker_id'])['time'].transform(lambda x: pd.to_numeric(x).min()) == pd.to_numeric(candidates['time'])
    # candidates = candidates[idx]
    if DEBUG: print(f'{candidates.shape}: ', time.time() - start_time)
    
    if candidates.shape[0] == 0:
        return #continue
    
    candidate_agent_probs = {}
    for _, row1 in last_rows.iterrows():
        start_time = time.time()
        
        agent_id1, time1 = row1['agent_id'], row1['time']
        json1 = json.loads(row1['json'])['agentPos']
    
        for _, row2 in candidates.iterrows():
            res = ''
            agent_id2, walker_id2, time2 = row2['agent_id'], row2['walker_id'], row2['time']
            json2 = json.loads(row2['json'])['agentPos']
            
            isPos, thisX = True, []
            if df_walks.loc[df_walks['walker_id'] == walker_id1].iloc[0]['real_id'] == df_walks.loc[df_walks['walker_id'] == walker_id2].iloc[0]['real_id']:
                isPos = True
            else:
                isPos = False
            
            new_prob = 0
            
            # direct
            this_prob = utils.get_direct_prob(prob_agent_id, agent_id1, agent_id2)
            new_prob += this_prob
            res += 'direct: ' + str(this_prob)
            thisX.append(this_prob)

            # distance
            timec1, timec2 = utils.get_last_2_time(df_contacts, walker_id2)
            if not timec1 or not timec2:
                vx2, vy2 = vx, vy
            else:
                last_rows2 = df_contacts.loc[(df_contacts['walker_id'] == walker_id2) & (df_contacts['time'] == timec1)]
                pre_rows2 = df_contacts.loc[(df_contacts['walker_id'] == walker_id2) & (df_contacts['time'] == timec2)]
                if last_rows2.shape[0] == 0 or pre_rows2.shape[0] == 0:
                    vx2, vy2 = vx, vy
                else:
                    vx2, vy2 = utils.get_mean_v(last_rows2, pre_rows2)
            avg_dis2 = math.sqrt(vx2 ** 2 + vy2 ** 2)
            
            this_prob = utils.get_dis_prob(json1, json2, (vx+vx2)/2, (vy+vy2)/2, prob_move[agent_id1] if agent_id1 in prob_move else None,
                                           avg_dis=(avg_dis+avg_dis2)/2, time=int(row2['time']) - int(row1['time']))
            if this_prob <= 1e-10:
                continue
            new_prob += this_prob
            thisX.append(this_prob)
            res += ', distance: ' + str(this_prob)

            # direction
            this_prob = utils.get_direction_prob(json1, json2, prob_dir, agent_id1)
            thisX.append(this_prob)
            new_prob += this_prob
            res += ', directions: ' + str(this_prob)
                
            if DEBUG: print(res)
                
            if isPos:
                posX.append(thisX)
                posY.append(1)
            else:
                negX.append(thisX)
                negY.append(0)
        if DEBUG: print(f'candidates', time.time() - start_time)
            
        return posX, posY, negX, negY
#     if i % 100 == 99: print(f'{i+1}th/{len(walker_ids)} iter: {time.time() - start_time}')

In [9]:
p = Pool(10)
walker_ids = set(df_contacts['walker_id'].tolist())
print(len(walker_ids))

1927


Process ForkPoolWorker-1:
Process ForkPoolWorker-6:
Process ForkPoolWorker-4:
Process ForkPoolWorker-10:
Process ForkPoolWorker-8:
Process ForkPoolWorker-7:
Process ForkPoolWorker-9:
Process ForkPoolWorker-2:
Process ForkPoolWorker-3:
Process ForkPoolWorker-5:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297

KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt


In [10]:
results = p.map(train, walker_ids)

In [11]:
posX, posY, negX, negY = [], [], [], []
for tmp in results:
    if tmp is None:
        continue
    tposX, tposY, tnegX, tnegY = tmp
    posX += tposX
    posY += tposY
    negX += tnegX
    negY += tnegY

In [12]:
print(np.array(posY).shape, np.array(negY).shape)

(917,) (6056,)


In [13]:
# choices = np.random.choice(np.array(posY).shape[0], np.array(negY).shape[0], replace=False)
# choices = np.random.choice(np.array(negY).shape[0], np.array(posY).shape[0], replace=False)

In [14]:
# npX = np.append(np.array(negX), np.take(np.array(posX), choices, 0), axis=0)
# npY = np.append(np.array(negY), np.take(np.array(posY), choices, 0))

# npX = np.append(np.array(posX), np.take(np.array(negX), choices, 0), axis=0)
# npY = np.append(np.array(posY), np.take(np.array(negY), choices, 0))

npX = np.append(np.array(posX), np.array(negX), axis=0)
npY = np.append(np.array(posY), np.array(negY))

In [15]:
np.set_printoptions(threshold=np.inf)

In [16]:
# with open('X.txt', 'w') as f:
#     for item in npX:
#         f.write("%s\n" % item)
# with open('Y.txt', 'w') as f:
#     for item in npY:
#         f.write("%s\n" % item)

In [17]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, max_iter=500).fit(npX, npY)
clf.predict(npX[:2, :])
clf.predict_proba(npX[:2, :])
clf.score(npX, npY)

0.8684927577800086

## Predicating

In [140]:
cnx = sqlite3.connect('../corona-sniffer/backend/data/database_10_train.db')

df_walks = pd.read_sql_query("SELECT * FROM walks JOIN walkers ON walkers.id = walks.walker_id", cnx)
df_walkers = pd.read_sql_query("SELECT * FROM walkers", cnx)
df_agents = pd.read_sql_query("SELECT * FROM agents", cnx)
df_contacts_tmp = pd.read_sql_query("SELECT * FROM contacts", cnx)
idx = df_contacts_tmp.groupby(['walker_id', 'time'])['distance'].transform(min) == df_contacts_tmp['distance']
df_contacts = df_contacts_tmp[idx]

In [141]:
"""
Get counter of agent_id Key -> agent_id Values
"""
id_to_pos, pos_to_id = utils.get_position(df_agents)
prob_agent_id = utils.map_prob_to_agent_id(prob_pos, pos_to_id)

In [142]:
"""
# agent_id -> next_agent_id
sparse data
"""
print(collections.Counter([i for dic in prob_agent_id.values() for _, i in dic.items()]))

Counter({1: 1590, 2: 595, 3: 301, 4: 180, 5: 117, 6: 96, 7: 68, 8: 56, 10: 27, 11: 25, 12: 25, 9: 25, 13: 24, 14: 23, 16: 11, 18: 11, 15: 10, 20: 9, 22: 8, 17: 8, 29: 7, 19: 6, 25: 6, 27: 4, 51: 4, 39: 4, 26: 3, 37: 3, 21: 3, 32: 3, 23: 3, 62: 2, 34: 2, 41: 2, 30: 2, 24: 2, 28: 2, 40: 2, 42: 2, 46: 2, 290: 1, 52: 1, 100: 1, 31: 1, 38: 1, 162: 1, 33: 1, 54: 1, 56: 1, 379: 1, 74: 1, 131: 1, 82: 1, 36: 1, 91: 1, 48: 1, 65: 1, 324: 1, 68: 1, 70: 1, 205: 1, 336: 1, 50: 1, 59: 1, 166: 1})


In [143]:
DEBUG = False
"""
Map agent id to its most possible next agent.
"""
link_list = {}
all_times = sorted(list(int(time) for time in set(df_contacts['time'])))
first_batch_times = set()

for time in all_times:
    if time < all_times[0] + UPDATE_TIME:
        first_batch_times.add(str(time))
        
print(len(first_batch_times), first_batch_times)
print(len(set(df_contacts['walker_id'])))

78 {'1614181', '1614415', '1614572', '1614328', '1614045', '1614201', '1614364', '1614184', '1614381', '1614032', '1614028', '1614568', '1614405', '1614100', '1614340', '1614361', '1614465', '1614441', '1614175', '1614268', '1614235', '1614115', '1614541', '1614585', '1614165', '1614448', '1614604', '1614345', '1614561', '1614595', '1614088', '1614241', '1614484', '1614481', '1614021', '1614148', '1614040', '1614392', '1614601', '1614124', '1614220', '1614388', '1614160', '1614295', '1614152', '1614520', '1614064', '1614321', '1614212', '1614285', '1614301', '1614332', '1614544', '1614092', '1614452', '1614535', '1614580', '1614055', '1614061', '1614355', '1614280', '1614460', '1614261', '1614208', '1614272', '1614121', '1614508', '1614244', '1614304', '1614105', '1614525', '1614141', '1614475', '1614501', '1614400', '1614081', '1614424', '1614225'}
209


In [144]:
walker_ids = set(df_contacts['walker_id'].tolist())

# for walker_id in walker_ids:
#     last_time = df_contacts.loc[df_contacts['walker_id'] == walker_id].sort_values('time').iloc[-1]['time']
#     candidates = df_contacts.loc[(df_contacts['walker_id'] != walker_id) &
#                                  (df_contacts['time'] == str(int(last_time) + INTERVAL))]
#     if candidates.shape[0] == 1:
#         link_list[walker_id] = candidates.iloc[-1]['walker_id']

In [145]:
# """
# Connect each linked path into a single path
# """
# graph = nx.Graph([(i, j) for i, j in link_list.items() if i and j])
# connected_components = sorted(nx.connected_components(graph), key=len, reverse=True)
# print(len(connected_components))
# # print(connected_components)

# tried = 0
# for i, component in enumerate(connected_components):
#     new_id = i
#     time_set = set()
#     for c in component:
#         rows = df_contacts.loc[df_contacts['walker_id'] == c]
            
#         tried += df_contacts.loc[df_contacts['walker_id'] == c].shape[0]
#         df_contacts = df_contacts.replace({'walker_id': {c: str(new_id) + 'tmp'}})
        

In [146]:
len(set(df_contacts['walker_id']))

209

In [147]:
first_batch_walkers = set(df_contacts.loc[df_contacts['time'].isin(first_batch_times)]['walker_id'])
print(len(first_batch_walkers), len(set(df_contacts['walker_id'])))

17 209


In [148]:
walker_ids = set(df_contacts['walker_id'])
batched_walker_ids = collections.defaultdict(list)
for walker_id in walker_ids:
    time = df_contacts.loc[df_contacts['walker_id'] == walker_id].sort_values('time').iloc[-1]['time']
    batched_walker_ids[math.floor((int(time) - all_times[0]) / UPDATE_TIME)].append(walker_id)
print(dict(batched_walker_ids))

{4: ['phTBKLHHZzkknnkiHNMw', 'IxwuxUWvLxxPnMsHDjun', 'ZGdSoHhdAyVULXBvEyHo', 'fpxrXCdUYvwjUwNHSdpw', 'XCvxVRpVosAILleFRuTv', 'KkwyMgvUoQGYINfrNCDZ', 'jvkwsVpbHAsIyRxikNzn', 'YwZHZWpcYkZxvssPzRDS', 'jlSOedOEHoCnONanEnxC'], 9: ['jkryylTzRzSlVxALfacJ', 'KqfjlLhGZjASYFfyhtXg', 'UYggihPiaxgKPvQHVTHE', 'EnCpKcWGoROedDPWGKQr', 'ZkoYLnMsCrXjDJYTlktd', 'AYNPmlhRlZxiVLholYNb', 'WRZiTzEejNxvLXIbjzkW', 'FVgZViNgLYTDVvUrysoY', 'bpVRPeeGLkacRMwJvaBE', 'MarLOZuMNPZNeWjQZKFP'], 14: ['ZBguRLghsVPchXDHcADz', 'rlZaUQmfRBeSByyaSGjH', 'UxiTPXkyxqbrCYxYVEDt', 'SWrWgRegRqCsUqDgvnTS', 'SamagrasQvAfSmGSFSbf', 'KoPBPtFGBJwBwzKIYvCI'], 73: ['VYRjUkcdvFwhpzMydqMi'], 30: ['xgqLFkZXrIcRpYTSkWYj', 'yMdYZqsYCwfCnxzoFUgm', 'nBNhNjGqdpZFYdeyduZl'], 1: ['qwMelPSUlliiuhRkcikA', 'ftbVlmkQUiqwZyiLlZzx', 'bySKJtOuDLDWUhNDaYEb', 'mzRGkkqMnxCokMEiokKs', 'atoURtvMuXxbLkAuxlUw', 'AdcMjFQsYiLoncrIjatQ', 'SyXdJDtUvZEuzyrtBaHq', 'dYskcEAZiAmwkafmHVat', 'KNoBiJnComEYYbFxiUwh', 'aTUqKCesJkeDxnrMCTXv', 'cONyhZNLiCzTnOEJKjWp'], 10: ['

In [149]:
DEBUG = False
"""
Map agent id to its most possible next agent.
"""
batched_walker_ids = [first_batch_walkers]

for walker_ids in batched_walker_ids:
    print(walker_ids)
    next_batch_ids = set()
    for walker_id1 in walker_ids:
        time1, time2 = utils.get_last_2_time(df_contacts, walker_id1)
        if not time1 or not time:
            continue
        last_rows = df_contacts.loc[(df_contacts['walker_id'] == walker_id1) & (df_contacts['time'] == time1)]
        pre_rows = df_contacts.loc[(df_contacts['walker_id'] == walker_id1) & (df_contacts['time'] == time2)]
        if last_rows.shape[0] == 0 or pre_rows.shape[0] == 0:
            continue
        
        # among all the tracked records, consider mean velocity as real velocity
        vx, vy = utils.get_mean_v(last_rows, pre_rows)
        avg_dis = math.sqrt(vx ** 2 + vy ** 2)

        # get candidate points by time
        candidates = df_contacts.loc[(pd.to_numeric(df_contacts["time"]) == (int(time1) + INTERVAL))
                                     & (pd.to_numeric(df_contacts["time"]) > (int(time1))) 
                                     & (df_contacts['walker_id'] != walker_id1)]
#         idx = candidates.groupby(['walker_id'])['time'].transform(lambda x: pd.to_numeric(x).min()) == pd.to_numeric(candidates['time'])
#         candidates = candidates[idx]

        if candidates.shape[0] == 0:
            print('no candidates')
            continue
            
        if candidates.shape[0] == 1:
            link_list[walker_id] = candidates.iloc[-1]['walker_id']

        candidate_agent_probs = {}
        
        for _, row1 in last_rows.iterrows():
            agent_id1, time1 = row1['agent_id'], row1['time']
            json1 = json.loads(row1['json'])['agentPos']

            for _, row2 in candidates.iterrows():
                agent_id2, walker_id2 = row2['agent_id'], row2['walker_id']
                json2 = json.loads(row2['json'])['agentPos']

                prob = utils.get_prob(df_contacts, prob_agent_id, agent_id1, agent_id2, walker_id2, 
                                      json1, json2, prob_dir, vx, vy, prob_move, int(row2['time']) - int(row1['time']))
                next_batch_ids.add(walker_id2)
                # a walker's probability to be the next step is the average probability
                weight = json.loads(row2['json'])['distance']
                prob_y = clf.predict_proba(np.array([prob])[:1, :])[0][1]
                candidate_agent_probs[walker_id2] = utils.get_avg_prob(candidate_agent_probs, walker_id2, prob_y, weight)

#         print(walker_id1, candidate_agent_probs)
        
        for k in candidate_agent_probs.keys():
            if k in visited_walker_ids:
                candidate_agent_probs[k] = (0,0)
        correct_walker_id = max(candidate_agent_probs.items(), key=operator.itemgetter(1))[0] if candidate_agent_probs.items() else 0
        if correct_walker_id in visited_walker_ids:
            continue
        link_list[walker_id1] = correct_walker_id
        visited_walker_ids.add(correct_walker_id)
    print(len(next_batch_ids), len(walker_ids))
    if len(next_batch_ids) == 0:
        break
    batched_walker_ids.append(next_batch_ids)
    

{'KNoBiJnComEYYbFxiUwh', 'mzRGkkqMnxCokMEiokKs', 'cBbbkHLTiCzgvtyJwVMA', 'AdcMjFQsYiLoncrIjatQ', 'XAmtKtWuFipRbQEgMTwa', 'TbZvwmHBYRwEtRMhmIvG', 'XkaNPsExXatPmwRGibbi', 'bySKJtOuDLDWUhNDaYEb', 'zukhKRVqSeAfhDVPAudi', 'nGjZkjrvjzhaZQrZswqT', 'sxhJXCuXpeAdzGzDxiuj', 'THwcumhArxyIJIngIOme', 'ftbVlmkQUiqwZyiLlZzx', 'atoURtvMuXxbLkAuxlUw', 'DvPiPxEjxWNArQPaYHHr', 'cONyhZNLiCzTnOEJKjWp', 'oxwOZTolpZJLzhvYNKcJ'}
no candidates
14 17
{'KNoBiJnComEYYbFxiUwh', 'ZpgLnYPoLHlHNSTEJFDB', 'ZaUxskGfuDBZzKpjoLfJ', 'AdcMjFQsYiLoncrIjatQ', 'wkaKuqWbSYAPzSCfrslo', 'bySKJtOuDLDWUhNDaYEb', 'dYskcEAZiAmwkafmHVat', 'sxhJXCuXpeAdzGzDxiuj', 'ZLbsBpjSlNTQGIUFsNCR', 'ftbVlmkQUiqwZyiLlZzx', 'atoURtvMuXxbLkAuxlUw', 'zCcrBcuxoWegiNRiAVAQ', 'cONyhZNLiCzTnOEJKjWp', 'SyXdJDtUvZEuzyrtBaHq'}
no candidates
no candidates
12 14
{'ZpgLnYPoLHlHNSTEJFDB', 'nBbxVxdPbljOkvIuFSuT', 'wkaKuqWbSYAPzSCfrslo', 'BJvoZJCMIctEraOtomep', 'LWlFJBJIvHpoFHkEgDzp', 'dYskcEAZiAmwkafmHVat', 'ZLbsBpjSlNTQGIUFsNCR', 'oYiWVbCesJkDCAPAikoP', 'zCcrBc

In [150]:
"""
Connect each linked path into a single path
"""
graph = nx.Graph([(i, j) for i, j in link_list.items() if i and j])
connected_components = sorted(nx.connected_components(graph), key=len, reverse=True)
print(len(connected_components))
print(connected_components)

tried = 0
df_walks_connected = pd.DataFrame(df_walks)
for i, component in enumerate(connected_components):
    new_id = i
    time_set = set()
    for c in component:
        rows = df_walks_connected.loc[df_walks_connected['walker_id'] == c]
        
#         cyclic = False
#         for time in rows['walk_time'].tolist():
#             time_set.add(time)
            
        tried += df_walks_connected.loc[df_walks_connected['walker_id'] == c].shape[0]
        df_walks_connected = df_walks_connected.replace({'walker_id': {c: str(new_id) + 'res'}})
        

8
[{'KNoBiJnComEYYbFxiUwh', 'VYgdSVoYsBjNxuTdLlWe', 'PgyXjrdGCCZnvOWNkHYN', 'BJvoZJCMIctEraOtomep', 'JEcMdlINaHKWsVkfawYG', 'pujTDcQkpLHjcVAkroee', 'yKEKlDmMpGaDqvtSSoQN', 'ZLbsBpjSlNTQGIUFsNCR', 'DvPiPxEjxWNArQPaYHHr', 'ZkoYLnMsCrXjDJYTlktd', 'oxwOZTolpZJLzhvYNKcJ', 'jlSOedOEHoCnONanEnxC', 'jbRstbKJPIpyMULOYAzO', 'XIalxIgLlEVemwZaLsLp'}, {'nBbxVxdPbljOkvIuFSuT', 'qnvoydcQxMIQtczMaYgI', 'bySKJtOuDLDWUhNDaYEb', 'nGjZkjrvjzhaZQrZswqT', 'BiOefNUxnAfTahPBstVC', 'WRZiTzEejNxvLXIbjzkW', 'FOpMLKbcXQSvQOnOuuWi', 'YwZHZWpcYkZxvssPzRDS', 'xQwtTdsXLqFznmeGnWGl', 'zCcrBcuxoWegiNRiAVAQ', 'dBzEkRihatfNOzgWQmAc'}, {'cBbbkHLTiCzgvtyJwVMA', 'wkaKuqWbSYAPzSCfrslo', 'CFSEzzuuvFDUNzTFSmLg', 'XCvxVRpVosAILleFRuTv', 'DYnlrSoDLqwXgcZMBCfb', 'DjyzBkAqRtZGxEZIlqxn', 'atoURtvMuXxbLkAuxlUw', 'mhshGFdWOnWwhmQFwGqP', 'rRyzIyKLjpeKJTiCMdBV'}, {'XAmtKtWuFipRbQEgMTwa', 'aLDswQLDxvdbapdMucMV', 'oYiWVbCesJkDCAPAikoP', 'cONyhZNLiCzTnOEJKjWp', 'SyXdJDtUvZEuzyrtBaHq'}, {'LWlFJBJIvHpoFHkEgDzp', 'mzRGkkqMnxCokMEiokKs', 'ZaU

In [151]:
"""
How much percentage of the grouped ids are really from one group
"""

DEBUG = True
walker_ids = set(df_walks_connected['walker_id'].tolist())
correct, total = 0, 0
if DEBUG: print(len(walker_ids))
    
for walker_id in walker_ids:
    id_list = df_walks_connected.loc[df_walks_connected['walker_id'] == walker_id].sort_values('walk_time')['real_id'].tolist()
    dic = collections.Counter(id_list)
    
    l = max(dic.values())
    key = [k for k,v in dic.items() if v == l]
    
    correct += l
    total += sum(dic.values())

# print(f'correct/tried: {correct}/{tried} {correct/tried*100}%') 
print(f'correct/total: {correct}/{total} {correct/total*100}%')

195
correct/total: 1755/1771 99.09655561829474%


In [152]:
"""
How much percentage of one group is correctly grouped together (only consider the largest sub-group of grouped ids)
"""

DEBUG = False
real_ids = set(df_walks_connected['real_id'].tolist())
correct, total = 0, 0
for real_id in real_ids:
    id_list = df_walks_connected.loc[df_walks_connected['real_id'] == real_id].sort_values('walk_time')['walker_id'].tolist()
    dic = collections.Counter(id_list)
    if DEBUG: print(id_list)
    
    l = max(dic.values())
    key = [k for k,v in dic.items() if v == l]
    
    correct += l
    total += sum([v for v in dic.values()])
#     df_walks_connected = df_walks_connected.replace({'walker_id': {key[0]: real_id}})
# print(f'base: {total - (150 / 10 - 1) * 10}/{total} {(total - (150 / 10 - 1) * 10)/total*100}')
# print(f'correct/tried: {correct}/{tried} {correct/tried*100}%') 
print(f'correct/total: {correct}/{total} {correct/total*100}%')

correct/total: 441/1771 24.90118577075099%


In [153]:
cnx.execute('DROP TABLE IF EXISTS walks_attached')
df_walks_connected.to_sql('walks_attached', con=cnx)

## Compute Other Info

In [32]:
id = '1tmp'
path = df_walks_connected.loc[df_walks_connected['walker_id'] == id].sort_values('walk_time')

In [33]:
for (indx1,row1),(indx2,row2) in zip(path[:-1].iterrows(),path[1:].iterrows()):
    jsondict1, jsondict2 = json.loads(row1['json']), json.loads(row2['json'])
    agent_id1, agent_id2 = pos_to_id[(jsondict1['position']['x'], jsondict1['position']['y'])], 
        pos_to_id[(jsondict2['position']['x'], jsondict2['position']['y'])]
    

IndentationError: unexpected indent (<ipython-input-33-e4722ff0a3eb>, line 4)