In [1]:
import sqlite3
import pandas as pd
import collections
import networkx as nx
import json

In [2]:
cnx = sqlite3.connect('../corona-sniffer/backend/data/database_3000.db')
df_agents = pd.read_sql_query("SELECT * FROM agents", cnx)
df_walker_contacts = pd.read_sql_query("SELECT * FROM walkers INNER JOIN contacts", cnx)

In [3]:
def trans_to_list(df_walker_contacts):
    real_ids = set(df_walker_contacts['real_id'].tolist())
    agent_id_tracks = []
    for real_id in real_ids:
        agent_id_track = df_walker_contacts.loc[df_walker_contacts['real_id'] == real_id].sort_values('time')['agent_id'].tolist()
        agent_id_tracks.append(agent_id_track)
    return agent_id_tracks

In [4]:
def get_agent_next_prob(agent_id_tracks, df_agents):
    aa_counts = collections.defaultdict(lambda: collections.Counter())
    agent_counts = collections.Counter()
    agents = set(df_agents['id'].tolist())

    for track in agent_id_tracks:
        pre_agent = track[0]
        agent_counts[pre_agent] += 1
        
        for agent in track[1:]:
            aa_counts[pre_agent][agent] += 1
            agent_counts[agent] += 1
            pre_agent = agent

#     V = sum(agent_counts.values())
#     for prev_agent in agents:
#         T = len(aa_counts[prev_agent].keys())
#         Z = V - T
#         agent_counts[prev_agent] += T
#         for agent in agents:
#             if agent not in aa_counts[prev_agent]:
#                 aa_counts[prev_agent][agent] = T / Z

    file = open('model.txt', 'w')
    file.write(json.dumps({
        'aa_counts': aa_counts,
        'agent_counts': agent_counts
    }, indent=2, sort_keys=True))
    file.close()
    
    return aa_counts, agent_counts

In [5]:
agent_id_tracks = trans_to_list(df_walker_contacts)

In [6]:
aa_counts, agent_counts = get_agent_next_prob(agent_id_tracks, df_agents)

In [8]:
cnx = sqlite3.connect('../corona-sniffer/backend/data/database_3000_test.db')

df_walks2 = pd.read_sql_query("SELECT * FROM walks JOIN walkers ON walkers.id = walks.walker_id", cnx)
df_walkers2 = pd.read_sql_query("SELECT * FROM walkers", cnx)
df_contacts2 = pd.read_sql_query("SELECT * FROM contacts", cnx)
df_agents2 = pd.read_sql_query("SELECT * FROM agents", cnx)


In [14]:
def algo(df_contacts, aa_counts, agent_counts):
    df_contacts = df_contacts.sort_values('time')
    N = len(set(df_contacts['agent_id'].tolist()))

    prev_time, prev_agent = None, None
    visited = set()
    groups = []
    
    for i, row in df_contacts.iterrows():
        prev_agent = row['agent_id']
        x, y = json.loads(row['json'])['agentPos']['x'], json.loads(row['json'])['agentPos']['y']
        walker_id = row['walker_id']
        if row['walker_id'] in visited:
            continue
            
        def getDist(col):
            return pd.Series(
                [(float(json.loads(c)['agentPos']['x']) - x) ** 2 + 
                 (float(json.loads(c)['agentPos']['x']) - y) ** 2 for c in col])

        df_considered = df_contacts.loc[(df_contacts['walker_id'] != walker_id) & df_contacts['time'] == row['time']]

        N = 3 #df_considered.shape[0]
        prob = [[None] * (N+1) for _ in range(N+1)]
        bp = [[None] * (N+1) for _ in range(N+1)]

        for t in range(1, N+1):
            s = 0
            for _, row in df_considered.iterrows():
                time, tag, walker_id = row['time'], row['agent_id'], row['walker_id']
                curr_max = -float('inf')
                curr_max_idx = 0
                
                prev_s = 0
                for _, prev_row in df_contacts.loc[(df_contacts['walker_id'] != walker_id) & df_contacts['time'] == row['time'] + 60].iterrows():
                    prev_time, prev_tag, prev_walker_id = row['time'], row['agent_id'], row['walker_id']
                    if int(time) - int(prev_time) > 120:
                        continue
                    if prev_tag == tag:
                        curr_max = 1
                        curr_max_idx = prev_s
                        break
                    tmp = (prob[t - 1][prev_s] if prob[t - 1][prev_s] else 0) \
                        * ((aa_counts[prev_tag][tag] / agent_counts[prev_tag]) if agent_counts[prev_tag] != 0 else 1e-10)
                    if tmp > curr_max:
                        curr_max = tmp
                        curr_max_idx = prev_s
                    prev_s += 1
                prob[t][s] = curr_max
                bp[t][s] = curr_max_idx
                s += 1

        prev_bp = curr_max_idx
        group = set()
        for i, bp in enumerate(reversed(bp)):
            idx = N - i - 1
#             print(prev_bp)
            group.add(df_considered.iloc[prev_bp]['walker_id'])
            visited.add(df_considered.iloc[prev_bp]['walker_id'])
            prev_bp = bp[prev_bp]
        groups.append(group)
#         print(groups)
        # agent_max_prob = max(prob[prev_agent].items(), key=operator.itemgetter(1))[0]
        # prev_time, prev_agent = row['time'], row['agent_id']

    return groups

In [15]:
def df_algo(df_contacts, aa_counts, agent_counts):
    df_contacts = df_contacts.sort_values('time')
    N = len(set(df_contacts['agent_id'].tolist()))

    prev_time, prev_agent = None, None
    visited = set()
    groups = []
    
    for i, row in df_contacts.iloc[:10].iterrows():
        prev_agent = row['agent_id']
        x, y = json.loads(row['json'])['agentPos']['x'], json.loads(row['json'])['agentPos']['y']
        if row['walker_id'] in visited:
            continue
            
        def getDist(col):
            return pd.Series(
                [(float(json.loads(c)['agentPos']['x']) - x) ** 2 + 
                 (float(json.loads(c)['agentPos']['x']) - y) ** 2 for c in col])

        N = 10 # df_considered.shape[0]
        prob = [[None] * (N+1) for _ in range(N+1)]
        bp = [[None] * (N+1) for _ in range(N+1)]

        prev_time, prev_tag, walker_id = row['time'], row['agent_id'], row['walker_id']
        for t in range(1, N+1):
#             df_considered = df_contacts.iloc[(t+1) * 10:].sort_values(by='json', key=getDist)[:10]
            df_considered = df_contacts.loc[(df_contacts['walker_id'] != walker_id) & df_contacts['time'] == row['time']]

            s = 0
            prev_s = 0
            for _, row in df_considered.iterrows():
                time, tag, walker_id = row['time'], row['agent_id'], row['walker_id']
                curr_max = -float('inf')
                curr_max_idx = 0
                
                if prev_tag == tag:
                    curr_max = 1
                    curr_max_idx = prev_s
                    break
                tmp = (prob[t - 1][prev_s] if prob[t - 1][prev_s] else 0) \
                    * ((aa_counts[prev_tag][tag] / agent_counts[prev_tag]) if agent_counts[prev_tag] != 0 else 1e-10)
                if tmp > curr_max:
                    curr_max = tmp
                    curr_max_idx = prev_s
                prev_s += 1
                prob[t][s] = curr_max
                bp[t][s] = curr_max_idx
                s += 1

        prev_bp = curr_max_idx
        group = set()
        for i, bp in enumerate(reversed(bp)):
            idx = N - i - 1
#             print(prev_bp)
            group.add(df_considered.iloc[prev_bp]['walker_id'])
            visited.add(df_considered.iloc[prev_bp]['walker_id'])
            prev_bp = bp[prev_bp]
        groups.append(group)
#         print(groups)
        # agent_max_prob = max(prob[prev_agent].items(), key=operator.itemgetter(1))[0]
        # prev_time, prev_agent = row['time'], row['agent_id']

    return groups

In [16]:
groups = algo(df_contacts2, aa_counts, agent_counts)

  return op(a, b)


UnboundLocalError: local variable 'curr_max_idx' referenced before assignment

In [103]:
groups

[{'3566e29b6eb932217994'},
 {'2d11dcbeb5b27c552f59'},
 {'aa0a6997d5025aa5c718'},
 {'5841fb441de4fb9a0751'},
 {'3566e29b6eb932217994'},
 {'e17735259a24861f9a96'},
 {'7671ae501cd1b8efb7ce'},
 {'f7a7c53fcfb1bed5b604'},
 {'ba9958bf70055161b376'},
 {'5841fb441de4fb9a0751'}]

In [104]:
walker_id_to_real_dict = {}
walker_id_graph = nx.Graph([(i, j) for group in groups for i, j in zip(list(group)[:-1], list(group)[1:])])
walker_id_cc = sorted(nx.connected_components(walker_id_graph), key=len, reverse=True)

In [105]:
walker_id_cc

[]

In [98]:
df_walks_connected = pd.DataFrame(df_walks2)
for i, component in enumerate(groups):
    new_id = i
    for c in component:
        df_walks_connected = df_walks_connected.replace(c, new_id)

In [99]:
real_ids = set(df_walks_connected['real_id'].tolist())
correct, total, tried = 0, 0, 0
for real_id in real_ids:
    id_list = df_walks_connected.loc[df_walks_connected['real_id'] == real_id].sort_values('time')['walker_id'].tolist()
    old_id_list = df_walks2.loc[df_walks2['real_id'] == real_id].sort_values('time')['walker_id'].tolist()

    tried += sum([0 if id == old_id else 1 for (id, old_id) in zip(id_list, old_id_list)])
    
    dic = collections.Counter(id_list)
    correct += max(dic.values())
    total += len(id_list)

In [100]:
cnx.execute('DROP TABLE IF EXISTS walks_attached')
df_walks_connected.to_sql('walks_attached', con=cnx)

print('correct/tried: {}/{} {}%'.format(correct, tried, correct/tried*100))
print('correct/total: {}/{} {}%'.format(correct, total, correct/total*100))
print('baseline: {}/{} {}%'.format(len(real_ids) * 2, total, len(real_ids) * 2 / total * 100))

correct/tried: 47/140 33.57142857142857%
correct/total: 47/275 17.09090909090909%
baseline: 20/275 7.2727272727272725%
