In [None]:
import numpy as np
import cv2
import glob
from matplotlib import pyplot as plt
import os
import pandas as pd

import seaborn as sns
from mpl_toolkits import mplot3d
from shapely.geometry import Point, Polygon, MultiPoint, LineString
from scipy.spatial import ConvexHull, convex_hull_plot_2d
from itertools import combinations

from datetime import datetime, timezone, tzinfo
import geopandas as gpd
from tqdm import tqdm
import random
import networkx as nx
import itertools
from fiona.crs import from_epsg
from sklearn.cluster import DBSCAN
from matplotlib.font_manager import FontProperties
import networkx as nx

from networkx import edge_betweenness_centrality as betweenness
from networkx.algorithms.community.centrality import girvan_newman
import json

# Font object for figure text: serif family, Times New Roman face
font = FontProperties()
font.set_family('serif')
font.set_name('Times New Roman')
# show every column when a DataFrame is displayed
pd.options.display.max_columns = None

sns.set(style="white")
# NOTE(review): the palette returned here is discarded, so this line presumably does not
# change the active palette (sns.set_palette would) — confirm the intent
sns.color_palette("hls", 4)
sns.set_context("paper", rc={"font.size":10,"axes.titlesize":12,"axes.labelsize":12})
from shapely.geometry import Point
# CRS label kept as a notebook-wide constant
globalcrs = "EPSG:3857"

In [None]:
# Display name of each study location -> name of its result folder
result_folder_ls = {
    "Bryant Park":"bryan_park_1980",
    "Chestnut Street":"chestnut_street_1980",
    "Downtown Crossing":"downtown_crossing_1980",
    "MET":"met_1980"
}

# load result path
# Credentials come from the environment so that no secret is stored in the notebook.
# SECURITY: an API key used to be hard-coded on this line; it has been exposed to everyone
# who can read the notebook and should be revoked / rotated.
gcloudapi = os.environ.get("GCLOUD_API_KEY", "")
# Path of the Google service-account JSON file; set GOOGLE_SERVICE_ACCOUNT_JSON to override
# the machine-specific default below.
serviceaccount = os.environ.get(
    "GOOGLE_SERVICE_ACCOUNT_JSON",
    "/Users/yuan/Dropbox (Personal)/personal files/ssh/google_drive_personal.json",
)
import gspread
# from oauth2client.service_account import ServiceAccountCredentials
# gspread client shared by read_url()
gc = gspread.service_account(filename = serviceaccount)


def read_url(url, SHEET_NAME):
    """Load one tab of a Google Sheet through the module-level gspread client ``gc``.

    The spreadsheet key is taken as the sixth "/"-separated piece of ``url``.

    Returns:
        (DataFrame built from all records of the tab, the worksheet object)
    """
    sheet_key = url.split('/')[5]
    worksheet = gc.open_by_key(sheet_key).worksheet(SHEET_NAME)
    return pd.DataFrame(worksheet.get_all_records()), worksheet

# Google Sheet that is read below; the tab name is given by SHEETNAME
url = "https://docs.google.com/spreadsheets/d/1djLf9Uhh1zJpPBiSyjTnZ_EkkP1uZf2L8Rg8XWmXKlY/edit?usp=sharing"
SHEETNAME = "P1_historical_videos"
# video_path: the tab as a DataFrame (later cells read its video_id, pred_path and
# video_path columns); other_worksheet: the gspread worksheet handle
video_path, other_worksheet = read_url( url, SHEETNAME)

video_path

In [None]:
def generatecluster(df, dis, epsg):
    """
    Run DBSCAN separately on the points of every frame.

    Args:
        df: DataFrame with a 'frame_id' column and the coordinate columns
            f"x_{epsg}" and f"y_{epsg}".
        dis: DBSCAN ``eps`` (maximum distance between two samples), in the
            units of the coordinate columns.
        epsg: suffix that selects the coordinate columns.

    Returns:
        1-D integer array with one cluster label per row of ``df``, in the
        row order of ``df`` (-1 marks noise, i.e. a point with no neighbour).
        Labels are only meaningful within their own frame.
    """
    frame_ids = df['frame_id'].to_numpy()
    coords = df[[f"x_{epsg}", f"y_{epsg}"]].to_numpy()

    # Labels are written back by row position. Concatenating the per-frame
    # results instead would only line up with df when each frame's rows are
    # contiguous and frames appear in iteration order.
    allpred = np.full(len(df), -1, dtype=int)
    for f in tqdm(df['frame_id'].unique()):
        rows = np.flatnonzero(frame_ids == f)
        if rows.size == 0:
            # e.g. a NaN frame id never compares equal to itself
            continue

        # eps : maximum distance between two samples
        # human-to-human interaction distance reference:
        # https://en.wikipedia.org/wiki/Proxemics#:~:text=Hall%20described%20the%20interpersonal%20distances,and%20(4)%20public%20space.
        clustering = DBSCAN(eps = dis, min_samples = 2).fit(coords[rows])
        allpred[rows] = clustering.labels_

    return allpred

def getbasics(file_path):
    """Open a video with OpenCV, print and return its basic properties.

    Returns:
        (video, fps, size, length): the open ``cv2.VideoCapture``, the frames
        per second, the (width, height) frame size as ints, and the frame
        count as an int.

    The capture is returned without being released (the release call below is
    commented out), so releasing it is left to the caller.
    """
    video = cv2.VideoCapture(file_path)
    fps = video.get(cv2.CAP_PROP_FPS)
    length = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    print('frames per second =',fps)
    size = (int(video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    print('frames size =',size)
    # video.release()
    return video, fps, size, length

def generate_social(traceGDF, epsg, dis = 1.5):
    """
    Label every observation with its per-frame spatial (DBSCAN) cluster.

    Args:
        traceGDF: trace table with 'frame_id' and the f"x_{epsg}" / f"y_{epsg}"
            coordinate columns; it is not modified.
        epsg: suffix that selects the coordinate columns.
        dis: distance threshold passed to DBSCAN through generatecluster().

    Returns:
        (DBSocial, DBcluster)
        DBcluster: copy of ``traceGDF`` plus 'Social' (cluster label, -1 for
            noise) and 'group_id_social' ("<frame_id>_<Social>", NaN for noise);
            'frame_id' is cast to int.
        DBSocial: the rows of DBcluster that belong to a cluster, re-indexed
            from 0, with 'group_id_social' as str.
    """
    clusterlabel = generatecluster(traceGDF, dis, epsg)

    # Attach the labels by row position. Joining on the index would misalign
    # (and introduce NaN rows) whenever traceGDF does not carry a 0..n-1 index.
    DBcluster = traceGDF.copy()
    DBcluster['Social'] = np.asarray(clusterlabel)

    DBcluster['frame_id'] = DBcluster['frame_id'].astype(int)
    # Cluster labels restart in every frame, so the frame id is made part of the group id
    DBcluster['group_id_social'] = DBcluster['frame_id'].astype(str) + '_' +DBcluster['Social'].astype(str)
    # noise points (Social == -1) belong to no group
    DBcluster['group_id_social'] = np.where(DBcluster['Social']==-1, np.nan, DBcluster['group_id_social'])
    DBSocial = DBcluster[DBcluster['Social']!=-1].reset_index(drop = True)
    DBSocial['group_id_social'] = DBSocial['group_id_social'].astype(str)
    return DBSocial, DBcluster



def valid_link(DBSocial, x,y, thred = 0.5, n = 2):
    """Tell whether tracks ``x`` and ``y`` move together.

    The speed columns f'speed_x_{n}s', f'speed_y_{n}s' and f'speed_{n}s' of the
    two tracks are lined up per frame (frames with any missing value are
    dropped) and correlated. Returns True only when all three correlations
    exceed ``thred``; a NaN correlation therefore gives False.
    """
    speed_cols = [f'speed_x_{n}s', f'speed_y_{n}s', f'speed_{n}s']
    pair_rows = DBSocial[DBSocial['track_id'].isin([x,y])]
    # one row per frame, one (speed column, track) column per value
    wide = pair_rows.pivot(index = 'frame_id',
                           columns = 'track_id',
                           values = speed_cols).reset_index()
    wide = wide.dropna()

    corrs = [wide[(col, x)].corr(wide[(col, y)]) for col in speed_cols]
    return all(c > thred for c in corrs)
    
def valid_link_corr(DBSocial, x,y, thred = 0.5, n = 2):
    """Correlate the speed profiles of tracks ``x`` and ``y``.

    Same preparation as valid_link(): the columns f'speed_x_{n}s',
    f'speed_y_{n}s' and f'speed_{n}s' are pivoted to one row per frame and
    frames with missing values are dropped. ``thred`` is accepted but not used.

    Returns:
        tuple (corr of x-speed, corr of y-speed, corr of scalar speed)
    """
    speed_cols = [f'speed_x_{n}s', f'speed_y_{n}s', f'speed_{n}s']
    pair_rows = DBSocial[DBSocial['track_id'].isin([x,y])]
    wide = pair_rows.pivot(index = 'frame_id',
                           columns = 'track_id',
                           values = speed_cols).reset_index()
    wide = wide.dropna()

    return tuple(wide[(col, x)].corr(wide[(col, y)]) for col in speed_cols)
    

def getuvperframe(testdf, iditem):
    """List every unordered pair of tracks that share a group.

    Args:
        testdf: DataFrame with a 'track_id' column and the grouping column
            named by ``iditem``.
        iditem: name of the group-id column.

    Returns:
        DataFrame with one row per pair: 'u', 'v' (the two track ids, in the
        order they appear in the group) and ``iditem`` (the group id).
    """
    U = []
    V = []
    groupid = []
    # Group by the column name itself rather than a one-element list: with a
    # list, newer pandas versions hand back each key as a 1-tuple, which would
    # end up in the group-id column.
    for gid, tracks in tqdm(testdf.groupby(iditem)['track_id']):
        # all combinations without replacement inside the group
        for u, v in itertools.combinations(tracks, 2):
            U.append(u)
            V.append(v)
            groupid.append(gid)

    dfframe = pd.DataFrame({'u':U,
             'v':V,
              iditem: groupid
             })

    return dfframe


# use girvan_newman for community detection first
# df_plot = newlinks.reset_index(drop=True)

def most_central_edge(G):
    """Return the edge of ``G`` with the highest weighted edge-betweenness.

    The "weight" edge attribute is used as the weight.
    """
    scores = betweenness(G, weight="weight")
    best_edge, _ = max(scores.items(), key=lambda item: item[1])
    return best_edge


def getcommunity(df_links):
    """Split the link graph into communities with one Girvan-Newman step.

    Args:
        df_links: DataFrame with columns 'u', 'v' (node ids) and 'weight'.

    Returns:
        DataFrame with one row per community of the first Girvan-Newman
        partition: 'communityID' (0..k-1), 'nodegroup' (list of node ids) and
        'groupsize' (number of nodes).
    """
    G_plot = nx.Graph()
    # Walk the three columns directly: this does not depend on the index of
    # df_links (the index labels are not positions) and avoids one row lookup
    # per attribute.
    edges = zip(df_links['u'], df_links['v'], df_links['weight'])
    for u, v, weight in tqdm(edges, total = len(df_links)):
        G_plot.add_edge(u, v, weight = weight)

    communities = girvan_newman(G_plot, most_valuable_edge = most_central_edge)

    # only the first split produced by the generator is used
    node_groups = [list(com) for com in next(communities)]
    node_groupslen = [len(group) for group in node_groups]
    communitydf = pd.DataFrame({
    'communityID':np.arange(0, len(node_groupslen)),
    'nodegroup':node_groups,
    'groupsize':node_groupslen
    })
    return communitydf

def get_confirm_group(communitydf, DB, example_com):
    """Find the frames in which a community appears as exactly one spatial cluster.

    This function only identifies the exactly same group, disregarding groups
    that appear partially across frames. For example, tracks 1, 2 in frames 3-8
    and tracks 1, 2, 3 in frames 7-9 are different groups.

    Args:
        communitydf: community table with 'communityID' and 'nodegroup'.
        DB: observations with 'communityID', 'track_id', 'Social', 'frame_id'.
        example_com: id of the community to check.

    Returns:
        DataFrame with one row per (frame_id, Social) whose member set equals
        the community: 'frame_id', 'Social', 'track_id' (array of members),
        'track_id_str' (members joined by "_", in sorted order), 'communityID'.
    """
    tracks = communitydf[communitydf["communityID"]==example_com]["nodegroup"].values[0]
    tracks = sorted(int(x) for x in tracks)
    tracksls = "_".join([str(x) for x in tracks])

    # for each frame and social id, get a list of tracks
    sel = DB[(DB["communityID"]==example_com)&(DB["track_id"].isin(tracks))&(DB["Social"]!=-1)]
    frame_summary = sel.groupby(["frame_id", "Social"])["track_id"].unique().reset_index()
    # unique() keeps order of appearance, so the members are sorted before
    # joining; otherwise the string only matches the sorted community string
    # when the rows happen to be ordered by track id
    frame_summary["track_id_str"] = frame_summary["track_id"].apply(
        lambda ids: "_".join([str(i) for i in sorted(ids)]))
    # copy so that the new column is not written into a slice of frame_summary
    confirmed = frame_summary[frame_summary["track_id_str"]==tracksls].copy()
    confirmed["communityID"] = example_com
    return confirmed


def get_confirm_group_loose(communitydf, DB, example_com):
    """Find the frames in which at least two members of a community share a cluster.

    Args:
        communitydf: community table with 'communityID' and 'nodegroup'.
        DB: observations with 'communityID', 'track_id', 'Social', 'frame_id'.
        example_com: id of the community to check.

    Returns:
        DataFrame with one row per (frame_id, Social) holding more than one
        community member, in the row order of the per-frame summary:
        'frame_id', 'Social', 'track_id' (array of members), 'track_id_str'
        (members joined by "_"), 'len' (number of members),
        'track_ls_intersection' (members that belong to the community, joined
        by "_") and 'communityID'.
    """
    # sorted() works on a copy; sorting in place would silently reorder the
    # list stored inside communitydf
    tracks = sorted(communitydf[communitydf["communityID"]==example_com]["nodegroup"].values[0])
    # ids are compared as strings on both sides, so int ids match as well
    track_keys = {str(t) for t in tracks}

    # for each frame and social id, get a list of tracks
    sel = DB[(DB["communityID"]==example_com)&(DB["track_id"].isin(tracks))&(DB["Social"]!=-1)]
    frame_summary = sel.groupby(["frame_id", "Social"])["track_id"].unique().reset_index()
    frame_summary["track_id_str"] = frame_summary["track_id"].apply(lambda x: "_".join([str(i) for i in x]))
    frame_summary["len"] = frame_summary["track_id"].apply(lambda x: len(x))

    reconstruct = frame_summary[frame_summary["len"]>1].copy()
    # computed row by row instead of through groupby.apply, whose index layout
    # and handling of the grouping column differ between pandas versions
    reconstruct["track_ls_intersection"] = reconstruct["track_id"].apply(
        lambda ids: "_".join([str(i) for i in ids if str(i) in track_keys]))
    reconstruct["communityID"] = example_com
    return reconstruct


def get_group(communitydf, DB):
    """Run get_confirm_group() for every community id and stack the results.

    Returns one DataFrame re-indexed from 0.
    """
    community_ids = communitydf["communityID"].unique()
    pieces = [get_confirm_group(communitydf, DB, comID) for comID in tqdm(community_ids)]
    return pd.concat(pieces).reset_index(drop = True)

def get_group_loose(communitydf, DB):
    """Run get_confirm_group_loose() for every community id and stack the results.

    Returns one DataFrame re-indexed from 0.
    """
    community_ids = communitydf["communityID"].unique()
    pieces = [get_confirm_group_loose(communitydf, DB, comID) for comID in tqdm(community_ids)]
    return pd.concat(pieces).reset_index(drop = True)

def get_gender_comp(genderls):
    """Return "Mixed" when ``genderls`` has exactly two entries, else its first entry."""
    return "Mixed" if len(genderls) == 2 else genderls[0]
    
# load prediction trace
def get_pred(video_name):
    """Read the raw tracking file of one video.

    The file path is looked up in the module-level ``video_path`` table
    ('pred_path' of the row whose 'video_id' equals ``video_name``). The file
    is tab separated without a header; its six columns are named
    x1, y1, x2, y2, track_id, frame_id.

    Returns the table with three added columns: 'w' (x2 - x1), 'h' (y2 - y1)
    and 'ratio' (w / h).
    """
    is_video = video_path['video_id'] == video_name
    pred_path = video_path.loc[is_video, 'pred_path'].values[0]

    trace = pd.read_csv(pred_path, sep = '\t', header = None)
    trace.columns = [ "x1", "y1", "x2", "y2", "track_id", "frame_id"]
    # box width, height and aspect ratio
    trace['w'] = trace['x2'].sub(trace['x1'])
    trace['h'] = trace['y2'].sub(trace['y1'])
    trace['ratio'] = trace['w'].div(trace['h'])
    return trace

def compute_color_for_labels(label):
    """
    Map a label to a fixed 3-channel color: the same label always gives the
    same tuple of ints, each in the range 0..254.
    """
    palette = (2 ** 11 - 1, 2 ** 15 - 1, 2 ** 20 - 1)
    scale = label ** 2 - label + 1
    return tuple(int((base * scale) % 255) for base in palette)

# 1. Load data
The input data already contains the speed vectors.

In [None]:
import glob
# resultfolder_root = "../../_data/05_tracking_result_projected/step0_attr_prj/"
# folder that holds one "<location>_full.csv" trace file per location
staginging_folder = "../../_data/05_tracking_result_projected/step1_speed_vector/historical"
# every "*_full.csv" file currently present in that folder
finished = glob.glob(staginging_folder + "/*_full.csv")
finished

In [None]:
# Location processed by the rest of the notebook
sample_loc = "Bryant Park"
filepath = os.path.join(staginging_folder, sample_loc+ "_full.csv")
traceGDF = pd.read_csv(filepath)

# The same track_id can occur in different videos, so a track is identified by
# the (track_id, video_id) pair.
obs = traceGDF[['track_id','video_id']].drop_duplicates().shape[0]
# tracks that have at least one non-missing 0.5 s speed value
obs_wspeed = traceGDF[['track_id','video_id','speed_0.5s']].dropna(subset=['speed_0.5s']).drop_duplicates(['track_id','video_id']).shape[0]
print(f"Total number of observation in video {sample_loc}: ", obs, "with speed vector: ", obs_wspeed)

# Keep the original frame id as frame_id_original and use frame_id_new as frame_id
traceGDF.rename(columns = {'frame_id':'frame_id_original'}, inplace = True)
traceGDF.rename(columns = {'frame_id_new':'frame_id'}, inplace = True)

interval = 48 # frames per real-world second: the enlarged video has 480 frames per video second and is time-lapsed 10 times
# number of rows of each track divided by frames per second = seconds the track is visible
traceGDF['appear_sec'] = traceGDF.groupby(['track_id','video_id'])['frame_id'].transform('count')/interval


In [None]:
originsize = traceGDF.shape[0]
# Drop tracks that are visible too briefly. For Bryant Park a 3-second cut-off
# cannot be used because many people appear only once.
# NOTE(review): an earlier comment said the threshold is 0.5 second, but the value
# below is 0.2 * 48 = 9.6 frames, i.e. 0.2 second at 48 frames per second — confirm.
thred = 0.2*48
# number of distinct frames in which each (track_id, video_id) appears
traceGDF['individual_frame_total'] = traceGDF.groupby(['track_id','video_id'])['frame_id'].transform('nunique')
keepGDF = traceGDF[traceGDF['individual_frame_total']>thred].reset_index(drop = True)
keepsize = keepGDF.shape[0]
# share of rows that survive the filter
per = keepsize/originsize
print("KEEP {}% of the data".format(per*100))


In [None]:
# Optional frame sub-sampling: keep the frames whose frame_id is a multiple of `thre`.
# With thre = 1 every frame is kept, so no sub-sampling happens here.
thre = 1
fps = 479.97
# effective frames per second after sub-sampling
fps_adjust = fps/thre
# NOTE: this rebinds traceGDF to the filtered (keepGDF) rows
traceGDF = keepGDF[keepGDF['frame_id']%thre==0].reset_index(drop = True)
# traceGDF.shape

# 3. Create Social Clusters at spatial level
Process one `video_id` at a time to save computation time.

In [None]:
# output folder for the GeoJSON samples exported further down
vizdatafolder ="../../_data/05_demo/03_group_behavior_sample"

In [None]:
# distinct video ids present in the filtered trace
videols = traceGDF['video_id'].unique()

In [None]:
# suffix of the coordinate columns (x_3857 / y_3857) used for clustering
epsg = 3857
# position in videols of the video to process; `i` is reused by later cells
i = 0
tracecurrent = traceGDF[traceGDF['video_id']==videols[i]].reset_index(drop = True)
# DBSocial: clustered rows only; DBcluster: all rows with their 'Social' label
DBSocial, DBcluster =  generate_social(tracecurrent, epsg, dis = 1.9)

In [None]:
# Pick the frame with the largest number of distinct spatial groups, to visualize
# the groups on the ground and in image space.
# NOTE: the next cell repeats these statements before defining get_selfile().
num_group_frame = DBSocial.groupby(['frame_id', 'frame_id_original'])['group_id_social']\
    .nunique().reset_index().sort_values('group_id_social', ascending = False).reset_index(drop = True)
# original frame id of that frame
framsel = num_group_frame.at[0, 'frame_id_original']
print("Selected frame: ", framsel)

In [None]:
# Pick the frame with the largest number of distinct spatial groups, to visualize
# the groups on the ground and in image space.
num_group_frame = DBSocial.groupby(['frame_id', 'frame_id_original'])['group_id_social']\
    .nunique().reset_index().sort_values('group_id_social', ascending = False).reset_index(drop = True)
framsel = num_group_frame.at[0, 'frame_id_original']
print("Selected frame: ", framsel)
# check frame_id and a later frame_id
def get_selfile(framsel, thre=2):
    """Build and plot point layers for one frame and for a later frame.

    Reads the module-level ``DBcluster``. The later frame is
    ``framsel + thre*2`` in original frame ids. For each of the two frames a
    GeoDataFrame of points (lon, lat; EPSG:4326) is built and plotted:
    clustered points colored by 'Social', noise points (Social == -1) in grey.

    NOTE(review): the default ``thre`` of this function (2) differs from the
    notebook-level ``thre``; cells that recompute the later frame id from the
    notebook-level value will not get the same frame — confirm which is meant.

    Returns:
        (seldb, seldb_shift): the GeoDataFrames of the two frames.
    """

    seldb = DBcluster[DBcluster['frame_id_original']==framsel].reset_index(drop = True)
    seldb = gpd.GeoDataFrame(seldb, geometry=[Point(x,y) for x,y in zip(seldb['lon'], seldb['lat'])], crs = f"EPSG:4326")
    ax = seldb[seldb['Social']!=-1].plot(column = 'Social', legend = True, cmap = 'tab20', figsize = (8,8))
    seldb[seldb['Social']==-1].plot(color = 'grey', ax = ax)
    
    # same construction for the later frame
    seldb_shift = DBcluster[DBcluster['frame_id_original']==framsel+thre*2].reset_index(drop = True)
    seldb_shift = gpd.GeoDataFrame(seldb_shift, geometry=[Point(x,y) for x,y in zip(seldb_shift['lon'], 
                                                                                    seldb_shift['lat'])], 
                                   crs = f"EPSG:4326")
    ax = seldb_shift[seldb_shift['Social']!=-1].plot(column = 'Social', legend = True, cmap = 'tab20', figsize = (8,8))
    seldb_shift[seldb_shift['Social']==-1].plot(color = 'grey', ax = ax)
    return seldb, seldb_shift
seldb, seldb_shift = get_selfile(framsel)

In [None]:
# Export both frames as GeoJSON, naming each file after the frame its rows hold.
# get_selfile() chooses the later frame with its own `thre` argument, which need not
# equal the notebook-level `thre`; the frame number is therefore read back from the
# data instead of being recomputed here (which labelled the file with another frame).
shift_frame = seldb_shift['frame_id_original'].iloc[0] if len(seldb_shift) else framsel+thre*2
seldb.to_file(os.path.join(vizdatafolder, f"{sample_loc}_frame{framsel}_social.geojson"), driver='GeoJSON')
seldb_shift.to_file(os.path.join(vizdatafolder, f"{sample_loc}_frame{shift_frame}_social.geojson"), driver='GeoJSON')

In [None]:
# plot this frame on the image
import cv2
# position in videols of the video to draw on (same video as the clustering cell)
i = 0
video_name = videols[i]
# path of the video file, looked up in the sheet loaded at the top of the notebook
video_file = video_path[video_path['video_id']==video_name]['video_path'].values[0]
# load the original trace file (bounding boxes in image coordinates)
trace = get_pred(video_name)



In [None]:
def get_data_for_plot(trace, seldb, seldb_shift):
    """Attach cluster labels to the raw bounding boxes of two frames.

    ``trace`` rows are matched to each frame table on
    (track_id, frame_id) == (track_id_backup, frame_id_original); the frame
    table's own 'track_id' arrives as 'track_id_new'. Noise rows
    (Social == -1) are removed and the result is re-indexed from 0.

    Returns:
        (data, data_shift): the joined tables for ``seldb`` and ``seldb_shift``.
    """
    group_cols = ["track_id_backup", "frame_id_original", 'Social','track_id']

    def boxes_with_groups(frame_db):
        joined = trace.merge(frame_db[group_cols],
                             left_on = ["track_id", "frame_id"],
                             right_on = ["track_id_backup", "frame_id_original"],
                             suffixes=('', '_new'))
        return joined[joined['Social']!=-1].reset_index(drop = True)

    return boxes_with_groups(seldb), boxes_with_groups(seldb_shift)

def plot_img(video_name, framsel, data):
    """Show one video frame with the bounding boxes of the grouped tracks.

    Args:
        video_name: 'video_id' used to look up the file in the module-level
            ``video_path`` table.
        framsel: index of the frame to read from the video.
        data: boxes to draw, indexed 0..n-1, with columns 'x1', 'y1', 'x2',
            'y2' and 'Social'; boxes of the same 'Social' label share a color.

    Raises:
        RuntimeError: when the requested frame cannot be read.
    """
    video_file = video_path[video_path['video_id']==video_name]['video_path'].values[0]

    video, fps, size, length= getbasics(video_file)
    try:
        # jump to the requested frame and decode it
        video.set(cv2.CAP_PROP_POS_FRAMES, framsel)
        ok, frame = video.read()
    finally:
        # the capture is only needed for this single read
        video.release()
    if not ok or frame is None:
        raise RuntimeError(f"could not read frame {framsel} of {video_file} ({length} frames)")

    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    for j in range(data.shape[0]):
        color = compute_color_for_labels(data.at[j, 'Social'])
        # corner coordinates are passed as plain ints
        top_left = (int(data.at[j,'x1']), int(data.at[j,'y1']))
        bottom_right = (int(data.at[j,'x2']), int(data.at[j,'y2']))
        cv2.rectangle(frame, top_left, bottom_right, color, 2)

    fig, ax = plt.subplots(figsize = (10,10))
    ax.imshow(frame)
    ax.axis('off')
    
data, data_shift = get_data_for_plot(trace, seldb, seldb_shift)
plot_img(video_name, framsel, data)
# Draw the later boxes on the frame they belong to: the frame id is read from
# seldb_shift rather than hard-coded, so image and boxes always match.
shift_frame = seldb_shift['frame_id_original'].iloc[0] if len(seldb_shift) else framsel+10
plot_img(video_name, shift_frame, data_shift)

# 4. Create Graph Community
## 4.1 Count how many links between two pedestrian

In [None]:
# output folder for the per-video link tables
linkfolder = "../../_data/05_tracking_result_projected/step2_graphs"

In [None]:
# Build the pedestrian-pair links: one link per frame in which two tracks share
# a spatial cluster.
selp = 'social'
# selp = 'personal_far'
# name of the group-id column in DBSocial, here 'group_id_social'
iditem = 'group_id_{}'.format(selp)
df_links = getuvperframe(DBSocial, iditem)

# First Aggregation, disregard time continuity, only consider frequency:
# weight = number of (frame, cluster) groups the pair shares
df_links = df_links.groupby(['u', 'v']).size().reset_index().rename(columns={0: 'weight'})
# df_links['valid'] = df_links.apply(lambda x: valid_link(DBSocial, x['u'], x['v']), 
#                                    axis = 1)
# coor_ls: tuple of the three speed correlations of the pair (0.5 s speed columns)
df_links['coor_ls'] = df_links.apply(lambda x: valid_link_corr(DBSocial, x['u'], x['v'], n = 0.5),axis = 1)

In [None]:
# Frame arithmetic used for the link weights:
# 1 video second is 480 frames and covers 10 seconds in real life,
# so 1 real-life second is 48 frames.
# Unpack the correlation tuple into one column per component.
for position, column in enumerate(['coor_x', 'coor_y', 'coor_speed']):
    df_links[column] = df_links['coor_ls'].apply(lambda corr, position=position: corr[position])
# a link is valid when both directional speed correlations are positive
df_links['valid'] = (df_links['coor_x']>0)&(df_links['coor_y']>0)
df_links.groupby('valid').size()

In [None]:
# Save the link table of the current video (videols[i]); the tuple column
# 'coor_ls' is left out because its parts are already separate columns.
df_links[['u', 'v', 'weight', 'coor_x', 'coor_y', 'coor_speed',
       'valid']].to_csv(os.path.join(linkfolder, videols[i] + "_links.csv"), index = False)

In [None]:

# df_links = df_links[df_links['weight'] > timethred*fps].reset_index(drop = True)
# Minimum number of shared frames for a link to count.
# NOTE(review): the original remarks mention "0.5 second" and "2 is a good threshold",
# but the value used is 18 frames — confirm which is intended.
arbithred = 18

validcondition1 = (df_links['valid']==True)
validcondition2 = (df_links['weight'] > arbithred)
# NOTE(review): only the weight condition is applied; validcondition1 (positive
# speed correlation) is computed but not used — confirm this is deliberate.
df_links_valid = df_links[validcondition2].reset_index(drop = True)
# pair keys in both orders, so that a pair can be matched whichever way it is listed
df_links_valid['u_v'] = df_links_valid.apply(lambda x: "&&".join([str(x['u']), str(x['v'])]), axis = 1)
df_links_valid['v_u'] = df_links_valid.apply(lambda x: "&&".join([str(x['v']), str(x['u'])]), axis = 1)

# =========================================================================================
# DISREGARDING THE COMMUNITY STEP GIVEN THE DATA SPARSITY
# communitydf = getcommunity(df_links_valid)

# flatdf = communitydf.explode('nodegroup')
# flatdf.rename(columns = {'nodegroup':'track_id'}, inplace = True)

# DB = DBcluster.merge(flatdf, on = 'track_id')

In [None]:
# links kept by the weight filter (time continuity is not considered)
df_links_valid # disregarding time

## 4.2 Replot the valid groups

In [None]:
# DEBUGGING # here only considers two people in a group. In reality there are large groups
# NOTE: this cell repeats the filtering of section 4.1 with the same threshold.
arbithred = 18 # minimum number of shared frames for a link to count

validcondition1 = (df_links['valid']==True)
validcondition2 = (df_links['weight'] > arbithred)
# only the weight condition is applied; validcondition1 is not used
df_links_valid = df_links[validcondition2].reset_index(drop = True)
# pair keys in both orders
df_links_valid['u_v'] = df_links_valid.apply(lambda x: "&&".join([str(x['u']), str(x['v'])]), axis = 1)
df_links_valid['v_u'] = df_links_valid.apply(lambda x: "&&".join([str(x['v']), str(x['u'])]), axis = 1)

In [None]:
# continue verification
def get_demolink(data):
    """Keep the clusters of one frame whose members form a valid link.

    HERE THE DATA ONLY CONTAIN ONE FRAME.

    The members ('track_id_new') of each 'Social' cluster are joined with "&&"
    and the cluster is kept when that key occurs in the 'u_v' or 'v_u' column
    of the module-level ``df_links_valid``.

    Returns a table with one row per member of a kept cluster: 'Social',
    'track_id_new' and 'social_track' ("<Social>$$<track_id_new>").
    """
    valid_keys = set(df_links_valid['u_v'].unique()) | set(df_links_valid['v_u'].unique())

    members = data.groupby('Social')['track_id_new'].unique().reset_index()
    members['u_v'] = members.apply(
        lambda row: "&&".join([str(track) for track in row['track_id_new']]), axis = 1)

    kept = members[members['u_v'].isin(valid_keys)].reset_index(drop = True)
    # back to one row per track
    kept = kept[['Social', 'track_id_new']].explode('track_id_new').reset_index(drop = True)
    kept['social_track'] = kept['Social'].astype(str)+"$$"+kept['track_id_new'].astype(str)
    return kept


# valid clusters of the selected frame and of the later frame
demolinks = get_demolink(data)
demolinks_shift = get_demolink(data_shift)

In [None]:
# Same "<Social>$$<track_id>" key as produced by get_demolink(); note that this adds
# a column to seldb / seldb_shift in place.
seldb['social_track'] = seldb['Social'].astype(str)+"$$"+seldb['track_id'].astype(str)
seldb_shift['social_track'] = seldb_shift['Social'].astype(str)+"$$"+seldb_shift['track_id'].astype(str)
# keep only the points that belong to a valid cluster
updateseldb = seldb[seldb['social_track'].isin(demolinks['social_track'].unique())]
updateseldb_shift = seldb_shift[seldb_shift['social_track'].isin(demolinks_shift['social_track'].unique())]

In [None]:
data_update, data_shift_update = get_data_for_plot(trace, updateseldb, updateseldb_shift)
plot_img(video_name, framsel, data_update)
# Draw the later boxes on the frame they belong to: the frame id is read from
# seldb_shift (it is chosen inside get_selfile) instead of being recomputed from
# the notebook-level `thre`, which pointed at a different frame.
shift_frame = seldb_shift['frame_id_original'].iloc[0] if len(seldb_shift) else framsel+thre*2
plot_img(video_name, shift_frame, data_shift_update)

## 4.3 Continue find all valid groups

In [None]:
# one row per (frame, cluster) with the array of member track ids
data_link = DBSocial.groupby(['frame_id', 'frame_id_original', 'Social'])['track_id'].unique().reset_index()
# members joined with "&&"
data_link['group_member'] = data_link.apply(lambda x: "&&".join([str(i) for i in x['track_id']]), axis = 1)

#measure group length
data_link['group_len'] = data_link['track_id'].apply(lambda x: len(x))


In [None]:
# For each group_member, get a list of combination of 2 people
testgroupmember = data_link[data_link['group_len']>2].sort_values('group_len', ascending = False).head(10)['u_v'].values[0]
testgroupmember = testgroupmember.split("&&")

# make a list of all possible combination of 2 people
data_link['combination2'] = data_link['track_id'].apply(lambda x: list(combinations(x, 2)))
data_link_explode = data_link[['frame_id', 'frame_id_original', 'Social',
       'combination2']].explode('combination2').reset_index(drop = True)
data_link_explode['u_v'] = data_link_explode['combination2'].apply(lambda x: "&&".join([str(i) for i in x]))
data_link_explode['v_u'] = data_link_explode['combination2'].apply(lambda x: "&&".join([str(i) for i in x[::-1]]))


In [None]:
# check if the u_v combination exist in the valid link dataframe
demolinks = data_link_explode[(data_link_explode['u_v'].isin(df_links_valid['u_v'].unique()))|(data_link_explode['u_v'].isin(df_links_valid['v_u'].unique()))
    ].reset_index(drop = True)
demolinks.head()


In [None]:
# convert the u_v combination to group id
# noted the u_v and v_u are the same group
# allgroups = demolinks['u_v'].unique()
# #assign group id to each group
# groupdict = {}
# for i, group in enumerate(allgroups):
#     group_reverse = "&&".join(group.split("&&")[::-1])
#     if (group not in groupdict.keys())&(group_reverse not in groupdict.keys()):
#         groupdict[group] = i
#         groupdict[group_reverse] = i
# len(groupdict.keys())

# demolinks['group_id'] = demolinks['u_v'].apply(lambda x: groupdict[x])


In [None]:
# flatten the groups to each track
demolinks = demolinks[['frame_id', 
                       'frame_id_original', 
                       'Social', 'combination2']].explode('combination2')\
.reset_index(drop = True).sort_values(['frame_id', 'Social'], ascending=False).rename(columns = {'combination2':'track_id'})\
    .drop_duplicates(['frame_id', 'Social', 'track_id']).reset_index(drop = True)
demolinks['frame_social_track'] = demolinks['frame_id'].astype(str)+"$$"+\
    demolinks['Social'].astype(str)+"$$"+demolinks['track_id'].astype(str)
demolinks.shape

In [None]:
# confirm the data can be merged back to the original data
DBSocial['frame_social_track'] = DBSocial['frame_id'].astype(str)+ "$$"\
    + DBSocial['Social'].astype(str)+"$$"\
        +DBSocial['track_id'].astype(str)
DBSocial_update = DBSocial[DBSocial['frame_social_track'].isin(demolinks['frame_social_track'].unique())]\
    .reset_index(drop = True)
DBSocial_update.shape

In [None]:
# share of all observations (rows of DBcluster) that are part of a valid group
DBSocial_update.shape[0]/DBcluster.shape[0] # 26% observation ever in a group

In [None]:
# The per-frame cluster label becomes the group id: "<frame_id>_<Social>"
DBSocial_update['group_id_social'] = DBSocial_update['frame_id'].astype(str) + '_' +DBSocial_update['Social'].astype(str)
# number of distinct valid tracks in each (frame, cluster)
DBSocial_update['group_size'] = DBSocial_update.groupby(['frame_id', 'Social'])['track_id'].transform('nunique')
DBSocial_update.head()

In [None]:
# summary statistics of the 0.5 s speed for each group size
DBSocial_update.groupby('group_size')['speed_0.5s'].describe()

In [None]:
# Remove the purely spatial group id before the validated one is merged in.
# errors='ignore' and rebinding (instead of an in-place drop) keep this cell
# re-runnable: a second run no longer fails because the column is already gone.
DBcluster = DBcluster.drop(columns = 'group_id_social', errors = 'ignore')

In [None]:
# composite key "<frame_id>$$<Social>$$<track_id>" on every observation
DBcluster['frame_social_track'] = DBcluster['frame_id'].astype(str)+ "$$"\
    +DBcluster['Social'].astype(str)+"$$"\
        +DBcluster['track_id'].astype(str)
# merge the DBSocial_update back to the DBcluster; the left join keeps every
# observation, ungrouped rows get NaN in the two group columns
DBcluster_update = DBcluster.merge(DBSocial_update[['frame_social_track', 'group_id_social', 'group_size']],
                                   on = 'frame_social_track', how = 'left')
# True for observations that belong to a valid group
DBcluster_update['is_group'] = np.where(DBcluster_update['group_id_social'].isnull(), False, True)

In [None]:
# check if a group is newly formed or not
# for each track, find its first frame_id when it is in a group: the minimum is
# taken per (track, is_group) and then blanked out for the ungrouped rows
DBcluster_update['group_first_frame'] = DBcluster_update.groupby(['track_id','is_group'])['frame_id'].transform('min')
DBcluster_update['group_first_frame'] = np.where(DBcluster_update['is_group']==False, np.nan, DBcluster_update['group_first_frame'])
# first frame in which the track appears at all
DBcluster_update['track_first_frame'] = DBcluster_update.groupby(['track_id'])['frame_id'].transform('min')
# frames between the track's first appearance and its first grouped frame
DBcluster_update['group_track_delta'] = DBcluster_update['group_first_frame'] - DBcluster_update['track_first_frame']
DBcluster_update['group_track_delta'].describe()

In [None]:
# A group counts as emerging when the track was visible for more than 48 frames
# (1 second) before it first joined a group; rows with a missing delta give False.
DBcluster_update['emerging_group'] = np.where(DBcluster_update['group_track_delta']>48, True, False) # 1 second
# number of observations flagged as emerging
DBcluster_update['emerging_group'].sum()

In [None]:
# DBcluster_update.drop('is_new_group', axis =1, inplace = True)

In [None]:
# list the columns available for export
DBcluster_update.columns

In [None]:
# observations outside any valid group get group size 1
DBcluster_update['group_size'] = DBcluster_update['group_size'].fillna(1)

In [None]:
# number of distinct tracks observed at each group size
DBcluster_update.groupby('group_size')['track_id'].nunique()

In [None]:
# speed tests: mean 0.5 s speed per group size, for group sizes below 6
# NOTE(review): newer seaborn releases appear to replace `ci` / `errwidth` with
# `errorbar` / `err_kws` — confirm against the installed version.
fig, ax = plt.subplots(figsize = (5,5))
sns.pointplot(
    data = DBcluster_update[DBcluster_update['group_size']<6],
    x = "group_size",
    y = "speed_0.5s",
    ci = 98,
    errwidth = 1,
    capsize=.2,
    color = "#3bc0cf"
)
ax.set_ylabel("Moving Speed (m/s)")
ax.set_xlabel("Group Size")

sns.despine()

# keep tick marks on the bottom and left axes only
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')

In [None]:
# speed tests: mean 0.5 s speed of grouped versus ungrouped observations
# NOTE(review): newer seaborn releases appear to replace `ci` / `errwidth` with
# `errorbar` / `err_kws` — confirm against the installed version.
fig, ax = plt.subplots(figsize = (5,5))
sns.pointplot(
    data = DBcluster_update,
    x = "is_group",
    y = "speed_0.5s",
    ci = 98,
    errwidth = 1,
    capsize=.2,
    color = "#3bc0cf"
)
ax.set_ylabel("Moving Speed (m/s)")
ax.set_xlabel("In Group or Not")

sns.despine()

# keep tick marks on the bottom and left axes only
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')

In [None]:
# export the data
# NOTE(review): absolute, machine-specific path — it has to exist on the machine
# that runs the export cells below.
exportfolder = '/Users/yuan/Dropbox (MIT)/whyte_CV/_data/10_clean/03_individual/historical'
DBcluster_update.columns

In [None]:
# Data dictionary of the exported table: column name -> description.
# Its keys also select (and order) the columns written to the CSV below, so every
# key must be an existing column of DBcluster_update.
metadata = {
    'order':"video order in one location", 
     'video_location':"video location name", 
     'track_id':"reconstructed track id, unique within each video", 
       'video_id':"video id, unique within each location",
       'lat':"prejected latitude",
       'lon':"prejected longitude",
       'track_id_backup':"original track id from the tracking file", 
       'speed_0.5s':"speed in meter per second",
       'speed_x_0.5s':"speed in meter per second in x direction", 
       'speed_y_0.5s':"speed in meter per second in y direction", 
       'hex_id':"h3 level 15 index", 
       'inside':"inside the comparable area (both historical and current) or not",
        'frame_id':"reconstructed frame_id, across videos in a location, unique within one location", 
        'frame_id_original':"original frame_id from the tracking file", 
        'second_from_start':"calculated second from start based on the frame_id, 48 frames per real second",
       'appear_sec':"total second the track appeared in the video", 
       'individual_frame_total':"total number of frames the track appeared in the video", 
       'Social':"spatial cluster id, unique within each frame", 
       'frame_social_track':"frame_id + Social + track_id",
       'group_id_social':"frame_id + Social, unique within each video",
       'group_size':"number of tracks in the group",
       'is_group':"whether the track is in a group or not",
       'group_first_frame':"first frame_id when the track is in a group",
       'track_first_frame':"first frame_id when the track appear in this video", 
       'group_track_delta':"difference between group_first_frame and track_first_frame", 
       'emerging_group':"whether the group is newly formed or not"
}
# one CSV per location and video: "<sample_loc>_<video_name>_full.csv"
DBcluster_update[metadata.keys()].to_csv(os.path.join(exportfolder, sample_loc+"_"+ video_name + "_full.csv"), index = False)

In [None]:
# Write the data dictionary next to the exported tables. The context manager
# closes the file, so the JSON is fully flushed to disk when the cell finishes.
with open(os.path.join(exportfolder, "metadata.json"), 'w') as metadata_file:
    json.dump(metadata, metadata_file)