In [14]:
import os
import pandas
import collections
import re

########################
## Floor Finding Part ##
########################


def specificFloorFinder(dev_data, num_of_s_floor):
    """Collect the floor-specific entries of one device's trajectory.

    An entry of ``dev_data['traj']`` is treated as floor-specific when the
    pattern ``^.*-`` matches it, i.e. the label contains a hyphen
    (for example ``'2f-left'`` or ``'1f-right'``).

    Args:
        dev_data: Mapping-like object whose ``'traj'`` item is an iterable
            of trajectory labels (strings).
        num_of_s_floor: Minimum number of floor-specific entries required.

    Returns:
        ``[s_floors, indexer]`` -- the matching labels and their positions
        inside the trajectory, in trajectory order -- when at least
        ``num_of_s_floor`` entries match; otherwise ``False``.
    """
    hyphenated = re.compile("^.*-")

    # Keep (position, label) pairs together so the two result lists stay aligned.
    matched = [
        (position, label)
        for position, label in enumerate(dev_data['traj'])
        if hyphenated.findall(label)
    ]

    # Too few floor-specific entries: signal "skip this device" with False.
    if len(matched) < num_of_s_floor:
        return False

    indexer = [position for position, _ in matched]
    s_floors = [label for _, label in matched]
    return [s_floors, indexer]

############################
## Device Extracting Part ##
############################

def deviceExtractor(df_sorted_by_time, num_of_s_floor):
    """Build the ordered device table used for companion checking.

    Rows are visited in the frame's existing index order. A row is kept only
    when ``specificFloorFinder`` finds at least ``num_of_s_floor``
    floor-specific entries in its ``'traj'`` cell.

    Args:
        df_sorted_by_time: DataFrame with ``'traj'`` and ``'date_device_id'``
            columns.
        num_of_s_floor: Minimum number of floor-specific entries a row needs.

    Returns:
        ``collections.OrderedDict`` mapping each kept row's
        ``date_device_id`` to ``[s_floors, indexer, row_label]``, where
        ``row_label`` is the row's label in ``df_sorted_by_time.index``.
    """
    devices = collections.OrderedDict()

    for row_label in df_sorted_by_time.index:
        # One-element Series holding just this row's 'traj' cell.
        traj_cell = df_sorted_by_time.loc[row_label, ['traj']]

        found = specificFloorFinder(traj_cell, num_of_s_floor)
        if not found:
            # Not enough floor-specific entries: the row is left out entirely.
            continue

        device_key = df_sorted_by_time['date_device_id'][row_label]
        devices[device_key] = found
        # Third slot remembers which frame row this entry came from.
        found.append(row_label)

    return devices


############################
## Device Comparing Part ###
############################

def deviceComparator(df_sorted_by_time, devices, num_of_s_floor):
    """Record mutual companions for every same-day pair of devices.

    Each device is compared with the devices that follow it in ``devices``
    until one with a different day prefix is met. When ``companionChecker``
    accepts a pair, each device's key is appended to the other row's
    ``'companion'`` list (the lists are mutated in place).

    Args:
        df_sorted_by_time: DataFrame with a ``'companion'`` column of lists
            plus the columns ``companionChecker`` reads.
        devices: Ordered mapping ``date_device_id -> [s_floors, indexer,
            row_label]``; ``row_label`` must be a label of
            ``df_sorted_by_time.index``.
        num_of_s_floor: Threshold forwarded to ``companionChecker``.

    Returns:
        The same ``df_sorted_by_time`` object, after the in-place updates.
    """
    entries = list(devices.items())

    # Day prefix = everything before the LAST underscore of the key
    # (same text the former "^.*_" regex extracted). Computed once per device
    # instead of once per pair; a key without an underscore yields ''.
    day_of = [device_key.rpartition('_')[0] for device_key, _ in entries]

    for present_pos, (present_key, present_info) in enumerate(entries):
        present_label = present_info[2]
        # Label-based lookup: DataFrame.ix no longer exists in current pandas.
        present_row = df_sorted_by_time.loc[present_label]

        for next_pos in range(present_pos + 1, len(entries)):
            # Stop at the first device from another day. This relies on
            # same-day devices being contiguous in `devices`.
            if day_of[present_pos] != day_of[next_pos]:
                break

            next_key, next_info = entries[next_pos]
            next_label = next_info[2]
            next_row = df_sorted_by_time.loc[next_label]

            if companionChecker(present_row, next_row,
                                present_info, next_info,
                                num_of_s_floor):
                # .at returns the list object stored in the cell, so append
                # updates the frame without any re-assignment.
                df_sorted_by_time.at[present_label, 'companion'].append(next_key)
                df_sorted_by_time.at[next_label, 'companion'].append(present_key)

    return df_sorted_by_time
            
#############################
## Companion Checking Part ##
#############################
    
def companionChecker(pt, nt, pt_s, nt_s, num_of_s_floor):
    """Decide whether two devices shared enough floor-specific stays.

    Args:
        pt: Present device's row; ``pt['ts']``, ``pt['ts_end']`` and
            ``pt['traj']`` are indexed by trajectory position.
        nt: Next device's row, same layout as ``pt``.
        pt_s: Present device's entry from ``devices``; ``pt_s[1]`` lists the
            positions of its floor-specific stays.
        nt_s: Next device's entry from ``devices``; ``nt_s[1]`` lists the
            positions of its floor-specific stays.
        num_of_s_floor: Number of shared stays required.

    Returns:
        ``True`` when at least ``num_of_s_floor`` stays of the next device
        overlap in time with an equally named stay of the present device,
        ``False`` otherwise.
    """
    # First slot of pt_s[1] still worth scanning; earlier ones already ended.
    # NOTE(review): skipping them assumes both position lists are in
    # chronological order -- confirm against the data.
    resume_at = 0
    shared_stays = 0

    for next_pos in nt_s[1]:
        cursor = resume_at
        while cursor < len(pt_s[1]):
            present_pos = pt_s[1][cursor]
            cursor += 1

            # Present stay ended before the next device's stay began:
            # remember to skip it from now on and look at the following one.
            if pt['ts_end'][present_pos] < nt['ts'][next_pos]:
                resume_at = cursor
                continue

            # Present stay begins after the next device's stay ended:
            # nothing further can overlap this stay.
            if pt['ts'][present_pos] > nt['ts_end'][next_pos]:
                break

            # The two stays overlap in time; count them if the names agree.
            if pt['traj'][present_pos] == nt['traj'][next_pos]:
                shared_stays += 1
                break

        # Enough evidence collected, no need to scan the remaining stays.
        if shared_stays >= num_of_s_floor:
            break

    return bool(shared_stays >= num_of_s_floor)

def companionFinder(df, num_of_s_floor):
    """Run the whole companion pipeline on ``df``.

    Steps: sort a copy of ``df`` by its ``'ts'`` column, give every row an
    empty ``'companion'`` list, extract the qualifying devices, compare them
    pairwise, then add a ``'companion_count'`` column. Progress messages are
    printed after each stage.

    Args:
        df: Input DataFrame; it is not modified (sorting yields a new frame).
        num_of_s_floor: Threshold passed to ``deviceExtractor`` and
            ``deviceComparator``.

    Returns:
        Tuple ``(frame, device_count)``: the frame returned by
        ``deviceComparator`` with ``'companion'`` and ``'companion_count'``
        columns, and the number of devices that ``deviceExtractor`` kept.
    """
    print("start")

    # Ascending order on 'ts'.
    by_time = df.sort_values(['ts'], ascending=True)
    # The lambda runs once per row, so every row gets its own list object.
    by_time['companion'] = by_time.apply(lambda x: [], axis=1)

    device_table = deviceExtractor(by_time, num_of_s_floor)
    print("deviceExtractor finished")

    compared = deviceComparator(by_time, device_table, num_of_s_floor)
    print("deviceComparator finished")

    compared['companion_count'] = compared['companion'].apply(
        lambda companions: len(companions)
    )
    return compared, len(device_table)
    


start
deviceExtractor finished
deviceComparator finished


In [41]:
# Load the pickled frame that the companionFinder runs in this notebook use.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
# NOTE(review): this cell's execution count (41) is higher than the cells that
# consume `df`, so the notebook was run out of order -- re-check with
# Restart & Run All.
df = pandas.read_pickle("../code/data/786/786_mpframe_trajprocessed.p")
# Earlier trial on the first 1000 rows with threshold 3, kept for reference:
# df_companion, lenn = companionFinder(df.head(1000), 3)
# df_companion.to_pickle("../code/data/786/companion_result.p")

In [16]:
# Run the pipeline on the first 10000 rows once per threshold 1..6 and bind
# each (frame, device count) pair to its numbered name.
(
    (df_companion1, lenn1),
    (df_companion2, lenn2),
    (df_companion3, lenn3),
    (df_companion4, lenn4),
    (df_companion5, lenn5),
    (df_companion6, lenn6),
) = [companionFinder(df.head(10000), threshold) for threshold in range(1, 7)]

start
deviceExtractor finished
deviceComparator finished
start
deviceExtractor finished
deviceComparator finished
start
deviceExtractor finished
deviceComparator finished
start
deviceExtractor finished
deviceComparator finished
start
deviceExtractor finished
deviceComparator finished
start
deviceExtractor finished
deviceComparator finished


In [17]:
# Device counts (second value returned by companionFinder) for thresholds 1..6.
print(lenn1, lenn2, lenn3, lenn4, lenn5, lenn6)

3421 1689 1192 801 643 521


In [38]:
# Threshold 1: device count, then the number of rows per companion_count value.
print(lenn1)
df_companion1.companion_count.value_counts().sort_index()

3421


0      7243
1       589
2       518
3       471
4       318
5       187
6       147
7        78
8        53
9        49
10       41
11       29
12       29
13        6
14       19
15       17
16       19
17       10
18        8
19       14
20       10
21        8
22        5
23        5
24        7
25        8
26        2
27        7
28       10
29        3
       ... 
40        1
41        1
42        3
43        3
44        3
45        5
46        1
47        4
48        2
49        2
50        1
54        1
55        2
56        2
57        3
58        1
61        1
62        1
63        3
64        2
65        3
66        1
68        1
70        1
71        1
73        1
80        1
82        1
108       1
119       1
Name: companion_count, dtype: int64

In [33]:
# Threshold 2: device count, then rows per companion_count (first 10 entries).
print(lenn2)
df_companion2.companion_count.value_counts().sort_index()[:10]

1689


0    8831
1     468
2     212
3     104
4      78
5      60
6      58
7      44
8      34
9      26
Name: companion_count, dtype: int64

In [34]:
# Threshold 3: device count, then rows per companion_count (first 10 entries).
print(lenn3)
df_companion3.companion_count.value_counts().sort_index()[:10]

1192


0    9332
1     272
2     112
3      66
4      59
5      52
6      30
7      27
8      12
9       9
Name: companion_count, dtype: int64

In [45]:
# Threshold 4: device count, result frame shape, then rows per companion_count
# (first 10 entries).
print(lenn4)
print(df_companion4.shape)
df_companion4.companion_count.value_counts().sort_index()[:10]

801
(10000, 12)


0    9604
1     141
2      65
3      51
4      51
5      34
6      26
7      14
8       8
9       2
Name: companion_count, dtype: int64

In [44]:
# Threshold 5: device count, result frame shape, then rows per companion_count
# (first 10 entries).
print(lenn5)
print(df_companion5.shape)
df_companion5.companion_count.value_counts().sort_index()[:10]

643
(10000, 12)


0    9677
1     120
2      64
3      46
4      36
5      29
6      16
7       7
8       3
9       1
Name: companion_count, dtype: int64

In [37]:
# Threshold 6: device count, then rows per companion_count (first 10 entries).
print(lenn6)
df_companion6.companion_count.value_counts().sort_index()[:10]

521


0    9729
1     106
2      67
3      36
4      31
5      19
6       6
7       3
8       3
Name: companion_count, dtype: int64

In [40]:
# Display the frame currently bound to `df`.
# NOTE(review): a bare `df` renders the whole frame; `df.head()` keeps the
# saved output small.
df

Unnamed: 0,date_device_id,logs,traj,ts,dwell_time,hour_start,time_start,ts_end,hour_end,time_end
0,16675_017699a4395352e941f6ed271f5fd1cd,"[4547961, 4546624, 4546613, 4546612]","[out, out, in, 1f]","[1440741504, 1440747786, 1440747820, 1440747820]","[0, 34, 0, 0]","[14, 16, 16, 16]","[14:58:24, 16:43:06, 16:43:40, 16:43:40]","[1440741504, 1440747820, 1440747820, 1440747820]","[14, 16, 16, 16]","[14:58:24, 16:43:40, 16:43:40, 16:43:40]"
1,16675_02614c7588f7f8eaa0d3b9047ac08410,"[4545933, 4545741, 4545737, 4545720, 4545716, ...","[out, in, 1f, 1f-right, 2f, 2f-left, 2f-right,...","[1440750511, 1440751250, 1440751260, 144075126...","[857, 118, 42, 108, 27, 20, 8, 0, 0]","[17, 17, 17, 17, 17, 17, 17, 17, 17]","[17:28:31, 17:40:50, 17:41:00, 17:41:00, 17:41...","[1440751368, 1440751368, 1440751302, 144075136...","[17, 17, 17, 17, 17, 17, 17, 17, 17]","[17:42:48, 17:42:48, 17:41:42, 17:42:48, 17:42..."
2,16675_028a1f4dbca00ed06814fdda60f1b599,"[4551009, 4550832, 4550777, 4550731, 4550674, ...","[out, in, 2f, 2f-right, 2f-inner, 2f-left, 1f,...","[1440725175, 1440726721, 1440727028, 144072702...","[3548, 1293, 205, 418, 18, 6, 286, 266, 230, 14]","[10, 10, 10, 10, 11, 11, 11, 11, 11, 11]","[10:26:15, 10:52:01, 10:57:08, 10:57:08, 11:00...","[1440728723, 1440728014, 1440727233, 144072744...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11]","[11:25:23, 11:13:34, 11:00:33, 11:04:06, 11:00..."
3,16675_02d65bf10b0914eaa0c0ee68bf3531c6,"[4546419, 4546413, 4546410]","[out, in, 2f, 2f-right]","[1440748578, 1440748597, 1440748598, 1440748598]","[76, 0, 0, 0]","[16, 16, 16, 16]","[16:56:18, 16:56:37, 16:56:38, 16:56:38]","[1440748654, 1440748597, 1440748598, 1440748598]","[16, 16, 16, 16]","[16:57:34, 16:56:37, 16:56:38, 16:56:38]"
4,16675_04e2238789b0de61744fc461d3914a18,"[4549570, 4543625, 4543624, 4543623]","[out, out, in, 1f]","[1440733372, 1440759585, 1440759586, 1440759586]","[0, 0, 0, 0]","[12, 19, 19, 19]","[12:42:52, 19:59:45, 19:59:46, 19:59:46]","[1440733372, 1440759585, 1440759586, 1440759586]","[12, 19, 19, 19]","[12:42:52, 19:59:45, 19:59:46, 19:59:46]"
5,16675_062c73a8b307fd05f6af2472a35671ef,"[4545181, 4545105, 4545104, 4544999]","[out, in, 1f, 1f-right, 1f-inner]","[1440753481, 1440753683, 1440753683, 144075368...","[986, 565, 455, 0, 73]","[18, 18, 18, 18, 18]","[18:18:01, 18:21:23, 18:21:23, 18:21:23, 18:27...","[1440754467, 1440754248, 1440754138, 144075368...","[18, 18, 18, 18, 18]","[18:34:27, 18:30:48, 18:28:58, 18:21:23, 18:28..."
6,16675_062cd5dfea6382b50c7f761c924a607e,"[4548903, 4548902, 4548294]","[out, in, out]","[1440736886, 1440736887, 1440739980]","[196, 0, 0]","[13, 13, 14]","[13:41:26, 13:41:27, 14:33:00]","[1440737082, 1440736887, 1440739980]","[13, 13, 14]","[13:44:42, 13:41:27, 14:33:00]"
7,16675_06c041f68ec481883941d47d99d6903f,"[4550980, 4550828, 4550826, 4550622, 4550540, ...","[out, in, 3f, in, 1f, 1f-right, out]","[1440725586, 1440726737, 1440726743, 144072778...","[2957, 168, 41, 459, 0, 0, 16]","[10, 10, 10, 11, 11, 11, 12]","[10:33:06, 10:52:17, 10:52:23, 11:09:40, 11:17...","[1440728543, 1440726905, 1440726784, 144072823...","[11, 10, 10, 11, 11, 11, 12]","[11:22:23, 10:55:05, 10:53:04, 11:17:19, 11:17..."
8,16675_0a47ed78e921824cf54dddcf49ae2911,"[4545845, 4545470]","[out, in]","[1440750876, 1440752403]","[2179, 0]","[17, 18]","[17:34:36, 18:00:03]","[1440753055, 1440752403]","[18, 18]","[18:10:55, 18:00:03]"
9,16675_0b4ee3e2291a86b7360a60f47bac6736,"[4548960, 4548959, 4548958]","[out, in, 2f]","[1440736661, 1440736662, 1440736662]","[0, 0, 0]","[13, 13, 13]","[13:37:41, 13:37:42, 13:37:42]","[1440736661, 1440736662, 1440736662]","[13, 13, 13]","[13:37:41, 13:37:42, 13:37:42]"


In [51]:
# Load the second pickled frame and drop every device whose highest
# new_visit_count exceeds 10.
# NOTE(review): this rebinds `df`, which an earlier cell bound to a different
# pickle; later cells see whichever load ran last.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
df = pandas.read_pickle("../code/data/786/786_mpframe2.p")

# Highest new_visit_count seen for each device_id.
visitcounts = df.groupby(['device_id'])['new_visit_count'].max()

# device_ids whose maximum is above the cut-off of 10.
freqvisitors = visitcounts.loc[visitcounts > 10].index

# `~` is the documented way to negate a boolean mask (unary `-` is not).
trajs_freqremoved = df.loc[~df.device_id.isin(freqvisitors)]

In [54]:
# (rows, columns) of whichever frame `df` currently refers to; the name is
# bound by two different load cells in this notebook.
df.shape

(98993, 14)

In [48]:
# Scratch data: a ten-element list used by the min(len(aa), 30) check in the next cell.
aa = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [49]:
# Scratch check: cap a length at 30 (the list is shorter, so its length wins).
min(len(aa), 30)

10