In [1]:
%matplotlib inline

import os, sys, time
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

In [2]:
def print_progress(cnt, total, length=80):
    """Display a single-line text progress bar on stdout.

    Parameters
    ----------
    cnt : int
        Number of items processed so far; must satisfy ``0 < cnt <= total``.
    total : int
        Total number of items; must be positive.
    length : int, optional
        Width of the bar in characters (default 80).

    Raises
    ------
    AssertionError
        If ``cnt`` or ``total`` is not positive, or if ``cnt > total``.
    """
    assert(cnt > 0 and total > 0 and cnt <= total)
    ratio = cnt / total
    n = int(length * ratio)
    # '\r' moves back to the start of the line so successive calls redraw in place;
    # '%-*s' left-justifies the dashes and pads the bar to `length` characters.
    sys.stdout.write('\r[%-*s] %d%%' % (length, '-'*n, int(ratio*100)))
    sys.stdout.flush()

In [3]:
# Folder that holds every file read or written by this notebook.
data_dir = '../data'

# Files read below: the full POI list and the raw photo records.
fpoi = os.path.join(data_dir, 'poi-Melb-all.csv')
fphoto = os.path.join(data_dir, 'Melb_photos_bigbox.csv')

# Visit / POI / trajectory file names for the first data layout.
fvisits = os.path.join(data_dir, 'userVisits-Melb-1.csv')
fpoi_new = os.path.join(data_dir, 'poi-Melb-1.csv')
ftraj_all = os.path.join(data_dir, 'traj-all-Melb-1.csv')
ftraj_noshort = os.path.join(data_dir, 'traj-noshort-Melb.csv')
ftraj_nofew = os.path.join(data_dir, 'traj-nofew-Melb.csv')

# Semicolon-separated exports: user visits and the POI cost/profit table.
fvisits2 = os.path.join(data_dir, 'userVisits-Melb-allPOI.csv')
fpoi2 = os.path.join(data_dir, 'costProfCat-MelbPOI-all.csv')

<a id='sec1'></a>

In [4]:
# 1. Load POI Data
# Read the full POI list from `fpoi` and preview its first rows.
poi_df = pd.read_csv(fpoi)
poi_df.head()

Unnamed: 0,poiID,poiName,poiTheme,poiLat,poiLon,poiURL
0,0,Arts Precinct,City precincts,-37.82167,144.96778,https://en.wikipedia.org/wiki/Melbourne_Arts_P...
1,1,Docklands,City precincts,-37.817,144.946,"https://en.wikipedia.org/wiki/Docklands,_Victoria"
2,2,Government Precinct,City precincts,-37.8119,144.973,"https://en.wikipedia.org/wiki/Spring_Street,_M..."
3,3,Little Italy,City precincts,-37.79972,144.96694,"https://en.wikipedia.org/wiki/Little_Italy,_Me..."
4,4,RMIT City,City precincts,-37.80778,144.96333,https://en.wikipedia.org/wiki/City_campus_of_t...


In [5]:
# The URL and name columns are not needed later; keep ID, theme and coordinates.
poi_df = poi_df.drop(['poiURL', 'poiName'], axis=1)

In [6]:
# Index POIs by their ID so rows can be looked up with poi_df.loc[poiID].
poi_df = poi_df.set_index('poiID')

In [7]:
# Number of POIs, one per row of poi_df.
print('#POIs:', len(poi_df))

#POIs: 88


In [8]:
# Number of distinct POI themes.
poi_df['poiTheme'].unique().size

9

<a id='sec2'></a>

In [9]:
# 2. Load Photo Data
# Read the raw photo records from `fphoto`.
# skipinitialspace=True ignores blanks that follow a delimiter;
# parse_dates=[2] parses the third column (Timestamp) as datetimes.
photo_df = pd.read_csv(fphoto, skipinitialspace=True, parse_dates=[2])
photo_df.head()

Unnamed: 0,Photo_ID,User_ID,Timestamp,Longitude,Latitude,Accuracy,URL,Marker(photo=0 video=1)
0,5703013770,25287507@N02,2011-05-09 19:19:58,144.604775,-37.878579,16,http://www.flickr.com/photos/25287507@N02/5703...,0
1,5653121597,59335517@N02,2011-04-10 13:27:37,145.033779,-37.82231,16,http://www.flickr.com/photos/59335517@N02/5653...,0
2,5522325184,26303188@N00,2011-03-13 20:44:24,144.981122,-37.824344,14,http://www.flickr.com/photos/26303188@N00/5522...,0
3,7978703060,82732068@N02,2012-07-14 12:29:43,145.947854,-38.479344,15,http://www.flickr.com/photos/82732068@N02/7978...,0
4,174030514,19677632@N00,2004-08-01 19:28:40,145.533485,-37.949003,12,http://www.flickr.com/photos/19677632@N00/1740...,0


In [10]:
# Remove photos with low location accuracy (accuracy < 16): only rows whose
# accuracy equals 16, the highest level present in this data set, are kept.
print(photo_df['Accuracy'].unique())
# .copy() makes the filtered frame independent of the unfiltered one, so the column
# drops and assignments in later cells do not raise SettingWithCopyWarning.
photo_df = photo_df[photo_df['Accuracy'] == 16].copy()
print(photo_df['Accuracy'].unique())

[16 14 15 12 11 13  8 10  9  3  5  7  6  4  1  2]
[16]


In [11]:
# Discard the columns that are not used further on.
photo_df = photo_df.drop(['Accuracy', 'URL', 'Marker(photo=0 video=1)'], axis=1)

In [13]:
# Convert datetime to unix epoch (whole seconds), stored in a new `dateTaken` column.
# NOTE(review): the parsed timestamps appear to carry no timezone, so the epoch value
# depends on how Timestamp.timestamp() treats naive values in the installed pandas - confirm.
photo_df['dateTaken'] = photo_df['Timestamp'].apply(lambda x: x.timestamp())
photo_df.drop('Timestamp', axis=1, inplace=True)
# `np.int` was only an alias of the builtin `int`; it was deprecated in NumPy 1.20 and
# removed in 1.24, so the builtin is used directly.
photo_df['dateTaken'] = photo_df['dateTaken'].astype(int)

# Use the column names that the rest of the notebook relies on.
photo_df.rename(columns={'Photo_ID':'photoID', 'User_ID':'userID', 'Longitude':'photoLon', 'Latitude':'photoLat'}, \
                inplace=True)
photo_df.head()

Unnamed: 0,photoID,userID,photoLon,photoLat,dateTaken
0,5703013770,25287507@N02,144.604775,-37.878579,1304932798
1,5653121597,59335517@N02,145.033779,-37.82231,1302406057
5,9588963220,67774014@N00,144.96506,-37.815725,1377408461
9,6191232325,63488421@N08,144.666981,-37.922733,1316741616
10,6644759687,10559879@N00,144.961177,-37.812759,1325813367


In [14]:
# (rows, columns) of the photo table after the accuracy filter and column clean-up.
photo_df.shape

(94142, 5)

In [15]:
# Distinct photo IDs and distinct user IDs in the filtered photo table.
print('#Photos:', len(photo_df['photoID'].unique()))
print('#Users:', len(photo_df['userID'].unique()))

#Photos: 94142
#Users: 1659


In [16]:
# Index photos by photoID, then add the two columns filled in by later steps.
photo_df = photo_df.set_index('photoID')
photo_df['poiID'] = -1   # -1: not mapped to any POI yet
photo_df['trajID'] = -1  # -1: not assigned to any trajectory yet
photo_df.head()

Unnamed: 0_level_0,userID,photoLon,photoLat,dateTaken,poiID,trajID
photoID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5703013770,25287507@N02,144.604775,-37.878579,1304932798,-1,-1
5653121597,59335517@N02,145.033779,-37.82231,1302406057,-1,-1
9588963220,67774014@N00,144.96506,-37.815725,1377408461,-1,-1
6191232325,63488421@N08,144.666981,-37.922733,1316741616,-1,-1
6644759687,10559879@N00,144.961177,-37.812759,1325813367,-1,-1


<a id='sec3'></a>

In [17]:
# 3. Map Photos to POIs & Build Trajectories

def calc_dist_vec(longitudes1, latitudes1, longitudes2, latitudes2):
    """Great-circle (haversine) distance in kilometres between places on earth.

    All four arguments are given in degrees. Each may be a scalar or an
    array-like; they are combined element-wise by NumPy, so the result has
    the broadcast shape of the inputs.

    Parameters
    ----------
    longitudes1, latitudes1 : coordinates of the first place(s), in degrees.
    longitudes2, latitudes2 : coordinates of the second place(s), in degrees.

    Returns
    -------
    Distance(s) in km, computed with the mean earth radius 6371.0088 km.
    """
    # convert degrees to radians
    lng1 = np.radians(longitudes1)
    lat1 = np.radians(latitudes1)
    lng2 = np.radians(longitudes2)
    lat2 = np.radians(latitudes2)
    radius = 6371.0088 # mean earth radius (km)

    dlng = np.fabs(lng1 - lng2)
    dlat = np.fabs(lat1 - lat2)
    hav = (np.sin(0.5*dlat))**2 + np.cos(lat1) * np.cos(lat2) * (np.sin(0.5*dlng))**2
    # Floating-point rounding can push the haversine term marginally above 1 for
    # (near-)antipodal points, which would make arcsin return NaN; cap it at 1.
    hav = np.minimum(hav, 1.0)
    dist = 2 * radius * np.arcsin(np.sqrt(hav))
    return dist

In [18]:
# Sanity check: the distance from POI 0 to itself should be 0.
calc_dist_vec(poi_df.loc[0, 'poiLon'], poi_df.loc[0, 'poiLat'], poi_df.loc[0, 'poiLon'], poi_df.loc[0, 'poiLat'])

0.0

In [19]:
# Speed threshold in km per second, i.e. 150 km/h divided by 3600 s per hour.
SUPER_FAST = 150 / 3600

In [20]:
# One boolean flag per photo (indexed like photo_df); True marks a photo to be removed.
# The builtin `bool` replaces `np.bool`, an alias that was removed in NumPy 1.24.
filter_tags = pd.Series(data=np.zeros(photo_df.shape[0], dtype=bool), index=photo_df.index)

In [21]:
# For every user, order the photos by time and flag a photo when both the hop
# into it and the hop out of it are "super fast" (simultaneous photos, or an
# implied speed above SUPER_FAST km/s). Flags are written to filter_tags.
cnt = 0
total = photo_df['userID'].unique().shape[0]
for user in sorted(photo_df['userID'].unique().tolist()):
    udf = photo_df[photo_df['userID'] == user].copy()
    udf.sort_values(by='dateTaken', ascending=True, inplace=True)
    # distance (km) between each pair of consecutive photos
    udists = calc_dist_vec(udf['photoLon'][:-1].values, udf['photoLat'][:-1].values, \
                           udf['photoLon'][1: ].values, udf['photoLat'][1: ].values)
    assert(udists.shape[0] == udf.shape[0]-1)
    # elapsed seconds between consecutive photos (non-negative after sorting)
    dtimes = np.diff(udf['dateTaken'].values)
    assert((dtimes >= 0).all())
    # `bool` replaces the removed `np.bool` alias
    superfast = np.zeros(udf.shape[0]-1, dtype=bool)
    moving = dtimes > 0
    superfast[~moving] = True  # zero elapsed time always counts as super fast
    # divide only where time elapsed, which avoids a division by zero
    superfast[moving] = (udists[moving] / dtimes[moving]) > SUPER_FAST
    # jx0-->SUPER_FAST-->jx-->SUPER_FAST-->jx1: remove photo jx
    suspicious = superfast[:-1] & superfast[1:]
    if suspicious.any():
        filter_tags.loc[udf.index[1:-1][suspicious]] = True
    cnt += 1; print_progress(cnt, total)

[--------------------------------------------------------------------------------] 100%

In [22]:
# Remove every photo flagged in filter_tags with a single drop call; dropping
# rows one at a time would copy the frame once per removed photo.
photo_df.drop(filter_tags.index[filter_tags.values], axis=0, inplace=True)

In [23]:
# (rows, columns) of the photo table after the flagged photos were dropped.
photo_df.shape

(92758, 6)

In [24]:
# Distance (km, from calc_dist_vec) between every pair of POIs; rows and columns
# are both labelled by poiID.
# The builtin `float` replaces `np.float`, an alias that was removed in NumPy 1.24.
poi_distmat = pd.DataFrame(data=np.zeros((poi_df.shape[0], poi_df.shape[0]), dtype=float), \
                           index=poi_df.index, columns=poi_df.index)

# Fill one row at a time: distance from POI `ix` to all POIs.
for ix in poi_df.index:
    poi_distmat.loc[ix] = calc_dist_vec(poi_df.loc[ix, 'poiLon'], poi_df.loc[ix, 'poiLat'], \
                                        poi_df['poiLon'], poi_df['poiLat'])

In [26]:

# Photo-to-POI distance table: one row per photo, one column per POI.
# The builtin `float` replaces `np.float`, an alias that was removed in NumPy 1.24.
photo_poi_distmat = pd.DataFrame(data=np.zeros((photo_df.shape[0], poi_df.shape[0]), dtype=float), \
                                 index=photo_df.index, columns=poi_df.index)

In [27]:
# Fill the photo-to-POI distances one POI column at a time: each call computes the
# distance from all photos to one POI, so the loop runs once per POI instead of
# once per photo. Rows of photo_poi_distmat follow the order of photo_df.index.
photo_lons = photo_df['photoLon'].values
photo_lats = photo_df['photoLat'].values
for k, poi in enumerate(poi_df.index):
    photo_poi_distmat.loc[:, poi] = calc_dist_vec(photo_lons, photo_lats, \
                                                  poi_df.loc[poi, 'poiLon'], poi_df.loc[poi, 'poiLat'])
    print_progress(k+1, poi_df.shape[0])

[--------------------------------------------------------------------------------] 100%

In [28]:
# Largest photo-to-POI distance at which a photo is still mapped to that POI.
DIST_MAX = 0.2  # 0.2km

In [29]:
# Largest pause, in seconds, allowed between consecutive photos of one trajectory.
TIME_GAP = 8 * 3600  # 8 hours

In [30]:
# Distinct user IDs in ascending order.
users = photo_df['userID'].unique().tolist()
users.sort()

<a id='sec3.1'></a>

In [31]:
# Map Photos to POIs: Greedy Approach
# Work on a copy so that photo_df itself is left unchanged by the mapping.
traj_greedy = photo_df.copy()

In [32]:
# Map each photo to its nearest POI, provided that POI lies within DIST_MAX km;
# photos farther than that from every POI keep poiID == -1.
# idxmin(axis=1) returns the *column label* of the row minimum, and the columns of
# photo_poi_distmat are the POI IDs, so the label is stored directly. Using it as
# a position into poi_df.index is only right while POI IDs are 0..n-1 in order.
nearest_poi = photo_poi_distmat.idxmin(axis=1)
nearest_dist = photo_poi_distmat.min(axis=1)
within_reach = nearest_dist <= DIST_MAX
# Boolean mask and assigned values are both aligned on photoID.
traj_greedy.loc[within_reach, 'poiID'] = nearest_poi[within_reach]

[--------------------------------------------------------------------------------] 100%

In [33]:
# Keep only photos that were mapped to a POI. .copy() makes the result an
# independent frame, because trajID values are assigned into it afterwards.
traj_greedy = traj_greedy[traj_greedy['poiID'] != -1].copy()

In [34]:
# Number trajectories consecutively: within one user's time-ordered photos, a new
# trajectory starts whenever two consecutive photos are more than TIME_GAP seconds
# apart, and every user starts a fresh trajectory.
tid = 0
cnt = 0
for cnt, user in enumerate(users, start=1):
    user_photos = traj_greedy[traj_greedy['userID'] == user].sort_values(by='dateTaken', ascending=True)
    if user_photos.shape[0] > 0:
        taken = user_photos['dateTaken']
        traj_greedy.loc[user_photos.index[0], 'trajID'] = tid
        for prev_ix, cur_ix in zip(user_photos.index[:-1], user_photos.index[1:]):
            if taken.loc[cur_ix] - taken.loc[prev_ix] > TIME_GAP:
                tid += 1  # pause too long: the current photo opens a new trajectory
            traj_greedy.loc[cur_ix, 'trajID'] = tid
        tid += 1  # for trajectories of the next user
    print_progress(cnt, len(users))

[--------------------------------------------------------------------------------] 100%

<a id='sec3.2'></a>

In [47]:
# 4. Save Trajectory Data
# Save trajectories and related POIs to files.

# `visits` holds every photo that is mapped to a POI (poiID != -1).
visits = traj_greedy[traj_greedy['poiID'] != -1]
# NOTE(review): the alternative below refers to `traj_dp`, which is not defined
# anywhere in the visible notebook.
#visits = traj_dp[traj_dp['poiID'] != -1]

In [48]:
# Preview the mapped photos together with their POI and trajectory IDs.
visits.head()

Unnamed: 0_level_0,userID,photoLon,photoLat,dateTaken,poiID,trajID
photoID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9588963220,67774014@N00,144.96506,-37.815725,1377408461,8,3333
703949177,79925938@N00,144.993074,-37.84657,1183135485,20,3975
775049707,79925938@N00,144.964085,-37.815018,1183823546,18,3977
8687823797,35558720@N03,144.960168,-37.823471,1364043697,25,1487
2676185,79925938@N00,144.969438,-37.816501,1104369016,71,3919


In [49]:
# Save visits data: a copy of `visits` with trajID renamed to seqID.
uservisits = visits.copy()
uservisits.rename(columns={'trajID':'seqID'}, inplace=True)
# Written to `fvisits`, the path constant for 'userVisits-Melb-1.csv' defined with
# the other file names, instead of re-building the same path here.
# NOTE(review): index=False leaves the photoID index out of the file - confirm this is intended.
uservisits.to_csv(fvisits, index=False)

In [50]:
# Save POIs to CSV file: first collect the IDs of the visited POIs in ascending order.
poiix = visits['poiID'].unique().tolist()
poiix.sort()

In [51]:
# Write only the POIs listed in `poiix` to `fpoi_new`; index=True keeps poiID in the file.
poi_df.loc[poiix].to_csv(fpoi_new, index=True)

In [52]:
# Count the number of photos taken at each POI.
# groupby().size() yields the row count per poiID directly (no need to aggregate a
# helper column with np.size); to_frame names the resulting column 'poiFreq'.
poifreq = visits.groupby('poiID').size().to_frame('poiFreq')

In [53]:
# Save data in file format like IJCAI datasets: user visits data, POI related data.
visits_df = visits.copy()
visits_df.reset_index(inplace=True)  # photoID becomes a regular column again
visits_df.drop(['photoLon', 'photoLat'], axis=1, inplace=True)

# The builtin `int` replaces `np.int`, an alias that was removed in NumPy 1.24.
visits_df['dateTaken'] = visits_df['dateTaken'].astype(int)
visits_df.rename(columns={'trajID':'seqID'}, inplace=True)

# Look up theme and photo count per visit by poiID; .tolist() discards the
# poiID-labelled index so values are assigned row by row.
visits_df['poiTheme'] = poi_df.loc[visits_df['poiID'], 'poiTheme'].tolist()
visits_df['poiFreq'] = poifreq.loc[visits_df['poiID'], 'poiFreq'].astype(int).tolist()

In [55]:
# Order the visits chronologically (oldest photo first).
visits_df = visits_df.sort_values(by='dateTaken', ascending=True)

In [54]:
# Preview the visits table with the added poiTheme and poiFreq columns.
visits_df.head()

Unnamed: 0,photoID,userID,dateTaken,poiID,seqID,poiTheme,poiFreq
0,9588963220,67774014@N00,1377408461,8,3333,Shopping,209
1,703949177,79925938@N00,1183135485,20,3975,Shopping,653
2,775049707,79925938@N00,1183823546,18,3977,Shopping,659
3,8687823797,35558720@N03,1364043697,25,1487,Entertainment,1671
4,2676185,79925938@N00,1104369016,71,3919,Parks and spaces,1693


In [56]:
# Save visits data.
# The column list must be a statement of its own: fused onto the comment line it
# was never executed, leaving `cols` undefined on a fresh top-to-bottom run.
cols = ['photoID', 'userID', 'dateTaken', 'poiID', 'poiTheme', 'poiFreq', 'seqID']
# quoting=2 is csv.QUOTE_NONNUMERIC: every non-numeric field is quoted.
visits_df.to_csv(fvisits2, sep=';', quoting=2, columns=cols, index=False)

In [57]:
# Show the path the visits file was written to.
fvisits2

'../data/userVisits-Melb-allPOI.csv'

In [58]:
# POI related data: cost = POI-POI distance (meters), profit = frequency (#photos taken at the second POI).
# Start from an empty table that only defines the column names.
costprofit_df = pd.DataFrame(columns=['from', 'to', 'cost', 'profit', 'category'])

In [59]:
# IDs of all POIs that occur in the visits table, in ascending order.
pois = sorted(visits_df['poiID'].unique())

In [60]:
# One row per ordered pair of distinct POIs (from -> to). Rows are collected in a
# list and turned into a DataFrame once; growing the frame cell by cell with .loc
# re-allocates it for every new row. Columns now get numeric dtypes rather than object.
records = []
for poi1 in pois:
    for poi2 in pois:
        if poi1 == poi2:
            continue
        records.append({
            'from': poi1,
            'to': poi2,
            'cost': poi_distmat.loc[poi1, poi2] * 1000,   # km -> meters
            'profit': poifreq.loc[poi2, 'poiFreq'],       # #photos taken at the destination POI
            'category': poi_df.loc[poi2, 'poiTheme'],     # theme of the destination POI
        })
costprofit_df = pd.DataFrame(records, columns=['from', 'to', 'cost', 'profit', 'category'])

In [61]:
# Preview the first POI pairs with their cost, profit and category.
costprofit_df.head()

Unnamed: 0,from,to,cost,profit,category
0,0,1,1982.34,139,City precincts
1,0,2,1179.18,304,City precincts
2,0,3,2441.85,138,City precincts
3,0,4,1593.2,349,City precincts
4,0,5,1249.32,70,City precincts


In [62]:
# Save POI related data to `fpoi2` as a ';'-separated file without the row index.

# Columns to write, in this order.
cols = ['from', 'to', 'cost', 'profit', 'category']
costprofit_df.to_csv(fpoi2, sep=';', quoting=2, columns=cols, index=False)