In [6]:
"""
Name: Ciaran Cooney
Date: 10/05/2019
Description: Script to apply feature engineering to a dataset containing geolocations and journey time
information. The task is to predict which IDs will be in the city-centre at a given time.
"""

import math
import time

import numpy as np
import pandas as pd

# Load the training CSV into `df`, the working frame that the later cells modify.
df = pd.read_csv('data_train/data_train.csv')

In [7]:
def in_or_out(x,y):
    """
    Flag whether a point lies inside the city-centre bounding box.

    Returns 1 when x is in [3750901.5068, 3770901.5069] and y is in
    [-19268905.6133, -19208905.6133] (both ranges inclusive), otherwise 0.
    """
    x_min, x_max = 3750901.5068, 3770901.5069
    y_min, y_max = -19268905.6133, -19208905.6133

    # Guard clauses: stop as soon as one coordinate falls outside the box.
    if not (x_min <= x <= x_max):
        return 0
    if not (y_min <= y <= y_max):
        return 0
    return 1
    
def journey_time(x,y):
    """
    Return the elapsed time from x to y in seconds.

    Both arguments are parsed with pd.to_datetime before subtracting, so the
    result is negative when y is earlier than x.
    """
    entry = pd.to_datetime(x)
    exit_ = pd.to_datetime(y)
    elapsed = exit_ - entry
    return elapsed.total_seconds()

In [8]:
# Housekeeping on the raw frame; all three calls mutate `df` in place,
# so this cell cannot be re-run without reloading the CSV first.
df.fillna(value=0, inplace=True)  # every NaN, in every column, becomes 0
df.drop(columns=["Unnamed: 0", "hash"], inplace=True)
df.set_index(keys="trajectory_id", inplace=True)

In [16]:
"""
Compute journey time and distance from city centre.
"""
# Reference point for "dist": the midpoint of the bounding box tested by in_or_out().
# NOTE(review): the centre point is not defined anywhere in the notebook; the midpoint
# is an assumption that is roughly consistent with the dist values in the saved
# outputs -- confirm before relying on it.
CENTRE_X = (3750901.5068 + 3770901.5069) / 2
CENTRE_Y = (-19268905.6133 + -19208905.6133) / 2

df["j_time"] = list(map(journey_time, df["time_entry"], df["time_exit"]))

# Vectorised Euclidean distance from each entry point to the centre. The previous
# version called calculateDistance with two arguments, but that helper is defined
# in a later cell and takes four, so this cell failed on a fresh kernel.
df["dist"] = np.hypot(df["x_entry"] - CENTRE_X, df["y_entry"] - CENTRE_Y)

In [10]:
# Show the first rows of the frame (head() returns five rows by default).
df.head()

Unnamed: 0_level_0,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,y_exit,j_time
trajectory_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
traj_0000a8602cf2def930488dee7cdad104_1_0,07:04:31,07:08:32,0.0,0.0,0.0,3751014.0,-19093980.0,3750326.0,-19136340.0,241.0
traj_0000a8602cf2def930488dee7cdad104_1_1,07:20:34,07:25:42,0.0,0.0,0.0,3743937.0,-19322470.0,3744975.0,-19319660.0,308.0
traj_0000a8602cf2def930488dee7cdad104_1_2,07:53:32,08:03:25,0.0,0.0,0.0,3744868.0,-19293560.0,3744816.0,-19292840.0,593.0
traj_0000a8602cf2def930488dee7cdad104_1_3,08:17:50,08:37:23,0.0,0.0,0.0,3744880.0,-19292290.0,3744809.0,-19290490.0,1173.0
traj_0000a8602cf2def930488dee7cdad104_1_4,14:38:09,14:38:09,0.0,0.0,0.0,3744909.0,-19285580.0,3744909.0,-19285580.0,0.0


In [17]:
"""
Compute the net trajectory and the most recent trajectory
for each of the GPS Ids. Assigned here to each tag but 
only used in the final location data.
"""
def net_trajectory(distances):
    """
    Summarise how a sequence of distances changes from one entry to the next.

    Returns a pair (net, recent):
      * net    - sum of distances[i] - distances[i+1] over all consecutive pairs
      * recent - distances[-2] - distances[-1], i.e. the last such step
    With fewer than two distances there is no step, so (0.0, 0.0) is returned.
    """
    n_points = len(distances)
    if n_points < 2:
        return 0.0, 0.0

    steps = [distances[k] - distances[k + 1] for k in range(n_points - 1)]
    latest_step = distances[-2] - distances[-1]
    return np.sum(steps), latest_step
    

"""
Compute mean journey distance.
"""
def calculateDistance(x1,y1,x2=3760901.50685,y2=-19238905.6133):
    """
    Compute the Euclidean distance between two points.

    Parameters:
        x1, y1: coordinates of the first point.
        x2, y2: coordinates of the second point. They default to the midpoint of
            the bounding box tested by in_or_out(), so the two-argument call made
            when building the "dist" column measures distance to that point.
            NOTE(review): treating the midpoint as the city centre is an
            assumption -- confirm.

    Returns:
        The straight-line distance as a float.
    """
    # math.hypot is the stdlib form of sqrt(dx**2 + dy**2).
    return math.hypot(x2 - x1, y2 - y1)

unique_ids = df.index.unique() #extract list of all unique tags

start = time.time()

#####trajectories#####
# Single pass over the index groups instead of re-filtering the whole frame with a
# boolean mask for every id (which scaled with rows x ids). Row order inside each
# group is preserved, so net_trajectory sees the same arrays as before.
trajectories = {
    tag: net_trajectory(dists.values)
    for tag, dists in df["dist"].groupby(level=0, sort=False)
}

# Broadcast each id's (net, recent) pair back onto every row carrying that id.
df["net_tr"] = [trajectories[tag][0] for tag in df.index]
df["prev_tr"] = [trajectories[tag][1] for tag in df.index]

print(f"run time: {time.time()-start} seconds")

df.head()

Unnamed: 0_level_0,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,y_exit,dist,net_tr,prev_tr,j_time
trajectory_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
traj_0000a8602cf2def930488dee7cdad104_1_0,07:04:31,07:08:32,0.0,0.0,0.0,3751014.0,-19093980.0,3750326.0,-19136340.0,145261.881023,99463.898797,3544.948847,241.0
traj_0000a8602cf2def930488dee7cdad104_1_1,07:20:34,07:25:42,0.0,0.0,0.0,3743937.0,-19322470.0,3744975.0,-19319660.0,85266.551966,99463.898797,3544.948847,308.0
traj_0000a8602cf2def930488dee7cdad104_1_2,07:53:32,08:03:25,0.0,0.0,0.0,3744868.0,-19293560.0,3744816.0,-19292840.0,56962.437581,99463.898797,3544.948847,593.0
traj_0000a8602cf2def930488dee7cdad104_1_3,08:17:50,08:37:23,0.0,0.0,0.0,3744880.0,-19292290.0,3744809.0,-19290490.0,55736.470981,99463.898797,3544.948847,1173.0
traj_0000a8602cf2def930488dee7cdad104_1_4,14:38:09,14:38:09,0.0,0.0,0.0,3744909.0,-19285580.0,3744909.0,-19285580.0,49342.931074,99463.898797,3544.948847,0.0


In [20]:
start = time.time()

# "Home" is the first entry position recorded for each id.
# NOTE(review): the original loop iterated `group.index`, but `group` is not defined
# anywhere in this notebook (NameError on a fresh kernel). Grouping on the frame's
# own index matches the trajectory loop above -- confirm this is the intended key.
# transform("first") takes the first non-null value per group; NaNs were already
# replaced with 0 when the frame was cleaned, so this is the first row's value.
first_entry = df.groupby(level=0, sort=False)[["x_entry", "y_entry"]].transform("first")
df["x_home"] = first_entry["x_entry"].to_numpy()
df["y_home"] = first_entry["y_entry"].to_numpy()
print(f"run time: {time.time()-start} seconds")

# 1 when the home position lies inside the city-centre bounding box, else 0.
df["home"] = list(map(in_or_out, df["x_home"], df["y_home"]))

Unnamed: 0_level_0,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,y_exit,dist,net_tr,prev_tr,j_time,x_home,y_home,home
trajectory_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
traj_0000a8602cf2def930488dee7cdad104_1_0,07:04:31,07:08:32,0.0,0.0,0.0,3751014.0,-19093980.0,3750326.0,-19136340.0,145261.881023,99463.898797,3544.948847,241.0,3751014.0,-19093980.0,0
traj_0000a8602cf2def930488dee7cdad104_1_1,07:20:34,07:25:42,0.0,0.0,0.0,3743937.0,-19322470.0,3744975.0,-19319660.0,85266.551966,99463.898797,3544.948847,308.0,3751014.0,-19093980.0,0
traj_0000a8602cf2def930488dee7cdad104_1_2,07:53:32,08:03:25,0.0,0.0,0.0,3744868.0,-19293560.0,3744816.0,-19292840.0,56962.437581,99463.898797,3544.948847,593.0,3751014.0,-19093980.0,0
traj_0000a8602cf2def930488dee7cdad104_1_3,08:17:50,08:37:23,0.0,0.0,0.0,3744880.0,-19292290.0,3744809.0,-19290490.0,55736.470981,99463.898797,3544.948847,1173.0,3751014.0,-19093980.0,0
traj_0000a8602cf2def930488dee7cdad104_1_4,14:38:09,14:38:09,0.0,0.0,0.0,3744909.0,-19285580.0,3744909.0,-19285580.0,49342.931074,99463.898797,3544.948847,0.0,3751014.0,-19093980.0,0


In [21]:
# Relative change in "dist" from each row to the next, in frame order.
# NOTE(review): pct_change() is not grouped by id, so rows at an id boundary are
# compared with the previous id's row, and a zero previous dist yields inf (which
# fillna does not touch) -- confirm both are intended.
dist_change = df["dist"].pct_change()
df["dist_pct_ch"] = dist_change
df.fillna(value=0, inplace=True) # replace NANs with zeros

In [None]:
"""
Only retain those tag_ids corresponding to final locations i.e., between 3pm and 4pm
"""
# Keep the last row recorded for each id. No time filter is applied here.
# NOTE(review): this presumably relies on rows being in chronological order so that
# the last row is the 3pm-4pm one -- confirm.
#
# The previous version appended rows one at a time with .loc, which dropped the
# trajectory_id label, and then called set_index("trajectory_id") on a frame with no
# such column (KeyError). Selecting by index keeps the label and the column dtypes.
last_rows = df[~df.index.duplicated(keep="last")]

# Order the result by each id's first appearance, as the original loop over the
# unique ids did.
final_df = last_rows.reindex(df.index.unique())

In [None]:
# Write the per-id feature table to disk.
# NOTE(review): the input was read from data_train/ but the output goes under
# data_test/ -- confirm the destination path.
final_df.to_csv('data_test/all_features.csv')