In [1]:
import pandas as pd
from ptrail.core.TrajectoryDF import PTRAILDataFrame
from ptrail.preprocessing.filters import Filters as filters
from ptrail.features.kinematic_features import KinematicFeatures
from ptrail.features.temporal_features import TemporalFeatures
from ptrail.preprocessing.interpolation import Interpolation
from ptrail.utilities.conversions import Conversions as con
import ptrail.utilities.constants as const
import datetime as dt

# Reading the gulls dataset and converting to PTRAILDataFrame.
# Also, lets, print the first 5 points of the dataset to
# see how the dataframe looks.
gulls = pd.read_csv('https://raw.githubusercontent.com/YakshHaranwala/PTRAIL/main/examples/data/gulls.csv')
gulls = PTRAILDataFrame(gulls,
                        latitude='location-lat',
                        longitude='location-long',
                        datetime='timestamp',
                        traj_id='tag-local-identifier',
                        rest_of_columns=[])

In [2]:
pdf = pd.read_csv('https://raw.githubusercontent.com/YakshHaranwala/PTRAIL/main/examples/data/geolife_sample.csv')
geolife = PTRAILDataFrame(data_set=pdf,
                             latitude='lat',
                             longitude='lon',
                             datetime='datetime',
                             traj_id='id')


In [3]:
atlantic = pd.read_csv('https://raw.githubusercontent.com/YakshHaranwala/PTRAIL/main/examples/data/atlantic.csv')
atlantic = con.convert_directions_to_degree_lat_lon(atlantic, 'Latitude',"Longitude")
def convert_to_datetime(row):
        this_date = '{}-{}-{}'.format(str(row['Date'])[0:4], str(row['Date'])[4:6], str(row['Date'])[6:])
        this_time = '{:02d}:{:02d}:00'.format(int(row['Time']/100), int(str(row['Time'])[-2:]))
        return '{} {}'.format(this_date, this_time)
atlantic['DateTime'] = atlantic.apply(convert_to_datetime, axis=1)
atlantic = PTRAILDataFrame(atlantic,
                          latitude='Latitude',
                          longitude='Longitude',
                          datetime='DateTime',
                          traj_id='ID',
                          rest_of_columns=[])
atlantic = TemporalFeatures.create_date_column(atlantic)

In [4]:
pdf = pd.read_csv('https://raw.githubusercontent.com/YakshHaranwala/PTRAIL/main/examples/data/starkey.csv')
starkey = PTRAILDataFrame(data_set=pdf,
                          latitude='lat',
                          longitude='lon',
                          datetime='DateTime',
                          traj_id='Id')

In [5]:
t1 = gulls.reset_index().loc[gulls.reset_index()[const.TRAJECTORY_ID] == '91732'][[const.TRAJECTORY_ID, const.DateTime, const.LAT, const.LONG]]
t1 = PTRAILDataFrame(data_set=t1,
                     datetime='DateTime',
                     traj_id='traj_id',
                     latitude='lat',
                     longitude='lon')

In [6]:
def segment_traj_by_week(df):
    """
        Given a dataframe containing trajectory data, segment all
        the trajectories by each week.

        Parameters
        ----------
            df: PTRAILDataFrame
                The dataframe containing trajectory data.

        Returns
        -------
            pandas.core.dataframe.DataFrame:
                The dataframe containing segmented trajectories
                with a new column added called segment_id
    """
    # First, create the date column and get all the unique traj_ids
    # in the dataframe..
    df = TemporalFeatures.create_date_column(df)
    ids_ = list(df.traj_id.value_counts().keys())

    # Get the ideal number of IDs by which the dataframe is to be split.
    df_chunks = []
    for i in range(len(ids_)):
        small_df = df.reset_index().loc[df.reset_index()[const.TRAJECTORY_ID] == ids_[i]]
        df_chunks.append(small_df)

    # Now, iterate over the entire dataframe and then segment
    # the trajectories by 1 week each.
    results = []
    for i in range(len(ids_)):
        # Take the traj_df of a single Trajectory out from the
        # list of chunks and find their max and min timestamps.
        traj = df_chunks[i]
        t_max = traj.reset_index()[const.DateTime].max()
        t_min = traj.reset_index()[const.DateTime].min()

        # For iteration purposes, set t_1 to min and t_2 to
        # t_1 + 7 days.
        t_1 = t_min
        t_2 = t_1 + dt.timedelta(days=7)
        seg_id = 1

        # Now, segment the trajectories into smaller segments
        # wherein each segment contains the points of a span
        # of 7 days only.
        while t_2 < t_max:
            if t_2 < t_max:
                seg = filters.filter_by_date(traj,
                                             start_date=t_1.strftime('%Y-%m-%d'),
                                             end_date=t_2.strftime('%Y-%m-%d'))
                # Once filtered, assign the segment with a segment ID.
                seg['seg_id'] = seg_id

                # Increment the segment id, t_1 and t_2 values by
                # 1, 7 days and 7 days respectively to continue
                # the iteration.
                t_1 += dt.timedelta(days=7)
                t_2 += dt.timedelta(days=7)
                results.append(seg.drop(columns=['index', 'level_0']))

            # If, t_2 is greater than the max time present in the
            # trajectory, then assign t_2 = max and proceed
            # further with segmentation.
            elif t_2 >= t_max:
                seg = filters.filter_by_date(traj,
                                             start_date=t_1.strftime('%Y-%m-%d'),
                                             end_date=t_max.strftime('%Y-%m-%d'))
                # Once filtered, assign the segment with a segment ID.
                seg['seg_id'] = seg_id

                # Increment the segment id, t_1 and t_2 values by
                # 1, 7 days and 7 days respectively to continue
                # the iteration.
                t_1 += dt.timedelta(days=7)
                t_2 += dt.timedelta(days=7)
                results.append(seg.drop(columns=['index', 'level_0']))
            seg_id += 1

    # Finally, concat the dataframes, set the index as
    # [traj_id, seg_id, DateTime].
    return pd.concat(results).reset_index().set_index(['traj_id', 'seg_id', 'DateTime'])

In [7]:
%%time

seg = segment_traj_by_week(starkey)

CPU times: user 3min 44s, sys: 416 ms, total: 3min 44s
Wall time: 3min 44s


In [8]:
print(len(starkey))

287136


In [9]:
%%time

# Now, generating the stats of kinematic features for
# all the segments of the trajectories.

# First, splitting the dataframe into several parts
# wherein each part has a segment of the trajectory only.
ids_ = list(seg.reset_index()['traj_id'].value_counts().keys())

df_chunks = []
for i in range(len(ids_)):
    small_df = seg.reset_index().loc[seg.reset_index()[const.TRAJECTORY_ID] == ids_[i]]
    df_chunks.append(small_df)

final_chunks = []
for i in range(len(ids_)):
    seg_ids = list(df_chunks[i]['seg_id'].value_counts().keys())
    for j in range(len(seg_ids)):
        mini_df = df_chunks[i].loc[df_chunks[i]['seg_id'] == seg_ids[j]]
        final_chunks.append(mini_df)

print(len(final_chunks))

4976
CPU times: user 53.4 s, sys: 7.59 ms, total: 53.4 s
Wall time: 53.5 s


In [10]:
def segment_features(df):
    """
        Generate all the kinematic features on the smaller segments.

        Parameters
        ----------
            df: PTRAILDataFrame/pd.DataFrame
                The dataframe containing the trajectory segment.

        Returns
        -------
            pd.core.dataframe.DataFrame:
                The dataframe containing the segment kinematic
                feature stats.
    """
    seg_id = final_chunks[i]['seg_id'].iloc[0]
    stat_df = KinematicFeatures.generate_kinematic_stats(df)
    stat_df['seg_id'] = seg_id
    stat_df = stat_df.reset_index().set_index(['traj_id', 'seg_id', 'Columns'])

    return stat_df

In [11]:
%%time
# Now, generating kinematic features and their stats on each of
# the dataframe here.
import progressbar

final_results = []
bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
for i in range(len(final_chunks)):
    final_results.append(segment_features(final_chunks[i]))
    bar.update(i)

final_answer = pd.concat(final_results)

| |                                  #             | 4975 Elapsed Time: 0:23:08

CPU times: user 11min 48s, sys: 7min 19s, total: 19min 7s
Wall time: 23min 10s


In [12]:
final_answer.head(40)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,std,min,10%,25%,50%,75%,90%,max
traj_id,seg_id,Columns,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9.10313e+42,1,Distance,170.9843,274.1726,0.0,29.92242,42.39008,94.65811,182.0309,334.7766,2636.556398
9.10313e+42,1,Distance_from_start,3145.115,1134.059,67.090679,1306.805,2670.747,3462.603,3779.523,4526.595,4898.249093
9.10313e+42,1,Speed,0.1282633,0.3343164,0.0,0.01679873,0.02441513,0.05059941,0.1027442,0.2046628,3.171325
9.10313e+42,1,Acceleration,5.894095e-05,0.0005653138,-0.001597,-6.218602e-05,-1.880435e-05,3.038938e-07,2.347378e-05,7.305094e-05,0.005417
9.10313e+42,1,Jerk,1.785475e-07,1.89618e-06,-6e-06,-5.969154e-08,-1.442411e-08,5.252046e-09,2.785344e-08,1.009022e-07,2.6e-05
9.10313e+42,1,Bearing,170.4456,107.0074,0.0,32.50344,77.54472,178.9034,259.3982,314.0006,358.909521
9.10313e+42,1,Bearing_Rate,-0.003246468,0.1582499,-0.845313,-0.1282694,-0.06152395,7.113328e-08,0.07041098,0.1268936,1.760332
9.10313e+42,1,Rate_of_bearing_rate,-0.003246468,0.1582499,-0.845313,-0.1282694,-0.06152395,7.113328e-08,0.07041098,0.1268936,1.760332
9.10313e+42,3,Distance,213.3147,362.6186,0.0,29.92231,59.84461,119.8888,217.8897,453.4294,3614.75463
9.10313e+42,3,Distance_from_start,2246.943,1040.147,108.142513,1214.26,1500.664,1924.909,2701.221,4069.313,5048.42082


In [None]:

#