# 2. Data Processing


In the following section, we will process the data to enhance opportunities for further analysis. We will divide the data into 5-minute intervals, create a GeoDataFrame to work with this dataset as spatial data, create lines (directions) for each object based on point data, and calculate various statistics for each object.


Import libraries


In [3]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import LineString

Reading data


Reading the result from the previous section


In [4]:
# Load the cleaned observations saved as CSV under data/
data_cleaned = pd.read_csv(filepath_or_buffer='data/1_data_cleaned.csv')

### Dividing data into classes based on 5-minute intervals


We divide the dataset into 5-minute intervals for the final estimations in the data quality assessment.


In [5]:
# Width of one time class: 5 minutes expressed in milliseconds
# (assumes 'timestamp' holds epoch milliseconds — TODO confirm)
milliseconds = pd.Timedelta(minutes=5).total_seconds() * 1000

# Earliest observation; time classes are counted from this instant
min_timestamp = data_cleaned['timestamp'].min()

# Number of whole 5-minute intervals elapsed since the first observation
data_cleaned['time_class'] = (
    data_cleaned['timestamp']
    .sub(min_timestamp)
    .div(milliseconds)
    .astype(int)
)

data_cleaned.head()

Unnamed: 0.1,Unnamed: 0,object_id,timestamp,heading,height,width,length,v,tracking_status,object_type,lon,lat,time_class
0,0,152997118,1712062811083,132.072,0.735,1.942,4.387,0.03,TRACKING,CAR,13.064405,47.810136,0
1,1,152997118,1712062811183,132.286,0.731,1.861,4.32,0.04,TRACKING,CAR,13.064405,47.810136,0
2,2,152997118,1712062811283,132.229,0.739,1.955,4.395,0.03,TRACKING,CAR,13.064405,47.810136,0
3,3,152997118,1712062811386,131.98,0.729,1.942,4.367,0.04,TRACKING,CAR,13.064405,47.810135,0
4,4,152997118,1712062811485,132.02,0.737,1.875,4.328,0.02,TRACKING,CAR,13.064405,47.810136,0


### Creating GeoData


Based on the coordinates provided in the CSV file, we create points as spatial data in a GeoDataFrame.


In [6]:
# Build point geometries from the lon/lat columns and wrap the table in a
# GeoDataFrame with CRS 4326
points_gdf = gpd.GeoDataFrame(
    data_cleaned,
    geometry=gpd.points_from_xy(x=data_cleaned['lon'], y=data_cleaned['lat']),
    crs=4326,
)

points_gdf.head()

Unnamed: 0.1,Unnamed: 0,object_id,timestamp,heading,height,width,length,v,tracking_status,object_type,lon,lat,time_class,geometry
0,0,152997118,1712062811083,132.072,0.735,1.942,4.387,0.03,TRACKING,CAR,13.064405,47.810136,0,POINT (13.06440 47.81014)
1,1,152997118,1712062811183,132.286,0.731,1.861,4.32,0.04,TRACKING,CAR,13.064405,47.810136,0,POINT (13.06441 47.81014)
2,2,152997118,1712062811283,132.229,0.739,1.955,4.395,0.03,TRACKING,CAR,13.064405,47.810136,0,POINT (13.06440 47.81014)
3,3,152997118,1712062811386,131.98,0.729,1.942,4.367,0.04,TRACKING,CAR,13.064405,47.810135,0,POINT (13.06441 47.81014)
4,4,152997118,1712062811485,132.02,0.737,1.875,4.328,0.02,TRACKING,CAR,13.064405,47.810136,0,POINT (13.06441 47.81014)


#### Calculate the distance between the first and the last point


As one of the characteristics for the analysis, we calculate the distance between the first and the last point of each object.


In [7]:
# Reproject to the estimated UTM CRS so the distances below are measured in
# a projected coordinate system instead of in degrees
points_gdf = points_gdf.to_crs(crs=points_gdf.estimate_utm_crs())

def calculate_distance(group):
    """Return the distance between the first and the last geometry of a group.

    ``group`` is a table of rows for one object (as handed over by
    ``groupby(...).apply``) with a 'geometry' column whose values provide a
    ``distance`` method. Rows are taken in their existing order.

    Returns None when the group holds fewer than two rows.
    """
    if len(group) <= 1:
        return None

    geometries = group['geometry']
    start, end = geometries.iloc[0], geometries.iloc[-1]
    return start.distance(end)

# One first-to-last distance per object_id, returned as a two-column table
distances = (
    points_gdf
    .groupby('object_id')
    .apply(calculate_distance)
    .reset_index(name='distance_first_last')
)

# Attach each object's distance to all of its points, then switch the
# geometries back to EPSG:4326
points_gdf = (
    points_gdf
    .merge(distances, how='left', on='object_id')
    .to_crs(epsg=4326)
)

points_gdf.head()

Unnamed: 0.1,Unnamed: 0,object_id,timestamp,heading,height,width,length,v,tracking_status,object_type,lon,lat,time_class,geometry,distance_first_last
0,0,152997118,1712062811083,132.072,0.735,1.942,4.387,0.03,TRACKING,CAR,13.064405,47.810136,0,POINT (13.06440 47.81014),0.165151
1,1,152997118,1712062811183,132.286,0.731,1.861,4.32,0.04,TRACKING,CAR,13.064405,47.810136,0,POINT (13.06441 47.81014),0.165151
2,2,152997118,1712062811283,132.229,0.739,1.955,4.395,0.03,TRACKING,CAR,13.064405,47.810136,0,POINT (13.06440 47.81014),0.165151
3,3,152997118,1712062811386,131.98,0.729,1.942,4.367,0.04,TRACKING,CAR,13.064405,47.810135,0,POINT (13.06441 47.81014),0.165151
4,4,152997118,1712062811485,132.02,0.737,1.875,4.328,0.02,TRACKING,CAR,13.064405,47.810136,0,POINT (13.06441 47.81014),0.165151


#### Calculate average heading and velocity change


As one of the characteristics for the analysis, we calculate the average heading and velocity change for each object to explore possible anomalies based on these parameters.


In [8]:
def calculate_average_change(df, value_field):
    """Add the per-row change and per-object average change of a field.

    Two columns are added to a new frame:

    * ``<value_field>_change`` - difference between a row's value and the
      previous row of the same ``object_id`` (0 for an object's first row).
    * ``avg_<value_field>_change`` - mean of those differences per
      ``object_id``, repeated on every row of that object.

    Parameters
    ----------
    df : DataFrame with an 'object_id' column and a ``value_field`` column.
        Rows are used in their existing order.
    value_field : name of the numeric column to differentiate.

    Returns
    -------
    A new frame with both columns added. The input ``df`` is left unchanged.
    """
    change_field = f'{value_field}_change'
    avg_change_field = f'avg_{value_field}_change'

    # Change between consecutive rows of the same object. This is a plain
    # subtraction: NOTE(review) for angular fields such as 'heading' a step
    # across 360/0 degrees is counted as a large change - confirm intended.
    changes = df.groupby('object_id')[value_field].diff().fillna(0)

    # assign() returns a new frame, so the caller's ``df`` is not modified
    with_change = df.assign(**{change_field: changes})

    # Mean change per object, named after the target column
    avg_change = (
        with_change
        .groupby('object_id')[change_field]
        .mean()
        .rename(avg_change_field)
    )

    # Broadcast the per-object mean back onto every row of that object
    return with_change.merge(avg_change, on='object_id', how='left')

In [9]:
# Add the per-point change and the per-object average change columns for
# both heading and velocity
for value_field in ('heading', 'v'):
    points_gdf = calculate_average_change(points_gdf, value_field)

### Creating Lines from Point Observations


Creating lines from the point observations to see the actual path of each object


In [10]:
def create_linestring(group):
    """Build a LineString from a sequence of points.

    ``group`` is passed straight to ``LineString``. Returns None when it
    holds fewer than two elements.
    """
    return LineString(group) if len(group) > 1 else None


# Collect each object's points (in row order) into one LineString;
# objects with a single point yield None
lines = (
    points_gdf
    .groupby('object_id')['geometry']
    .apply(lambda geoms: create_linestring(geoms.tolist()))
)

# Wrap the lines in a GeoDataFrame with object_id as a regular column
lines_gdf = (
    gpd.GeoDataFrame(lines, geometry='geometry', crs=4326)
    .reset_index()
    .rename(columns={'index': 'object_id'})
)

# Keep only the objects for which a line could be built
lines_gdf = lines_gdf[lines_gdf['geometry'].notna()]


### Calculate Statistics for each line


Calculating statistics for each line: overall time of movement and the median heading, velocity, width, height and length (stored in the `avg_*` columns)


In [12]:
# Observed time span per object: latest minus earliest timestamp
time_diff = points_gdf.groupby('object_id')['timestamp'].apply(lambda ts: ts.max() - ts.min())

# Per-object medians of the motion and size fields. The avg_* names are kept
# for the following cells, but the statistic is the median, not the mean.
medians = points_gdf.groupby('object_id')[['heading', 'v', 'width', 'length', 'height']].median()
avg_heading = medians['heading']
avg_v = medians['v']
avg_width = medians['width']
avg_length = medians['length']
avg_height = medians['height']

# One row per object_id holding all statistics computed above
statistics_df = pd.DataFrame({
    'total_time': time_diff,
    'avg_heading': avg_heading,
    'avg_v': avg_v,
    'avg_width': avg_width,
    'avg_length': avg_length,
    'avg_height': avg_height,
})

Adding important Object characteristics from an initial DataFrame


In [13]:
# Copy per-object attributes from the point table into the statistics table.
# For each column the first value found per object_id is taken; the values
# are assigned by position, relying on both tables being ordered by
# object_id through groupby.
for column in (
    'object_type',
    'time_class',
    'avg_heading_change',
    'avg_v_change',
    'distance_first_last',
):
    statistics_df[column] = points_gdf.groupby('object_id')[column].first().values


Calculate whether the type of the same object has changed


In [14]:
# Number of observations of every object_type per object_id
# (one column per type, 0 where a type never occurs)
object_type_counts = points_gdf.groupby(['object_id', 'object_type']).size().unstack(fill_value=0)

# Share of each type within an object, in percent
object_type_percentages = object_type_counts.div(object_type_counts.sum(axis=1), axis=0) * 100

# All distinct types recorded for an object, comma separated
unique_object_types = points_gdf.groupby('object_id')['object_type'].apply(
    lambda types: ', '.join(types.unique())
)
statistics_df['unique_object_types'] = unique_object_types

# Append the per-type count columns
statistics_df = statistics_df.join(object_type_counts, on='object_id')

# Overwrite object_type with the most frequent type of each object and
# record how dominant that type is
top_object_type = object_type_percentages.idxmax(axis=1)
top_object_type_percentage = object_type_percentages.max(axis=1)
statistics_df['object_type'] = top_object_type
statistics_df['top_object_type_percentage'] = top_object_type_percentage

# Turn the object_id index into a regular column
statistics_df = statistics_df.reset_index()

Merging Statistics to GeoDataFrames with lines for each object


In [15]:
# Left merge: every line is kept and receives the statistics of its object_id
lines_gdf_stat = lines_gdf.merge(
    statistics_df,
    how='left',
    on='object_id',
)

Calculating total time and route length


In [16]:
# Total time in seconds (total_time divided by 1000)
lines_gdf_stat['total_time_s'] = lines_gdf_stat['total_time'].div(1000)

# Route length, measured after reprojecting to the estimated UTM CRS
lines_gdf_stat = lines_gdf_stat.to_crs(crs=lines_gdf.estimate_utm_crs())
lines_gdf_stat['route_length'] = lines_gdf_stat['geometry'].length

# Ratio of the travelled route length to the straight distance between the
# first and the last point
lines_gdf_stat['direct_real_d_ratio'] = lines_gdf_stat['route_length'].div(
    lines_gdf_stat['distance_first_last']
)

# Back to EPSG:4326
lines_gdf_stat = lines_gdf_stat.to_crs(epsg=4326)

lines_gdf_stat.head()

Unnamed: 0,object_id,geometry,total_time,avg_heading,avg_v,avg_width,avg_length,avg_height,object_type,time_class,...,avg_v_change,distance_first_last,unique_object_types,CAR,CYCLIST,PEDESTRIAN,top_object_type_percentage,route_length,total_time_s,direct_real_d_ratio
0,152997118,"LINESTRING (13.06440 47.81014, 13.06441 47.810...",286302,133.78,0.03,1.869,4.387,1.151,CAR,0,...,4e-06,0.165151,CAR,2748,0,0,100.0,102.985753,286.302,623.58591
1,152997181,"LINESTRING (13.06399 47.81006, 13.06399 47.810...",138700,148.052,0.77,0.559,0.5585,1.332,PEDESTRIAN,0,...,-0.000261,65.637374,PEDESTRIAN,0,0,1342,100.0,110.534391,138.7,1.684016
2,152997182,"LINESTRING (13.06413 47.81005, 13.06413 47.810...",270701,139.9335,0.02,1.912,4.523,1.69,CAR,0,...,4e-06,0.086359,CAR,2616,0,0,100.0,60.296137,270.701,698.199732
3,152997183,"LINESTRING (13.06339 47.80977, 13.06339 47.809...",101799,105.466,0.01,1.872,4.867,1.566,CAR,0,...,0.009214,119.628432,CAR,980,0,0,100.0,144.550702,101.799,1.208331
4,152997184,"LINESTRING (13.06339 47.80978, 13.06340 47.809...",100897,97.4585,0.01,1.974,5.0,1.615,CAR,0,...,0.006307,111.192417,CAR,972,0,0,100.0,144.484505,100.897,1.29941


In this section, we processed the data, created a GeoDataFrame from the CSV file, and calculated summary parameters for further analysis.


### Saving Final Result


In [18]:
#lines_gdf_stat.to_file('data/2_lines_gdf_stat.gpkg')
#points_gdf.to_file('data/2_points_gdf.gpkg')