# Preprocess raw data for counting vehicles visiting the selected spot.

## Read and preprocess vehicle GPS logs
- The input dataset is assumed to have the fields below:
 - serial: ID of GPS loggers on vehicles.
 - tripcount: ID of a trip (the span between the start and the end of a drive).
 - tripid: Timestamp of the trip start.
 - tlm_datagettime: Timestamp of each lat/lon log.
 - lat: latitude logged by GPS.
 - lon: longitude logged by GPS.
- The fields 'serial' and 'tlm_datage' are renamed to 'ap_id' and 'timestamp' in functions.py.

In [None]:
import pandas as pd
import os
from functions import read_input_data, prepare_trip_summary, merge_consecutive_trips_in_single_day

In [None]:
# --- Configuration -------------------------------------------------------
input_csv = '2019-04_2019-08_GyokuSendo.csv'
output_dir = 'output/'

# Stay-time threshold in seconds (1800 s = 30 min), handed to
# merge_consecutive_trips_in_single_day; per the original author's note,
# entries with stay_time below this value are removed.
threshold_in_sec = 1800

# Create the output directory up front; an existing directory is not an error.
os.makedirs(output_dir, exist_ok=True)

# Destination of the final per-date table.
# NOTE(review): input_csv keeps its '.csv' suffix, so the resulting name ends
# in '.csv_staypoints.csv'.
preprocessed_data = f'{output_dir}{input_csv}_staypoints.csv'
# daily_vehicle_csv = output_dir + input_csv + '_dailyvehicle.csv'

# --- Pipeline: load logs -> summarise trips -> merge into stay points ------
df = read_input_data(input_csv)
trip_df = prepare_trip_summary(df)
df_staypoints = merge_consecutive_trips_in_single_day(
    trip_df,
    threshold_in_sec,
)
# print(len(trip_df), len(df_staypoints))

## Count and save daily unique vehicles

In [None]:
# Keep a single row per (date, ap_id) pair so that each vehicle is counted
# at most once per date.
df_daily_unique_ap = df_staypoints[['date', 'ap_id']].drop_duplicates(
    subset=['date', 'ap_id']
)

# Count the remaining ap_id rows per date. agg(['count']) yields two-level
# column labels for the count column; reset_index turns 'date' back into a
# regular column.
df_daily = (
    df_daily_unique_ap
    .groupby('date')
    .agg(['count'])
    .reset_index()
)
# df_daily.to_csv(daily_vehicle_csv, index=False)

In [None]:
# Preview the first rows of df (the result of read_input_data).
df.head()

In [None]:
# Preview the first rows of df_staypoints
# (the result of merge_consecutive_trips_in_single_day).
df_staypoints.head()

## Separate the date field into smaller units: year, month and day of week

In [None]:
# Parse the 'date' column once and derive every calendar field from the
# parsed values.
parsed_dates = pd.to_datetime(df_staypoints['date'])

df_staypoints['date'] = parsed_dates
df_staypoints['year'] = parsed_dates.dt.year
df_staypoints['month'] = parsed_dates.dt.month
# 'day' is the day of the WEEK, not of the month: Monday=0, Tuesday=1 ... Sunday=6.
df_staypoints['day'] = parsed_dates.dt.dayofweek

# Disabled alternatives kept from the original notebook:
# df_staypoints['day_num'] = df_staypoints['date'].dt.day
# df_staypoints['date'] = df_staypoints['ts_car_start'].dt.date

df_staypoints.head()

## How many months and days

In [None]:
# Distinct values present in the 'month' and 'day' columns, shown as a tuple.
# 'day' is the day of week (0=Monday ... 6=Sunday).
df_staypoints.month.unique(), df_staypoints.day.unique()

## total data points in SELECTED region

In [None]:
# Number of rows in df_staypoints.
len(df_staypoints)

## how many cars

In [None]:
# Number of distinct ap_id values (GPS logger IDs) in df_staypoints.
len(df_staypoints.ap_id.unique())

## Number of cars by month

In [None]:
#df.groupby(['month']).agg(['mean', 'count'])
# Count of non-null ap_id rows for each month.
# NOTE(review): this counts rows, not distinct vehicles -- an ap_id that
# appears several times in a month contributes once per row.
df_staypoints.groupby('month')[['ap_id']].agg(['count'])

## number of cars by day

In [None]:
# Count of non-null 'ap_id' and 'month' rows for each day of week
# ('day': 0=Monday ... 6=Sunday).
# NOTE(review): this counts rows, not distinct vehicles.
df_staypoints.groupby('day')[['ap_id', 'month']].agg(['count'])

## Total count for each day of the week

In [None]:
# Non-null value count of every remaining column, grouped by day of week
# ('day': 0=Monday ... 6=Sunday).
df_staypoints.groupby('day').count()

## Prepare data formats and save in csv

In [None]:
# Build the output table: one row per (date, month, day) group with the
# number of non-null ap_id rows in that group stored as 'car_count'.
# NOTE(review): rows are not de-duplicated by ap_id here, so a vehicle with
# several stay points on the same date is counted once per stay point.
df_preprocessed = (
    df_staypoints[['ap_id', 'date', 'month', 'day']]
    .groupby(['date', 'month', 'day'])
    .count()
    .rename(columns={'ap_id': 'car_count'})
    .reset_index()  # group keys 'date', 'month', 'day' become normal columns
)
df_preprocessed.head()

In [None]:
# Number of rows in df_preprocessed (one per date/month/day group).
len(df_preprocessed)

In [None]:
# Write df_preprocessed to the path in preprocessed_data, omitting the index.
df_preprocessed.to_csv(preprocessed_data,index=False)