# Preprocess raw data for a selected spot (e.g. Gyoku Sendo)

1. Three months of parkingbreak data (the rows whose 18th CSV field equals 1) are saved in the *data/* directory:
    - gzip -dc /mnt/lv/heromiya/OkinawaVisitorPred/2018-12_2019-04_2019-08.csv.gz | awk 'BEGIN{FS=","}$18==1{print}' > data/parkingbreak1-3month.csv


2.  Subset the point data around the tourism spot 'Gyoku Sendo' and save it as *data/sendoRegion_3months.csv*. The selected region of Gyoku Sendo is around 127.748361,26.139007 (example extent: 127.74563156,26.13900734,127.75200790,26.14219240).
    - The figure below shows the *extent of the selected data*.
    ![GyokuSendo.png](data/GyokuSendo.png)

### Read and preprocess *2019-04_2019-08_GyokuSendo.csv*
- The field names 'serial' and 'tlm_datage' are renamed to 'ap_id' and 'timestamp'.

In [1]:
import pandas as pd
from functions import read_input_data, prepare_trip_summary, merge_consecutive_trips_in_single_day

In [2]:
# Stay-time threshold (seconds) handed to merge_consecutive_trips_in_single_day below.
threshold_in_sec = 1800 # remove entries with stay_time < 1800 seconds
# Raw input for the selected region, and the CSV path this notebook writes at the end.
input_csv='/mnt/lv/bidur/OkinawaVisitorPred/data/2019-04_2019-08_GyokuSendo.csv'
preprocessed_data='/mnt/lv/bidur/OkinawaVisitorPred/data/2019-04_2019-08_GyokuSendo_staypoints.csv'
# Pipeline built from the project helpers imported from the `functions` module.
# NOTE(review): presumably load raw rows -> per-trip summary -> merge trips within
# a day using the threshold; this reading follows the helper names only, confirm
# against their implementation.
df = read_input_data( input_csv )
trip_df = prepare_trip_summary(df)
df_staypoints = merge_consecutive_trips_in_single_day(trip_df, threshold_in_sec)
# Debug aid: compare the row counts before and after merging.
#print(len(trip_df), len(df_staypoints))

In [3]:
# Preview the first rows of the stay-point table.
df_staypoints.head()

Unnamed: 0,ap_id,date,trip_count_prevs,trip_count,ts_prevs_stop,ts_start,stay_time,lon,lat
0,AP521745,2019-04-01,2080.0,2081.0,2019-04-01 08:49:43,2019-04-01 11:14:20,8677.0,127.748598,26.141213
1,AP521745,2019-04-04,2101.0,2102.0,2019-04-04 12:15:45,2019-04-04 18:05:37,20992.0,127.747382,26.141103
2,AP521745,2019-04-08,2123.0,2124.0,2019-04-08 11:23:41,2019-04-08 13:51:25,8864.0,127.749557,26.141103
3,AP521745,2019-04-13,2152.0,2153.0,2019-04-13 10:59:40,2019-04-13 12:58:27,7127.0,127.750141,26.140792
4,AP521745,2019-04-25,2269.0,2270.0,2019-04-25 10:32:25,2019-04-25 14:24:14,13909.0,127.748954,26.141355


### Separate the date field into smaller units: year, month and day of week

In [4]:
# Parse the date column once, then derive the calendar parts from the parsed values.
_visit_dates = pd.to_datetime(df_staypoints['date'])
df_staypoints['date'] = _visit_dates
df_staypoints['year'] = _visit_dates.dt.year
df_staypoints['month'] = _visit_dates.dt.month
# 'day' holds the day of the week: Monday=0, Tuesday=1, ..., Sunday=6
df_staypoints['day'] = _visit_dates.dt.dayofweek
df_staypoints.head()

Unnamed: 0,ap_id,date,trip_count_prevs,trip_count,ts_prevs_stop,ts_start,stay_time,lon,lat,year,month,day
0,AP521745,2019-04-01,2080.0,2081.0,2019-04-01 08:49:43,2019-04-01 11:14:20,8677.0,127.748598,26.141213,2019,4,0
1,AP521745,2019-04-04,2101.0,2102.0,2019-04-04 12:15:45,2019-04-04 18:05:37,20992.0,127.747382,26.141103,2019,4,3
2,AP521745,2019-04-08,2123.0,2124.0,2019-04-08 11:23:41,2019-04-08 13:51:25,8864.0,127.749557,26.141103,2019,4,0
3,AP521745,2019-04-13,2152.0,2153.0,2019-04-13 10:59:40,2019-04-13 12:58:27,7127.0,127.750141,26.140792,2019,4,5
4,AP521745,2019-04-25,2269.0,2270.0,2019-04-25 10:32:25,2019-04-25 14:24:14,13909.0,127.748954,26.141355,2019,4,3


#### Which months and days of the week are present

In [5]:
# Distinct month numbers and day-of-week codes present in the data.
df_staypoints['month'].unique(), df_staypoints['day'].unique()

(array([4, 8]), array([0, 3, 5, 4, 2, 6, 1]))

### total data points in GyokuSendo region

In [6]:
# Total number of rows in the stay-point table.
len(df_staypoints)

4847

### How many distinct cars (unique `ap_id` values)

In [7]:
# Number of distinct ap_id values; dropna=False keeps a missing id counted as
# one value, matching the length of unique().
df_staypoints['ap_id'].nunique(dropna=False)

1488

### Number of stay points (car visits) by month

In [8]:
# Count the ap_id entries recorded in each month.
df_staypoints.groupby('month')[['ap_id']].agg(['count'])

Unnamed: 0_level_0,ap_id
Unnamed: 0_level_1,count
month,Unnamed: 1_level_2
4,2289
8,2558


### Number of stay points (car visits) by day of week (0 = Monday ... 6 = Sunday)

In [9]:
# Count the ap_id and month entries recorded for each day of the week.
df_staypoints.groupby('day')[['ap_id', 'month']].agg(['count'])

Unnamed: 0_level_0,ap_id,month
Unnamed: 0_level_1,count,count
day,Unnamed: 1_level_2,Unnamed: 2_level_2
0,706,706
1,706,706
2,680,680
3,784,784
4,648,648
5,682,682
6,641,641


### Row counts of every column for each day of the week

In [10]:
# Non-null value count of every other column, grouped by day of the week.
df_staypoints.groupby('day').count()

Unnamed: 0_level_0,ap_id,date,trip_count_prevs,trip_count,ts_prevs_stop,ts_start,stay_time,lon,lat,year,month
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,706,706,706,706,706,706,706,706,706,706,706
1,706,706,706,706,706,706,706,706,706,706,706
2,680,680,680,680,680,680,680,680,680,680,680
3,784,784,784,784,784,784,784,784,784,784,784
4,648,648,648,648,648,648,648,648,648,648,648
5,682,682,682,682,682,682,682,682,682,682,682
6,641,641,641,641,641,641,641,641,641,641,641


### Prepare desired data and save in csv

In [11]:
# Daily table: one row per (date, month, day) group with the number of ap_id
# entries in that group, exposed as 'car_count'.
df_preprocessed = (
    df_staypoints
    .groupby(['date', 'month', 'day'])['ap_id']
    .count()
    .rename('car_count')
    # the group keys come back as the index -> turn them into ordinary columns
    .reset_index()
)
df_preprocessed.head()

Unnamed: 0,date,month,day,car_count
0,2019-04-01,4,0,72
1,2019-04-02,4,1,66
2,2019-04-03,4,2,103
3,2019-04-04,4,3,89
4,2019-04-05,4,4,58


In [12]:
# Number of rows in the daily table (one row per date/month/day group).
len(df_preprocessed)

61

In [13]:
# Write the daily table to the preprocessed_data path, without the row index.
df_preprocessed.to_csv(preprocessed_data,index=False)

In [14]:
# Show the first rows of the table that was written to CSV.
df_preprocessed.head()

Unnamed: 0,date,month,day,car_count
0,2019-04-01,4,0,72
1,2019-04-02,4,1,66
2,2019-04-03,4,2,103
3,2019-04-04,4,3,89
4,2019-04-05,4,4,58
