### Introduction
Continuing from the data collection step, we review the collected data and perform the necessary cleaning.

In [3]:
# Import libraries
import pandas as pd
import numpy as np
from ast import literal_eval
# show every column when a wide DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

In [4]:
# Load the April 2022 data; the coordinate column is stored as text in the CSV,
# so parse it back into nested Python lists while reading.
df = pd.read_csv(
    '2022Apr.csv',
    index_col=0,
    converters={'geometry.coordinates': literal_eval},
)

In [7]:
# discard the index read from the file and renumber the rows 0..n-1
df = df.reset_index(drop=True)

In [9]:
# preview the first row to check the loaded columns
df.head(1)

Unnamed: 0,type,geometry.type,geometry.coordinates,properties.timestamp,properties.taxi_count,properties.api_info.status,time
0,Feature,MultiPoint,"[[103.63974, 1.32945], [103.65257, 1.31732], [...",2022-03-31T23:51:53+08:00,2434,healthy,2022-04-01 00:00:00


In [10]:
# look at the parsed coordinate list of the first row (index label 0)
df.loc[0, 'geometry.coordinates']

[[103.63974, 1.32945],
 [103.65257, 1.31732],
 [103.65571, 1.32321],
 [103.65583, 1.32328],
 [103.65641, 1.31544],
 [103.66221, 1.31078],
 [103.66629, 1.32211],
 [103.66686, 1.31223],
 [103.67004, 1.3222],
 [103.67005, 1.32344],
 [103.67426, 1.32418],
 [103.67681, 1.32663],
 [103.67851, 1.32715],
 [103.68294, 1.33],
 [103.6856, 1.32965],
 [103.685928133333, 1.34051135],
 [103.68721, 1.31317],
 [103.68732, 1.35],
 [103.68773, 1.31],
 [103.68883, 1.34],
 [103.688955166667, 1.34319666666667],
 [103.68975, 1.34937],
 [103.69041, 1.34216],
 [103.69115, 1.3462],
 [103.69152, 1.31496],
 [103.69215, 1.32323],
 [103.692743733333, 1.33846771666667],
 [103.6933, 1.34],
 [103.69353, 1.34],
 [103.69511, 1.34],
 [103.69534, 1.35179],
 [103.69604, 1.34962],
 [103.69621, 1.35],
 [103.696228166667, 1.3453775],
 [103.69623, 1.31501],
 [103.6966, 1.34323],
 [103.69662, 1.34],
 [103.69669, 1.38],
 [103.6973, 1.34],
 [103.69866, 1.34155],
 [103.69886, 1.3368],
 [103.6989857, 1.35492261666667],
 [103.69995,

Each coordinate pair is arranged as a longitude value followed by a latitude value. 
Each pair indicates the presence of a taxi at that location. For simplicity, we divide Singapore into a 3x3 grid. We can estimate the ranges of latitude and longitude using this site: https://gps-coordinates.org/singapore-latitude.php. 
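* For the purpose of this project, the latitude ranges from 1.15 to 1.48 and the longitude ranges from 103.6 to 104.1. Splitting each range into three equal bands puts the sector boundaries at latitudes 1.26 and 1.37 and at longitudes of roughly 103.77 and 103.93.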
* Starting from the top-left corner of the 3x3 grid, the sectors are numbered 1, 2, 3 across the first row, 4, 5, 6 across the second row and, lastly, 7, 8, 9 across the third row.
* As an example, a taxi with a latitude of at least 1.37 and a longitude of at most 103.77 would be in sector 1.

In [13]:
# Sector grid boundaries. Splitting the longitude range 103.6-104.1 and the
# latitude range 1.15-1.48 into three equal bands gives these cut points.
LON_WEST_MAX = 103.77   # longitude <= this value            -> left column (sectors 1, 4, 7)
LON_MID_MAX = 103.93    # LON_WEST_MAX < longitude <= this   -> middle column (sectors 2, 5, 8)
                        # longitude > this value             -> right column (sectors 3, 6, 9)
LAT_NORTH_MIN = 1.37    # latitude >= this value             -> top row (sectors 1, 2, 3)
LAT_MID_MIN = 1.26      # this value <= latitude < LAT_NORTH_MIN -> middle row (sectors 4, 5, 6)
                        # latitude < this value              -> bottom row (sectors 7, 8, 9)


def count_taxis_by_sector(coords):
    """Count how many coordinate pairs fall in each of the nine grid sectors.

    Parameters
    ----------
    coords : iterable
        Coordinate pairs, each indexed as ``pair[0]`` = longitude and
        ``pair[1]`` = latitude.

    Returns
    -------
    list of int
        Nine counts. Index 0 is sector 1 (top left) and index 8 is sector 9
        (bottom right); sectors are numbered row by row, left to right.
    """
    counts = [0] * 9
    for pair in coords:
        lon, lat = pair[0], pair[1]

        # column 0/1/2 = left/middle/right.
        # NOTE: the middle/right boundary is 103.93; with a boundary outside the
        # 103.6-104.1 range (e.g. 109.93) the right-hand sectors would never be hit.
        if lon <= LON_WEST_MAX:
            col = 0
        elif lon <= LON_MID_MAX:
            col = 1
        else:
            col = 2

        # row 0/1/2 = top/middle/bottom
        if lat >= LAT_NORTH_MIN:
            row = 0
        elif lat >= LAT_MID_MIN:
            row = 1
        else:
            row = 2

        counts[row * 3 + col] += 1
    return counts


# one list of nine sector counts per dataframe row
counts_per_row = [count_taxis_by_sector(coords) for coords in df['geometry.coordinates']]

# split into nine per-sector lists (one entry per dataframe row), used to
# populate the sector_1 ... sector_9 columns
s1, s2, s3, s4, s5, s6, s7, s8, s9 = (
    [row_counts[i] for row_counts in counts_per_row] for i in range(9)
)

In [14]:
# attach each sector's per-row counts to the dataframe as sector_1 ... sector_9
sector_lists = (s1, s2, s3, s4, s5, s6, s7, s8, s9)
for sector_no, sector_counts in enumerate(sector_lists, start=1):
    df[f'sector_{sector_no}'] = sector_counts

In [15]:
# parse the "time" strings into datetime values, then confirm the resulting dtype
time_as_datetime = pd.to_datetime(df['time'])
df['time'] = time_as_datetime
df['time'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 8353 entries, 0 to 8352
Series name: time
Non-Null Count  Dtype         
--------------  -----         
8353 non-null   datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 65.4 KB


In [16]:
# derive calendar features from the 'time' column, one new column per datetime attribute
for part in ('day_of_week', 'minute', 'hour'):
    df[part] = getattr(df['time'].dt, part)

In [17]:
# keep only the columns needed downstream: timestamp, total count,
# calendar features and the nine sector counts
sector_cols = [f'sector_{n}' for n in range(1, 10)]
cols_use = ['time', 'properties.taxi_count', 'day_of_week', 'minute', 'hour'] + sector_cols
use_df = df.loc[:, cols_use].copy()
use_df.shape

(8353, 14)

In [18]:
# write the trimmed dataset to disk as CSV
output_path = 'use.csv'
use_df.to_csv(output_path)