## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from datetime import datetime

## Load `train.csv`

In [2]:
df: pd.DataFrame = pd.read_csv('data/train.csv')

In [3]:
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


## Delete rows where `MISSING_DATA` is true and delete column `MISSING_DATA`

In [4]:
df.drop(df[df["MISSING_DATA"] == True].index, inplace=True)
df.drop(labels="MISSING_DATA", axis=1, inplace=True)

## Delete column `DAY_TYPE`

In [5]:
df.drop(labels="DAY_TYPE", axis=1, inplace=True)

## Delete column `TRIP_ID`

In [6]:
df.drop(labels="TRIP_ID", axis=1, inplace=True)

## Set `ORIGIN_STAND` and `ORIGIN_CALL` to 0 for entries with null `ORIGIN_STAND`

In [7]:
df.at[pd.isna(df["ORIGIN_CALL"]) == True, "ORIGIN_CALL"] = 0
df.at[pd.isna(df["ORIGIN_STAND"]) == True, "ORIGIN_STAND"] = 0
# TODO: encoding

In [8]:
# NOTE: EMBEDDINGS
LE_ORIGIN_CALL = LabelEncoder()
LE_ORIGIN_CALL.fit(df["ORIGIN_CALL"])
df["ORIGIN_CALL"] = LE_ORIGIN_CALL.transform(df["ORIGIN_CALL"])
LE_ORIGIN_STAND = LabelEncoder()
LE_ORIGIN_STAND.fit(df["ORIGIN_STAND"])
df["ORIGIN_STAND"] = LE_ORIGIN_STAND.transform(df["ORIGIN_STAND"])

## Assigning integers to `CALL_TYPE`

In [9]:
# TODO: one-hot encoding?
df = df.join(pd.get_dummies(df["CALL_TYPE"]))

In [10]:
df.drop(labels="CALL_TYPE", axis=1, inplace=True)

## Assign indices to TAXI_ID for embeddings

In [11]:
# TODO: set taxi id to indices
LE_TAXI_ID = LabelEncoder()
LE_TAXI_ID.fit(df["TAXI_ID"])
df["TAXI_ID"] = LE_TAXI_ID.transform(df["TAXI_ID"])

In [12]:
df.head()

Unnamed: 0,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,POLYLINE,A,B,C
0,0,0,367,1372636858,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",0,0,1
1,0,7,371,1372637303,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",0,1,0
2,0,0,204,1372636951,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",0,0,1
3,0,0,330,1372636854,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",0,0,1
4,0,0,217,1372637091,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",0,0,1


## Convert `TIMESTAMP` column to date + time + day of week

In [13]:
df["YEAR"] = df["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).year)
df["MONTH"] = df["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).month)
df["DAY"] = df["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).day)
df["WEEKDAY"] = df["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).weekday())
df["HR"] = df["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).hour)
df["MIN"] = df["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).minute)
df.drop(labels="TIMESTAMP", axis=1, inplace=True)


## Convert `POLYLINE` column to `TARGET` column.

In [14]:
df["TARGET"] = df["POLYLINE"].apply(lambda x : 15 * max(x.count("[") - 1, 0))
df.drop(labels="POLYLINE", axis=1, inplace=True)

## Get information for embeddings later

In [18]:
# print(f"CALL_TYPE: {len(df['CALL_TYPE'].unique())}")
print(f"ORIGIN_CALL: {len(df['ORIGIN_CALL'].unique())}")
print(f"ORIGIN_STAND: {len(df['ORIGIN_STAND'].unique())}")
print(f"TAXI_ID: {len(df['TAXI_ID'].unique())}")

ORIGIN_CALL: 57106
ORIGIN_STAND: 64
TAXI_ID: 448


## Export

In [17]:
df.head()

Unnamed: 0,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,A,B,C,YEAR,MONTH,DAY,WEEKDAY,HR,MIN,TARGET
0,0,0,367,0,0,1,2013,6,30,6,17,0,345
1,0,7,371,0,1,0,2013,6,30,6,17,8,285
2,0,0,204,0,0,1,2013,6,30,6,17,2,975
3,0,0,330,0,0,1,2013,6,30,6,17,0,645
4,0,0,217,0,0,1,2013,6,30,6,17,4,435


In [None]:
df.to_csv("data/processed_train.csv", index=False)