## Imports

In [15]:
import pandas as pd
import numpy as np
from datetime import datetime
import time

## Load `train.csv`

In [16]:
# Read the raw training data; the path is resolved relative to the notebook's working directory.
df: pd.DataFrame = pd.read_csv(filepath_or_buffer="train.csv")

In [17]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


## Delete rows where `MISSING_DATA` is true and delete column `MISSING_DATA`

In [18]:
# Remove, in place, every row whose MISSING_DATA flag is set.
df.drop(
    index=df.loc[df["MISSING_DATA"].eq(True)].index,
    inplace=True,
)

In [19]:
# The flag column carries no information once the flagged rows are gone.
df.drop(columns=["MISSING_DATA"], inplace=True)

## Delete column `DAY_TYPE`

In [20]:
# Discard the DAY_TYPE column from the frame (in place).
df.drop(columns=["DAY_TYPE"], inplace=True)

## Set `ORIGIN_STAND` to 0 for entries with null `ORIGIN_STAND`

In [21]:
# Replace null ORIGIN_STAND entries with 0.
# `.at` is a scalar accessor (one row label, one column label); passing it a boolean
# mask is outside its contract and only worked through an internal fallback.
# `fillna` is the dedicated API for this and behaves the same on every index type.
df["ORIGIN_STAND"] = df["ORIGIN_STAND"].fillna(0)

## Set `ORIGIN_CALL` to 0 for entries with null `ORIGIN_CALL`

In [22]:
# Replace null ORIGIN_CALL entries with 0.
# `.at` is a scalar accessor (one row label, one column label); passing it a boolean
# mask is outside its contract and only worked through an internal fallback.
# `fillna` is the dedicated API for this and behaves the same on every index type.
df["ORIGIN_CALL"] = df["ORIGIN_CALL"].fillna(0)

## Delete column `TRIP_ID`

In [23]:
# Discard the TRIP_ID column from the frame (in place).
df.drop(columns=["TRIP_ID"], inplace=True)

## Assigning integers to `CALL_TYPE`

In [24]:
# Encode the CALL_TYPE letters as integers: "A" -> 0, "B" -> 1, "C" -> 2.
# Boolean-mask assignment must go through `.loc`; `.at` is meant for a single
# (row label, column label) cell and only accepted a mask via an internal fallback.
# Values that are not one of the three letters are left untouched, so re-running
# this cell on an already-encoded column is a no-op (same as the original).
for letter, code in {"A": 0, "B": 1, "C": 2}.items():
    df.loc[df["CALL_TYPE"] == letter, "CALL_TYPE"] = code

# The column is still object-typed after the replacements; make it a compact integer.
# This raises if any value could not be converted (e.g. an unexpected letter or a null).
df["CALL_TYPE"] = df["CALL_TYPE"].astype(np.int32)

## Convert `TIMESTAMP` column to `MONTH`, `DAY`, `WEEKDAY`, `HR` and `MIN` columns

In [25]:
# Derive calendar/clock features from the epoch-seconds TIMESTAMP column.
# The timestamps are parsed once and the components are read through the
# vectorised `.dt` accessor, instead of re-parsing the whole column five times.
#
# NOTE(review): `datetime.fromtimestamp` converts to the local timezone of the
# machine running the notebook, so HR/DAY/WEEKDAY can differ between machines.
# That behaviour is kept as-is here; consider pinning an explicit timezone.
local_dt = pd.to_datetime(df["TIMESTAMP"].map(datetime.fromtimestamp))

df["MONTH"] = local_dt.dt.month
df["DAY"] = local_dt.dt.day
df["WEEKDAY"] = local_dt.dt.weekday  # Monday == 0, matching datetime.weekday()
df["HR"] = local_dt.dt.hour
df["MIN"] = local_dt.dt.minute

# The raw timestamp is fully represented by the derived columns above.
df.drop(columns="TIMESTAMP", inplace=True)


## Convert `POLYLINE` column to `TARGET` column.

In [26]:
# Turn each POLYLINE string into a trip duration in seconds (TARGET).
#
# A POLYLINE looks like "[[x1,y1],[x2,y2],...]": one "[" for the outer list plus
# one per coordinate pair, so the number of recorded points is count("[") - 1.
# N points span N - 1 sampling intervals, so the duration is (N - 1) * interval.
# The original multiplied the point count itself by 15, overstating every
# non-empty trip by one interval (and making its max(..., 0) guard a no-op).
# NOTE(review): the 15-second interval and the "(points - 1) * 15" definition
# follow the published dataset description — confirm that is the intended target.
SECONDS_PER_SAMPLE = 15

point_count = df["POLYLINE"].str.count(r"\[") - 1
# clip() keeps empty polylines ("[]", zero points) at 0 instead of going negative;
# the integer cast raises if any POLYLINE was null rather than silently storing NaN.
df["TARGET"] = ((point_count - 1).clip(lower=0) * SECONDS_PER_SAMPLE).astype("int64")

df.drop(columns="POLYLINE", inplace=True)

In [27]:
# Persist the processed frame without the row index.
df.to_csv(path_or_buf="processed_train.csv", index=False)