## Imports

In [31]:
import pandas as pd
import numpy as np
from datetime import datetime
import time

## Load `train.csv`

In [32]:
df: pd.DataFrame = pd.read_csv('data/train.csv')

In [33]:
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA
0,T1,B,,15.0,20000542,1408039037,A,False
1,T2,B,,57.0,20000108,1408038611,A,False
2,T3,B,,15.0,20000370,1408038568,A,False
3,T4,B,,53.0,20000492,1408039090,A,False
4,T5,B,,18.0,20000621,1408039177,A,False


## Delete rows where `MISSING_DATA` is true and delete column `MISSING_DATA`

In [34]:
df.drop(df[df["MISSING_DATA"] == True].index, inplace=True)

In [35]:
df.drop(labels="MISSING_DATA", axis=1, inplace=True)

## Delete column `DAY_TYPE`

In [36]:
df.drop(labels="DAY_TYPE", axis=1, inplace=True)

## Set `ORIGIN_STAND` to 0 for entries with null `ORIGIN_STAND`

In [37]:
df.at[pd.isna(df["ORIGIN_STAND"]) == True, "ORIGIN_STAND"] = 0

## Set `ORIGIN_CALL` to 0 for entries with null `ORIGIN_CALL`

In [38]:
df.at[pd.isna(df["ORIGIN_CALL"]) == True, "ORIGIN_CALL"] = 0

## Delete column `TRIP_ID`

In [39]:
df.drop(labels="TRIP_ID", axis=1, inplace=True)

## Assigning integers to `CALL_TYPE`

In [40]:
df.at[df["CALL_TYPE"] == "A", "CALL_TYPE"] = 0
df.at[df["CALL_TYPE"] == "B", "CALL_TYPE"] = 1
df.at[df["CALL_TYPE"] == "C", "CALL_TYPE"] = 2
df["CALL_TYPE"] = df["CALL_TYPE"].astype(np.int32)

## Convert `TIMESTAMP` column to date + time + day of week

In [41]:
df["MONTH"] = df["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).month)
df["DAY"] = df["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).day)
df["WEEKDAY"] = df["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).weekday())
df["HR"] = df["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).hour)
df["MIN"] = df["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).minute)
df.drop(labels="TIMESTAMP", axis=1, inplace=True)


## Convert `POLYLINE` column to `TARGET` column.

In [42]:
df["TARGET"] = df["POLYLINE"].apply(lambda x : 15 * max(x.count("[") - 1, 0))
df.drop(labels="POLYLINE", axis=1, inplace=True)

KeyError: 'POLYLINE'

In [43]:
df.head()

Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,MONTH,DAY,WEEKDAY,HR,MIN
0,1,0.0,15.0,20000542,8,14,3,10,57
1,1,0.0,57.0,20000108,8,14,3,10,50
2,1,0.0,15.0,20000370,8,14,3,10,49
3,1,0.0,53.0,20000492,8,14,3,10,58
4,1,0.0,18.0,20000621,8,14,3,10,59


In [46]:
df.to_csv("data/processed_train.csv", index=False)