In [None]:
import pandas as pd
from sklearn import preprocessing, metrics
import numpy as np
from datetime import datetime
import ast

In [None]:
# Read the three raw CSV inputs from ./data: the training trips, the public
# test trips, and the metadata file (taxi stand ids / names / GPS locations,
# going by its file name).
df_train: pd.DataFrame
df_test: pd.DataFrame
df_meta: pd.DataFrame
df_train, df_test, df_meta = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/train.csv",
        "./data/test_public.csv",
        "./data/metaData_taxistandsID_name_GPSlocation.csv",
    )
)


### Get start and end coordinates

In [None]:
def parse_row_polyline(df: "pd.DataFrame | None" = None):
    """Extract the first and last point of every trip's POLYLINE.

    Parameters
    ----------
    df : DataFrame, optional
        Frame with a string ``POLYLINE`` column holding a Python/JSON-style
        list of ``[x, y]`` pairs. Defaults to the notebook-level ``df_train``.

    Returns
    -------
    (start_coords, end_coords) : tuple of ndarray
        Two float arrays of shape ``(len(df), 2)``; row ``i`` holds the first /
        last pair of the i-th polyline (by position, not by index label).
        Rows whose polyline is empty are left as ``(0, 0)``.
    """
    # Pull the column out once: indexing ``df.iloc[i]`` inside the loop would
    # build a full row Series for every single trip.
    polylines = (df_train if df is None else df)["POLYLINE"]

    start_coords = np.zeros((len(polylines), 2))
    end_coords = np.zeros((len(polylines), 2))
    for i, raw in enumerate(polylines):
        points = ast.literal_eval(raw)
        if len(points) == 0:
            # No GPS fixes recorded; keep the zero placeholder.
            continue
        start_coords[i] = points[0]
        end_coords[i] = points[-1]
    return start_coords, end_coords

# The start/end coordinates are cached on disk as .npy files. Load the cache
# when it is present; otherwise parse the polylines once and write the cache,
# so the notebook also runs top to bottom on a fresh checkout.
# NOTE: delete the .npy files if train.csv changes, the cache is not validated.
try:
    start_coords = np.load("start_full.npy")
    end_coords = np.load("end_full.npy")
except FileNotFoundError:
    start_coords, end_coords = parse_row_polyline()
    np.save("start_full.npy", start_coords)
    np.save("end_full.npy", end_coords)


In [None]:
# Attach the parsed coordinates to the training frame: column 0 of each array
# goes into the *_LONG column and column 1 into the *_LAT column.
for column, values in (
    ("START_LONG", start_coords[:, 0]),
    ("START_LAT", start_coords[:, 1]),
    ("END_LONG", end_coords[:, 0]),
    ("END_LAT", end_coords[:, 1]),
):
    df_train[column] = values

In [None]:
# Preview the first training rows, including the START_*/END_* columns.
df_train.head()

In [None]:
import matplotlib.pyplot as plt

# Median start longitude/latitude over the trips whose CALL_TYPE is "A".
call_type_a_trips = df_train[df_train["CALL_TYPE"] == "A"]
central_long = call_type_a_trips["START_LONG"].median()
central_lat = call_type_a_trips["START_LAT"].median()

### Delete unneeded rows and columns

In [None]:
# Discard the training trips flagged MISSING_DATA == True.
# The test frame keeps all of its rows.
missing_mask = df_train["MISSING_DATA"] == True
df_train.drop(index=df_train.index[missing_mask], inplace=True)

# Remove the MISSING_DATA, DAY_TYPE and TRIP_ID columns from both frames.
for frame in (df_train, df_test):
    frame.drop(columns=["MISSING_DATA", "DAY_TYPE", "TRIP_ID"], inplace=True)


### Encode categorical

In [None]:
# One-hot encode CALL_TYPE; each indicator column is named after its category.
train_dummies = pd.get_dummies(df_train["CALL_TYPE"])

# Build the test indicators against the *training* categories so that both
# frames always end up with the same columns in the same order, even if the
# (much smaller) test set is missing a category or contains an unseen one.
# When the categories already agree this is a no-op.
test_dummies = (
    pd.get_dummies(df_test["CALL_TYPE"])
    .reindex(columns=train_dummies.columns, fill_value=0)
    .astype(train_dummies.dtypes.to_dict())  # keep the dtype get_dummies chose
)

df_train = df_train.join(train_dummies).drop(columns="CALL_TYPE")
df_test = df_test.join(test_dummies).drop(columns="CALL_TYPE")

In [None]:
# Taxi id's -> indices
# Map TAXI_ID values to integer indices. The encoder is fitted on the training
# ids only and then applied to both frames, so train and test share one mapping.
LE_TAXI_ID = preprocessing.LabelEncoder().fit(df_train["TAXI_ID"])
for frame in (df_train, df_test):
    frame["TAXI_ID"] = LE_TAXI_ID.transform(frame["TAXI_ID"])

In [None]:
# ORIGIN_CALL's -> indices
# Bucket 0 collects every "uninformative" caller: missing ids and ids that
# occur only once in the training data.

# Missing caller id -> 0. (.loc, not .at: .at is a scalar accessor and does
# not accept a boolean mask.)
df_train.loc[df_train["ORIGIN_CALL"].isna(), "ORIGIN_CALL"] = 0

# Ids seen only once in train -> 0.
val_cnt = df_train["ORIGIN_CALL"].value_counts().to_dict()
train_occurrences = df_train["ORIGIN_CALL"].map(val_cnt)
df_train["ORIGIN_CALL"] = df_train["ORIGIN_CALL"].where(train_occurrences > 1, 0)

# Test ids that are not among the surviving training ids (including NaN) -> 0,
# so the encoder fitted below never meets a label it has not seen.
multi_occurences = set(df_train["ORIGIN_CALL"].unique())
new_col = df_test["ORIGIN_CALL"].where(
    df_test["ORIGIN_CALL"].isin(list(multi_occurences)), 0
)
df_test["ORIGIN_CALL"] = new_col

# Fit on train only, then apply the same mapping to both frames.
LE_ORIGIN_CALL = preprocessing.LabelEncoder()
LE_ORIGIN_CALL.fit(df_train["ORIGIN_CALL"])
df_train["ORIGIN_CALL"] = LE_ORIGIN_CALL.transform(df_train["ORIGIN_CALL"])
df_test["ORIGIN_CALL"] = LE_ORIGIN_CALL.transform(df_test["ORIGIN_CALL"])


In [None]:
# Show how many training rows carry each encoded ORIGIN_CALL value.
df_train["ORIGIN_CALL"].value_counts()

In [None]:
# Encode ORIGIN_STAND
LE_ORIGIN_STAND = preprocessing.LabelEncoder()
LE_ORIGIN_STAND.fit(df_train["ORIGIN_STAND"])
df_train["ORIGIN_STAND"] = LE_ORIGIN_STAND.transform(df_train["ORIGIN_STAND"])
df_test["ORIGIN_STAND"] = LE_ORIGIN_STAND.transform(df_test["ORIGIN_STAND"])

### Datetime

In [None]:
def _add_calendar_features(frame):
    """Replace ``TIMESTAMP`` in ``frame`` (in place) with calendar features.

    Adds the integer columns YEAR, WK_OF_YR (ISO week number), WK_DAY
    (Monday == 0) and HR, then drops TIMESTAMP.

    NOTE(review): ``datetime.fromtimestamp`` converts to the *local* timezone
    of the machine running the notebook, so the derived values can differ
    between machines. Confirm whether a fixed timezone is wanted.
    """
    # Convert every timestamp exactly once and derive all features from it.
    moments = [datetime.fromtimestamp(ts) for ts in frame["TIMESTAMP"]]
    frame["YEAR"] = [m.year for m in moments]
    # Index 1 of isocalendar() is the ISO week; indexing (rather than `.week`)
    # also works on Python versions where the result is a plain tuple.
    frame["WK_OF_YR"] = [m.isocalendar()[1] for m in moments]
    # Unlike year/hour, weekday is a method, not an attribute.
    frame["WK_DAY"] = [m.weekday() for m in moments]
    frame["HR"] = [m.hour for m in moments]
    frame.drop(labels="TIMESTAMP", axis=1, inplace=True)


_add_calendar_features(df_train)
_add_calendar_features(df_test)
# TODO: consider feeding the raw timestamp to the NN instead of these
# hand-derived calendar features.

### Create target column

In [None]:
def _trip_seconds(polyline):
    """Return the trip duration in seconds encoded by a POLYLINE string.

    The string is a list of ``[lon, lat]`` pairs, so it contains one "[" for
    the outer list plus one per point. The dataset defines travel time as
    (number of points - 1) * 15 seconds, i.e. one 15-second interval between
    consecutive points; polylines with zero or one point yield 0.
    """
    n_points = polyline.count("[") - 1  # discount the outer list bracket
    return 15 * max(n_points - 1, 0)


df_train["TARGET"] = df_train["POLYLINE"].apply(_trip_seconds)
df_train.drop(labels="POLYLINE", axis=1, inplace=True)

In [None]:
# Preview the first rows of the processed training frame.
df_train.head()

In [None]:
# Preview the first rows of the processed test frame.
df_test.head()

In [None]:
# Persist both processed frames as CSV (without the index column).
for frame, out_path in (
    (df_train, "./data/processed_trainv6.csv"),
    (df_test, "./data/processed_testv6.csv"),
):
    frame.to_csv(out_path, index=False)

### Continue in `finalize.ipynb`