## Imports

In [112]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from datetime import datetime


## Load `train.csv`

In [113]:
df_train: pd.DataFrame = pd.read_csv("../data/train.csv")

df_test: pd.DataFrame = pd.read_csv("../data/test_public.csv")

In [114]:
df_train.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


### Delete useless rows/Columns

In [115]:
# Remove rows with MISSING_DATA == true
df_train.drop(df_train[df_train["MISSING_DATA"] == True].index, inplace=True)
df_train.drop(labels="MISSING_DATA", axis=1, inplace=True)
# Not really necessary to remove rows for test set, do remove columns though :)
df_test.drop(labels="MISSING_DATA", axis=1, inplace=True)

# Delete column DAY_TYPE
df_train.drop(labels="DAY_TYPE", axis=1, inplace=True)
df_test.drop(labels="DAY_TYPE", axis=1, inplace=True)

# Delete column TRIP_ID
df_train.drop(labels="TRIP_ID", axis=1, inplace=True)
df_test.drop(labels="TRIP_ID", axis=1, inplace=True)


### Encode categorical

In [116]:
df_train.at[pd.isna(df_train["ORIGIN_CALL"]) == True, "ORIGIN_CALL"] = 0
df_train.at[pd.isna(df_train["ORIGIN_STAND"]) == True, "ORIGIN_STAND"] = 0

df_test.at[pd.isna(df_test["ORIGIN_CALL"]) == True, "ORIGIN_CALL"] = 0
df_test.at[pd.isna(df_test["ORIGIN_STAND"]) == True, "ORIGIN_STAND"] = 0

# df_train.drop(labels="ORIGIN_CALL", axis=1, inplace=True)
# df_test.drop(labels="ORIGIN_CALL", axis=1, inplace=True)

### Convert ORIGIN_CALL to indices for embeddings

In [117]:
# NOTE: EMBEDDING INFO STUFF
# TODO: Convert ORIGIN_CALL to embeddings, impute missing values
# Look for caller's with 1 total taxi ride, set them all to some default index (Training set)
# Look for callers in test set not in training set -> set them to the default index
# Set nn.embeddings pad_idx to that default idx
val_cnt = df_train["ORIGIN_CALL"].value_counts().to_dict()
df_train["ORIGIN_CALL"] = df_train["ORIGIN_CALL"].map(lambda x : x  if val_cnt[x] > 1 else 0 )

multi_occurences = set(df_train["ORIGIN_CALL"].unique())
new_col = df_test["ORIGIN_CALL"]
for i, x in enumerate(new_col):
    if x not in multi_occurences:
        new_col[i] = 0
df_test["ORIGIN_CALL"] = new_col

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_col[i] = 0


In [118]:
LE_ORIGIN_CALL = LabelEncoder()
LE_ORIGIN_CALL.fit(df_train["ORIGIN_CALL"])
df_train["ORIGIN_CALL"] = LE_ORIGIN_CALL.transform(df_train["ORIGIN_CALL"])

LE_ORIGIN_STAND = LabelEncoder()
LE_ORIGIN_STAND.fit(df_train["ORIGIN_STAND"])
df_train["ORIGIN_STAND"] = LE_ORIGIN_STAND.transform(df_train["ORIGIN_STAND"])


df_test["ORIGIN_CALL"] = LE_ORIGIN_CALL.transform(df_test["ORIGIN_CALL"])
df_test["ORIGIN_STAND"] = LE_ORIGIN_STAND.transform(df_test["ORIGIN_STAND"])

In [119]:
# one-hot encoding CALL_TYPE
df_train = df_train.join(pd.get_dummies(df_train["CALL_TYPE"]))
df_train.drop(labels="CALL_TYPE", axis=1, inplace=True)

df_test = df_test.join(pd.get_dummies(df_test["CALL_TYPE"]))
df_test.drop(labels="CALL_TYPE", axis=1, inplace=True)

## Assign indices to TAXI_ID for embeddings

In [120]:
# TODO: set taxi id to indices
LE_TAXI_ID = LabelEncoder()
LE_TAXI_ID.fit(df_train["TAXI_ID"])
df_train["TAXI_ID"] = LE_TAXI_ID.transform(df_train["TAXI_ID"])

df_test["TAXI_ID"] = LE_TAXI_ID.transform(df_test["TAXI_ID"])

In [121]:
df_train.head()

Unnamed: 0,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,POLYLINE,A,B,C
0,0,0,367,1372636858,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",0,0,1
1,0,7,371,1372637303,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",0,1,0
2,0,0,204,1372636951,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",0,0,1
3,0,0,330,1372636854,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",0,0,1
4,0,0,217,1372637091,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",0,0,1


## Convert `TIMESTAMP` column to quarter hours, week of year, etc.

In [122]:
# df_train["YEAR"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).year)
# df_train["WK_OF_YR"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).isocalendar().week)
# df_train["WK_DAY"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).weekday()) # WHY ARE YOU LIKE THIS??? ALL THE OTHERS ARE FIELDS
# df_train["QTR_HR"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).hour + 0.25 * (datetime.fromtimestamp(x).minute // 15))

# df_train.drop(labels="TIMESTAMP", axis=1, inplace=True)


# df_test["YEAR"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).year)
# df_test["WK_OF_YR"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).isocalendar().week)
# df_test["WK_DAY"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).weekday()) # WHY ARE YOU LIKE THIS??? ALL THE OTHERS ARE FIELDS
# df_test["QTR_HR"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).hour + 0.25 * (datetime.fromtimestamp(x).minute // 15))

# df_test.drop(labels="TIMESTAMP", axis=1, inplace=True)

df_train["YEAR"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).year)
df_train["MONTH"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).month)
df_train["DAY"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).day)
df_train["WEEKDAY"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).weekday()) # WHY ARE YOU LIKE THIS??? ALL THE OTHERS ARE FIELDS
df_train["HR"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).hour)
# df_train["MIN"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).minute)
df_train.drop(labels="TIMESTAMP", axis=1, inplace=True)

df_test["YEAR"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).year)
df_test["MONTH"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).month)
df_test["DAY"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).day)
df_test["WEEKDAY"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).weekday())
df_test["HR"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).hour)
# df_test["MIN"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).minute)
df_test.drop(labels="TIMESTAMP", axis=1, inplace=True)


In [123]:
df_train.head()

Unnamed: 0,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,POLYLINE,A,B,C,YEAR,MONTH,DAY,WEEKDAY,HR,MIN
0,0,0,367,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",0,0,1,2013,6,30,6,17,0
1,0,7,371,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",0,1,0,2013,6,30,6,17,8
2,0,0,204,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",0,0,1,2013,6,30,6,17,2
3,0,0,330,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",0,0,1,2013,6,30,6,17,0
4,0,0,217,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",0,0,1,2013,6,30,6,17,4


## Convert `POLYLINE` column to `TARGET` column.

In [124]:
df_train["TARGET"] = df_train["POLYLINE"].apply(lambda x : 15 * max(x.count("[") - 1, 0))
df_train.drop(labels="POLYLINE", axis=1, inplace=True)

## Get information for embeddings later

In [125]:
# print(f"CALL_TYPE: 3")
# print(f"ORIGIN_CALL: {len(df_train['ORIGIN_CALL'].unique())}")
# print(f"ORIGIN_STAND: {len(df_train['ORIGIN_STAND'].unique())}")
# print(f"TAXI_ID: {len(df_train['TAXI_ID'].unique())}")
# print(f"Unique week #: {len(df_train['WK_OF_YR'].unique())}")
# print(f"Unique qtr hrs #: {len(df_train['QTR_HR'].unique())}")

## Export

In [126]:
df_test.head()

Unnamed: 0,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,A,B,C,YEAR,MONTH,DAY,WEEKDAY,HR,MIN
0,0,15,342,0,1,0,2014,8,14,3,10,57
1,0,57,78,0,1,0,2014,8,14,3,10,50
2,0,15,239,0,1,0,2014,8,14,3,10,49
3,0,53,309,0,1,0,2014,8,14,3,10,58
4,0,18,393,0,1,0,2014,8,14,3,10,59


### Remove outliers

In [127]:
# df_train = df_train[df_train["TARGET"] > 30]
# df_train = df_train[df_train["TARGET"] < 15000]

### [0, 1] Normalization

In [128]:
df_train["YEAR"] -= 2013
df_test["YEAR"] -= 2013

In [129]:
df_train.head()

Unnamed: 0,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,A,B,C,YEAR,MONTH,DAY,WEEKDAY,HR,MIN,TARGET
0,0,0,367,0,0,1,0,6,30,6,17,0,345
1,0,7,371,0,1,0,0,6,30,6,17,8,285
2,0,0,204,0,0,1,0,6,30,6,17,2,975
3,0,0,330,0,0,1,0,6,30,6,17,0,645
4,0,0,217,0,0,1,0,6,30,6,17,4,435


#### Split training set

In [130]:
df_train.to_csv("../data/no_coord_train.csv", index=True)
# df_valid.to_csv("../data/original_validv3.csv", index=False)
df_test.to_csv("../data/no_coord.test.csv", index=False)
