In [1]:
import pandas as pd
from sklearn import preprocessing, metrics
import numpy as np
from datetime import datetime
import ast

In [2]:
df_train: pd.DataFrame = pd.read_csv("../data/train.csv")
df_test: pd.DataFrame = pd.read_csv("../data/test_public.csv")
df_meta: pd.DataFrame = pd.read_csv("../data/metaData_taxistandsID_name_GPSlocation.csv")

### Get starting coordinates

In [3]:
def parse_row_polyline():
    start_coords = np.zeros((len(df_train), 2))
    end_coords = np.zeros((len(df_train), 2))
    for i in range(len(df_train)):
        row = ast.literal_eval(df_train.iloc[i]["POLYLINE"])
        if len(row) == 0:
            continue
        start_coords[i] = row[0]
        end_coords[i] = row[-1]
    return start_coords, end_coords

# start_coords, end_coords = parse_row_polyline()
# np.save("start_full.npy", start_coords)
# np.save("end_full.npy", end_coords)
start_coords = np.load("../start_full.npy")
end_coords = np.load("../end_full.npy")


In [4]:
df_train["START_LONG"] = start_coords[:, 0]
df_train["START_LAT"] = start_coords[:, 1]
df_train["END_LONG"] = end_coords[:, 0]
df_train["END_LAT"] = end_coords[:, 1]

In [5]:
df_train.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,START_LONG,START_LAT,END_LONG,END_LAT
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",-8.618643,41.141412,-8.630838,41.154489
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",-8.639847,41.159826,-8.66574,41.170671
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",-8.612964,41.140359,-8.61597,41.14053
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",-8.574678,41.151951,-8.607996,41.142915
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",-8.645994,41.18049,-8.687268,41.178087


In [6]:
import matplotlib.pyplot as plt
central_long = df_train[df_train["CALL_TYPE"] == "A"]["START_LONG"].median()
central_lat = df_train[df_train["CALL_TYPE"] == "A"]["START_LAT"].median()

### Delete useless rows/Columns

In [7]:
# Remove rows with MISSING_DATA == true
df_train.drop(df_train[df_train["MISSING_DATA"] == True].index, inplace=True)
df_train.drop(labels="MISSING_DATA", axis=1, inplace=True)
# Not really necessary to remove rows for test set, do remove columns though :)
df_test.drop(labels="MISSING_DATA", axis=1, inplace=True)

# Delete column DAY_TYPE
df_train.drop(labels="DAY_TYPE", axis=1, inplace=True)
df_test.drop(labels="DAY_TYPE", axis=1, inplace=True)

# Delete column TRIP_ID
df_train.drop(labels="TRIP_ID", axis=1, inplace=True)
df_test.drop(labels="TRIP_ID", axis=1, inplace=True)


### Encode categorical

In [8]:
# one-hot encoding for call types
df_train = df_train.join(pd.get_dummies(df_train["CALL_TYPE"]))
df_train.drop(labels="CALL_TYPE", axis=1, inplace=True)

df_test = df_test.join(pd.get_dummies(df_test["CALL_TYPE"]))
df_test.drop(labels="CALL_TYPE", axis=1, inplace=True)

In [9]:
# Taxi id's -> indices
LE_TAXI_ID = preprocessing.LabelEncoder()
LE_TAXI_ID.fit(df_train["TAXI_ID"])
df_train["TAXI_ID"] = LE_TAXI_ID.transform(df_train["TAXI_ID"])

df_test["TAXI_ID"] = LE_TAXI_ID.transform(df_test["TAXI_ID"])

In [10]:
# ORIGIN_CALL's -> indices
df_train.at[pd.isna(df_train["ORIGIN_CALL"]) == True, "ORIGIN_CALL"] = 0
val_cnt = df_train["ORIGIN_CALL"].value_counts().to_dict()
df_train["ORIGIN_CALL"] = df_train["ORIGIN_CALL"].map(lambda x : x  if val_cnt[x] > 1 else 0 )

# Only give ORIGIN_CALL's that appear more than once any special treatment in indexing
multi_occurences = set(df_train["ORIGIN_CALL"].unique())
new_col = df_test["ORIGIN_CALL"]
for i, x in enumerate(new_col):
    if x not in multi_occurences:
        new_col[i] = 0

df_test["ORIGIN_CALL"] = new_col

LE_ORIGIN_CALL = preprocessing.LabelEncoder()
LE_ORIGIN_CALL.fit(df_train["ORIGIN_CALL"])
df_train["ORIGIN_CALL"] = LE_ORIGIN_CALL.transform(df_train["ORIGIN_CALL"])
df_test["ORIGIN_CALL"] = LE_ORIGIN_CALL.transform(df_test["ORIGIN_CALL"])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_col[i] = 0


In [11]:
df_train["ORIGIN_CALL"].value_counts()

0        1373970
2          57571
29025       6406
1           2499
6056        1314
          ...   
28761          2
23433          2
24686          2
656            2
28680          2
Name: ORIGIN_CALL, Length: 29027, dtype: int64

In [12]:
# Encode ORIGIN_STAND
LE_ORIGIN_STAND = preprocessing.LabelEncoder()
LE_ORIGIN_STAND.fit(df_train["ORIGIN_STAND"])
df_train["ORIGIN_STAND"] = LE_ORIGIN_STAND.transform(df_train["ORIGIN_STAND"])
df_test["ORIGIN_STAND"] = LE_ORIGIN_STAND.transform(df_test["ORIGIN_STAND"])

### Datetime

In [13]:
df_train["YEAR"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).year)
df_train["WK_OF_YR"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).isocalendar().week)
df_train["WK_DAY"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).weekday()) # WHY ARE YOU LIKE THIS??? ALL THE OTHERS ARE FIELDS
df_train["HR"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).hour)
# df_train.drop(labels="TIMESTAMP", axis=1, inplace=True)


df_test["YEAR"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).year)
df_test["WK_OF_YR"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).isocalendar().week)
df_test["WK_DAY"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).weekday()) # WHY ARE YOU LIKE THIS??? ALL THE OTHERS ARE FIELDS
df_test["HR"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).hour)
# df_test.drop(labels="TIMESTAMP", axis=1, inplace=True)
# TODO: use timestamps for nearest neighbors coordinate estimation in test set instead of whatever this is
# TODO: drop TIMESTAMP column later after nearest neighbor stuff

### Encode datetime

In [19]:
LE_year = preprocessing.LabelEncoder()
LE_year.fit(df_train["YEAR"])
df_train["YEAR"] = LE_year.transform(df_train["YEAR"])
df_test["YEAR"] = LE_year.transform(df_test["YEAR"])

LE_wk_of_yr = preprocessing.LabelEncoder()
LE_wk_of_yr.fit(df_train["WK_OF_YR"])
df_train["WK_OF_YR"] = LE_wk_of_yr.transform(df_train["WK_OF_YR"])
df_test["WK_OF_YR"] = LE_wk_of_yr.transform(df_test["WK_OF_YR"])

LE_wkday = preprocessing.LabelEncoder()
LE_wkday.fit(df_train["WK_DAY"])
df_train["WK_DAY"] = LE_wkday.transform(df_train["WK_DAY"])
df_test["WK_DAY"] = LE_wkday.transform(df_test["WK_DAY"])

LE_hr = preprocessing.LabelEncoder()
LE_hr.fit(df_train["HR"])
df_train["HR"] = LE_hr.transform(df_train["HR"])
df_test["HR"] = LE_hr.transform(df_test["HR"])

### Create target column

In [14]:
df_train["TARGET"] = df_train["POLYLINE"].apply(lambda x : 15 * max(x.count("[") - 1, 0))
df_train.drop(labels="POLYLINE", axis=1, inplace=True)

In [22]:
df_train.head()

Unnamed: 0,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,START_LONG,START_LAT,END_LONG,END_LAT,A,B,C,YEAR,WK_OF_YR,WK_DAY,HR,TARGET
0,0,63,367,1372636858,-8.618643,41.141412,-8.630838,41.154489,0,0,1,0,25,6,17,345
1,0,6,371,1372637303,-8.639847,41.159826,-8.66574,41.170671,0,1,0,0,25,6,17,285
2,0,63,204,1372636951,-8.612964,41.140359,-8.61597,41.14053,0,0,1,0,25,6,17,975
3,0,63,330,1372636854,-8.574678,41.151951,-8.607996,41.142915,0,0,1,0,25,6,17,645
4,0,63,217,1372637091,-8.645994,41.18049,-8.687268,41.178087,0,0,1,0,25,6,17,435


In [23]:
df_test.head()

Unnamed: 0,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,A,B,C,YEAR,WK_OF_YR,WK_DAY,HR
0,0,14,342,1408039037,0,1,0,1,32,3,10
1,0,56,78,1408038611,0,1,0,1,32,3,10
2,0,14,239,1408038568,0,1,0,1,32,3,10
3,0,52,309,1408039090,0,1,0,1,32,3,10
4,0,17,393,1408039177,0,1,0,1,32,3,10


In [25]:
df_train.to_csv("../data/embed_train.csv", index=False)
df_test.to_csv("../data/embed_test.csv", index=False)

### Continue in `finalize.ipynb`