## Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from datetime import datetime


## Load `train.csv`

In [57]:
df_train: pd.DataFrame = pd.read_csv("data/train.csv")

df_test: pd.DataFrame = pd.read_csv("data/test_public.csv")

In [62]:
df_train.head()
df_train["TIMESTAMP"] == None

0          False
1          False
2          False
3          False
4          False
           ...  
1710665    False
1710666    False
1710667    False
1710668    False
1710669    False
Name: TIMESTAMP, Length: 1710670, dtype: bool

### Add coordinates

In [5]:
# start_coords = np.load("./data_analysis/start_full.npy")
# start_test = np.load("./data_analysis/start_test.npy")
# df_train["LONG"] = start_coords[:, 0]
# df_train["LAT"] = start_coords[:, 1]
#
# df_test["LONG"] = start_test[:, 0]
# df_test["LAT"] = start_test[:, 1]



## Delete rows where `MISSING_DATA` is true and delete column `MISSING_DATA`

In [63]:
df_train.drop(df_train[df_train["MISSING_DATA"] == True].index, inplace=True)
df_train.drop(labels="MISSING_DATA", axis=1, inplace=True)

df_test.drop(df_test[df_test["MISSING_DATA"] == True].index, inplace=True)
df_test.drop(labels="MISSING_DATA", axis=1, inplace=True)

## Delete column `DAY_TYPE`

In [64]:
df_train.drop(labels="DAY_TYPE", axis=1, inplace=True)

df_test.drop(labels="DAY_TYPE", axis=1, inplace=True)

## Delete column `TRIP_ID`

In [65]:
df_train.drop(labels="TRIP_ID", axis=1, inplace=True)

df_test.drop(labels="TRIP_ID", axis=1, inplace=True)

## Set `ORIGIN_STAND` and `ORIGIN_CALL` to 0 for entries with null `ORIGIN_STAND`

In [66]:
# df_train.at[pd.isna(df_train["ORIGIN_CALL"]) == True, "ORIGIN_CALL"] = 0
# df_train.at[pd.isna(df_train["ORIGIN_STAND"]) == True, "ORIGIN_STAND"] = 0
#
# df_test.at[pd.isna(df_test["ORIGIN_CALL"]) == True, "ORIGIN_CALL"] = 0
# df_test.at[pd.isna(df_test["ORIGIN_STAND"]) == True, "ORIGIN_STAND"] = 0

df_train.drop(labels="ORIGIN_CALL", axis=1, inplace=True)
df_test.drop(labels="ORIGIN_CALL", axis=1, inplace=True)

df_train.drop(labels="ORIGIN_STAND", axis=1, inplace=True)
df_test.drop(labels="ORIGIN_STAND", axis=1, inplace=True)

### Convert ORIGIN_CALL to indices for embeddings

In [10]:
# NOTE: EMBEDDING INFO STUFF
# TODO: Convert ORIGIN_CALL to embeddings, impute missing values
# Look for caller's with 1 total taxi ride, set them all to some default index (Training set)
# Look for callers in test set not in training set -> set them to the default index
# Set nn.embeddings pad_idx to that default idx
# val_cnt = df_train["ORIGIN_CALL"].value_counts().to_dict()
# df_train["ORIGIN_CALL"] = df_train["ORIGIN_CALL"].map(lambda x : x  if val_cnt[x] > 1 else 0 )

In [11]:
df_train["ORIGIN_CALL"].value_counts()

0.0        1373970
2002.0       57571
63882.0       6406
2001.0        2499
13168.0       1314
            ...   
63189.0          2
50313.0          2
53254.0          2
3675.0           2
62998.0          2
Name: ORIGIN_CALL, Length: 29027, dtype: int64

In [12]:
# multi_occurrences = set(df_train["ORIGIN_CALL"].unique())
# new_col = df_test["ORIGIN_CALL"]
# for i, x in enumerate(new_col):
#     if x not in multi_occurrences:
#         new_col[i] = 0
# df_test["ORIGIN_CALL"] = new_col

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_col[i] = 0


In [13]:
# LE_ORIGIN_CALL = LabelEncoder()
# LE_ORIGIN_CALL.fit(df_train["ORIGIN_CALL"])
# df_train["ORIGIN_CALL"] = LE_ORIGIN_CALL.transform(df_train["ORIGIN_CALL"])
#
# LE_ORIGIN_STAND = LabelEncoder()
# LE_ORIGIN_STAND.fit(df_train["ORIGIN_STAND"])
# df_train["ORIGIN_STAND"] = LE_ORIGIN_STAND.transform(df_train["ORIGIN_STAND"])
#
#
# df_test["ORIGIN_CALL"] = LE_ORIGIN_CALL.transform(df_test["ORIGIN_CALL"])
# df_test["ORIGIN_STAND"] = LE_ORIGIN_STAND.transform(df_test["ORIGIN_STAND"])

## One-hot encoding for `CALL_TYPE`

In [89]:
# one-hot encoding
df_train = df_train.join(pd.get_dummies(df_train["CALL_TYPE"]))
df_train.drop(labels="CALL_TYPE", axis=1, inplace=True)

df_test = df_test.join(pd.get_dummies(df_test["CALL_TYPE"]))
df_test.drop(labels="CALL_TYPE", axis=1, inplace=True)

## Assign indices to TAXI_ID for embeddings

In [15]:
# TODO: set taxi id to indices
LE_TAXI_ID = LabelEncoder()
LE_TAXI_ID.fit(df_train["TAXI_ID"])
df_train["TAXI_ID"] = LE_TAXI_ID.transform(df_train["TAXI_ID"])

df_test["TAXI_ID"] = LE_TAXI_ID.transform(df_test["TAXI_ID"])

In [67]:
df_train.drop(labels="TAXI_ID", axis=1, inplace=True)
df_test.drop(labels="TAXI_ID", axis=1, inplace=True)

In [68]:
df_train.head()

Unnamed: 0,CALL_TYPE,TIMESTAMP,POLYLINE
0,C,1372636858,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,B,1372637303,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,C,1372636951,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,C,1372636854,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,C,1372637091,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [73]:
len(df_train["POLYLINE"])

1710660

In [17]:
df_test.head()

Unnamed: 0,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,LONG,LAT,A,B,C
0,0,15,342,1408039037,-8.585876,41.148628,0,1,0
1,0,57,78,1408038611,-8.610707,41.145718,0,1,0
2,0,15,239,1408038568,-8.585876,41.148628,0,1,0
3,0,53,309,1408039090,-8.614013,41.141209,0,1,0
4,0,18,393,1408039177,-8.619603,41.148319,0,1,0


## Convert `TIMESTAMP` column to quarter hours, week of year, etc.

In [74]:
df_train["YEAR"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).year)
df_train["MONTH"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).month)
df_train["DAY"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).day)
df_train["WEEKDAY"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).weekday()) # WHY ARE YOU LIKE THIS??? ALL THE OTHERS ARE FIELDS
df_train["HR"] = df_train["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).hour)
df_train.drop(labels="TIMESTAMP", axis=1, inplace=True)

df_test["YEAR"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).year)
df_test["MONTH"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).month)
df_test["DAY"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).day)
df_test["WEEKDAY"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).weekday())
df_test["HR"] = df_test["TIMESTAMP"].apply(lambda x: datetime.fromtimestamp(x).hour)
df_test.drop(labels="TIMESTAMP", axis=1, inplace=True)

In [None]:
LE_WK_OF_YR = LabelEncoder()
LE_WK_OF_YR.fit(df_train["WK_OF_YR"])
df_train["WK_OF_YR"] = LE_WK_OF_YR.transform(df_train["WK_OF_YR"])

LE_WK_DAY = LabelEncoder()
LE_WK_DAY.fit(df_train["WK_DAY"])
df_train["WK_DAY"] = LE_WK_DAY.transform(df_train["WK_DAY"])

LE_QTR_HR = LabelEncoder()
LE_QTR_HR.fit(df_train["QTR_HR"])
df_train["QTR_HR"] = LE_QTR_HR.transform(df_train["QTR_HR"])

In [90]:
df_test.head()

Unnamed: 0,YEAR_2013,YEAR_2014,MONTH_1,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,...,HR_17,HR_18,HR_19,HR_20,HR_21,HR_22,HR_23,A,B,C
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0


In [37]:
encoder = OneHotEncoder()
encoder.fit(df_train[["YEAR", "MONTH", "DAY", "WEEKDAY", "HR"]])

In [76]:
transformed = encoder.transform(df_train[["YEAR", "MONTH", "DAY", "WEEKDAY", "HR"]].to_numpy())
#Create a Pandas DataFrame of the hot encoded column
ohe_df = pd.DataFrame.sparse.from_spmatrix(transformed)
ohe_df.columns = encoder.get_feature_names_out()
#concat with original data
df_train = pd.concat([df_train, ohe_df], axis=1).drop(["YEAR", "MONTH", "DAY", "WEEKDAY", "HR"], axis=1)

transformed = encoder.transform(df_test[["YEAR", "MONTH", "DAY", "WEEKDAY", "HR"]].to_numpy())
#Create a Pandas DataFrame of the hot encoded column
ohe_df = pd.DataFrame.sparse.from_spmatrix(transformed)
ohe_df.columns = encoder.get_feature_names_out()
#concat with original data
df_test = pd.concat([df_test, ohe_df], axis=1).drop(["YEAR", "MONTH", "DAY", "WEEKDAY", "HR"], axis=1)



## Convert `POLYLINE` column to `TARGET` column.

In [88]:
df_train.head()

Unnamed: 0,CALL_TYPE,POLYLINE,YEAR_2013,YEAR_2014,MONTH_1,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,...,HR_14,HR_15,HR_16,HR_17,HR_18,HR_19,HR_20,HR_21,HR_22,HR_23
0,C,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,B,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,C,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,C,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,C,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
df_train.dropna(inplace=True)

In [91]:
df_train["TARGET"] = df_train["POLYLINE"].apply(lambda element : 15 * max(element.count("[") - 1, 0))
df_train.drop(labels="POLYLINE", axis=1, inplace=True)

# wtf am i doing
# df_test["TARGET"] = df_test["POLYLINE"].apply(lambda x : 15 * max(x.count("[") - 1, 0))
# df_test.drop(labels="POLYLINE", axis=1, inplace=True)

## Get information for embeddings later

In [None]:
# print(f"CALL_TYPE: 3")
# print(f"ORIGIN_CALL: {len(df_train['ORIGIN_CALL'].unique())}")
# print(f"ORIGIN_STAND: {len(df_train['ORIGIN_STAND'].unique())}")
# print(f"TAXI_ID: {len(df_train['TAXI_ID'].unique())}")
# print(f"Unique week #: {len(df_train['WK_OF_YR'].unique())}")
# print(f"Unique qtr hrs #: {len(df_train['QTR_HR'].unique())}")

In [20]:
df_train["ORIGIN_CALL"].unique()

array([    0, 15138, 15935, ..., 20172,  2514, 28680], dtype=int64)

## Export

In [93]:
df_train.head()

Unnamed: 0,YEAR_2013,YEAR_2014,MONTH_1,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,...,HR_18,HR_19,HR_20,HR_21,HR_22,HR_23,A,B,C,TARGET
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,345
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,285
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,975
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,645
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,435


### Remove outliers

In [94]:
df_train = df_train[df_train["TARGET"] > 30]
df_train = df_train[df_train["TARGET"] < 20000]

### [0, 1] Normalization

In [24]:
# from sklearn.preprocessing import normalize
# df_train["YEAR"] -= 2013
# df_test["YEAR"] -= 2013
# # df_train["YEAR"] = normalize(df_train["YEAR"])

# # TODO: do in ensemble.ipynb

#### Split training set

In [95]:
df_train = df_train.sample(frac=1).reset_index(drop=True)
df_valid = df_train[int(0.9 * len(df_train)):]
df_train = df_train[:int(0.9 * len(df_train))]
df_train.to_csv("data/simple_train.csv", index=False)
df_valid.to_csv("data/simple_valid.csv", index=False)
df_test.to_csv("data/simple_test.csv", index=False)
