In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from geopy import distance
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm


In [None]:
# Load the raw training table from the local Kaggle data directory and print
# its row count as the baseline before any rows are filtered out.
raw_train = pd.read_csv("kaggle_data/train.csv")
print(f'Size before removing missing data: {len(raw_train)}')

In [None]:
# Keep only usable trips: rows not flagged MISSING_DATA whose POLYLINE is not
# the empty-list literal '[]'. (No de-duplication is performed here.)
# The explicit .copy() detaches the result from raw_train, so the column
# assignments made on my_dataset afterwards operate on an independent frame
# instead of raising pandas' SettingWithCopyWarning.
my_dataset = raw_train[raw_train['MISSING_DATA'] == False]
my_dataset = my_dataset[my_dataset['POLYLINE'] != '[]'].copy()

# Manually driven progress bar sized to the filtered frame; polylength()
# advances it by one for every row it processes.
progress_bar = tqdm(total=len(my_dataset))
def polylength(polyline, sample_interval_s=15, max_speed_kmh=200):
    """Derive trip-level features from one row's POLYLINE text.

    Intended for ``DataFrame.apply(..., axis=1, result_type="expand")``.
    Each call also advances the module-level ``progress_bar`` by one.

    Parameters
    ----------
    polyline : mapping or pandas.Series
        Row exposing a ``"POLYLINE"`` entry holding a text-encoded list of
        coordinate pairs. Each pair is read as ``[x, y]`` and handed to
        ``distance.geodesic`` as ``(y, x)``, i.e. the pairs are treated as
        ``[longitude, latitude]``.
    sample_interval_s : int, optional
        Seconds assumed between consecutive points (default 15, the value the
        original code hard-coded).
    max_speed_kmh : float, optional
        A trip is flagged irregular when any segment reaches this speed
        (default 200).

    Returns
    -------
    tuple
        ``(travel_time_s, irregular, start_point, end_point, length_km,
        avg_speed_kmh)``. Polylines with fewer than two points have no
        segment to measure and yield the sentinel
        ``(-1.0, True, -1.0, -1.0, -1.0, -1.0)``.
    """
    progress_bar.update(1)

    # Parse the column as data rather than executing it: eval() would run any
    # Python expression embedded in the CSV.
    # NOTE(review): json.loads assumes POLYLINE is valid JSON (nested lists of
    # numbers) — confirm against the data file.
    poly = json.loads(polyline["POLYLINE"])

    # Zero or one point: no segment exists, so speed/length are undefined.
    # (The original only handled exactly one point; an empty list would have
    # crashed in max() below.)
    if len(poly) < 2:
        return -1.0, True, -1.0, -1.0, -1.0, -1.0

    # Geodesic length in km of every consecutive pair of points.
    segment_km = [
        distance.geodesic((start[1], start[0]), (stop[1], stop[0])).km
        for start, stop in zip(poly, poly[1:])
    ]
    # km travelled per sampling interval -> km/h (3600 / 15 == 240).
    speed_kmh = np.array(segment_km, dtype=float) * (3600 / sample_interval_s)

    return (
        (len(poly) - 1) * sample_interval_s,
        speed_kmh.max() >= max_speed_kmh,
        poly[0],
        poly[-1],
        sum(segment_km),
        np.average(speed_kmh),
    )

# Expand polylength's six return values into new columns, one per feature.
my_dataset[["TRAVEL_TIME", "IRREGULAR", "START", "END", "LENGTH", "AVG_SPEED"]] = (
    my_dataset[["POLYLINE"]].apply(polylength, axis=1, result_type="expand")
)
progress_bar.close()

# The raw polyline text and the (now constant) MISSING_DATA flag are no longer
# needed; renumber the rows 0..n-1 without keeping the old index as a column.
my_dataset = (
    my_dataset
    .drop(columns=['POLYLINE', 'MISSING_DATA'])
    .reset_index(drop=True)
)

In [None]:
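# Optional cache write, left disabled: uncomment to save the current my_dataset
# to 'gps_train.csv', the file that the following cell loads.
#my_dataset.to_csv('gps_train.csv', index=False)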

In [None]:
# Load the cached trip features. When the cache file does not exist yet (e.g.
# on a fresh checkout) write the in-memory my_dataset first and read it back,
# so both paths continue with a frame that went through the same CSV round trip.
# NOTE(review): an existing file is trusted as-is; delete it after changing the
# feature-extraction code so it gets regenerated.
try:
    my_dataset = pd.read_csv('gps_train.csv')
except FileNotFoundError:
    my_dataset.to_csv('gps_train.csv', index=False)
    my_dataset = pd.read_csv('gps_train.csv')

In [None]:
# Keep only trips whose IRREGULAR flag is not True, then discard the flag
# column since it carries no information afterwards.
my_dataset = (
    my_dataset
    .loc[my_dataset['IRREGULAR'].ne(True)]
    .drop(columns='IRREGULAR')
)

In [None]:
# Row count of my_dataset at this point, i.e. after trips flagged IRREGULAR
# have also been removed (not only rows with missing data).
print(f'Size after removing missing data: {len(my_dataset)}')

In [None]:
from datetime import datetime, timedelta
import holidays

# Holiday calendar for Portugal; parse_time tests dates for membership in it.
# NOTE(review): subdiv='Ext' is passed straight through to the `holidays`
# package — confirm the installed version still accepts this subdivision code.
portugal_holidays = holidays.country_holidays('PT', subdiv='Ext')

def parse_time(x, tz=None):
    """Turn a row's Unix TIMESTAMP into calendar features.

    Intended for ``DataFrame.apply(..., axis=1, result_type="expand")``.

    Parameters
    ----------
    x : mapping or pandas.Series
        Row exposing a ``"TIMESTAMP"`` entry (seconds since the Unix epoch).
    tz : datetime.tzinfo, optional
        Timezone in which to interpret the timestamp. The default ``None``
        keeps the original behaviour: the machine's local timezone is used.
        NOTE(review): with ``tz=None`` the hour/day features differ between
        machines in different timezones; pass an explicit zone (for example
        ``zoneinfo.ZoneInfo("Europe/Lisbon")``) for reproducible output —
        confirm which zone existing processed files were generated with.

    Returns
    -------
    tuple
        ``(quarter_hour, weekday, week_of_year, holiday)`` where
        ``quarter_hour`` is ``hour*4 + minute//15`` (0-95), ``weekday`` is
        ``datetime.weekday()`` (Monday == 0), ``week_of_year`` is the ISO week
        number minus one, and ``holiday`` is 2 when the date is in
        ``portugal_holidays``, 1 when the following day is, else 0.
    """
    dt = datetime.fromtimestamp(x["TIMESTAMP"], tz)
    day = dt.date()

    if day in portugal_holidays:
        holiday = 2
    elif day + timedelta(days=1) in portugal_holidays:
        # The day before a holiday gets its own code.
        holiday = 1
    else:
        holiday = 0

    return (dt.hour*4 + dt.minute//15), dt.weekday(), day.isocalendar().week - 1, holiday

# Expand parse_time's four return values into the calendar feature columns.
my_dataset[["QTRHR", "WK", "WKYR", "HOLIDAY"]] = my_dataset[["TIMESTAMP"]].apply(
    parse_time, axis=1, result_type="expand"
)

# Drop the columns that are no longer needed and use 0 as the placeholder for
# a missing ORIGIN_CALL / ORIGIN_STAND.
my_dataset = (
    my_dataset
    .drop(columns=['DAY_TYPE', 'TIMESTAMP'])
    .assign(
        ORIGIN_CALL=lambda frame: frame['ORIGIN_CALL'].fillna(0),
        ORIGIN_STAND=lambda frame: frame['ORIGIN_STAND'].fillna(0),
    )
)

In [None]:
# Preview the training frame after the calendar features were added and the
# missing ORIGIN_CALL / ORIGIN_STAND values were filled with 0.
my_dataset

In [None]:
# One shared LabelEncoder instance. It is re-fitted before every column it
# transforms, so it only ever holds the vocabulary of the most recent fit.
label_encoder = LabelEncoder()

In [None]:
# Vocabulary of caller ids seen in training. 0 is the placeholder written by
# fillna(0) for a missing ORIGIN_CALL and is also the value unseen test ids are
# mapped to before encoding, so it is always included — even if the training
# data happened to contain no missing callers (mirrors how TI is built).
OC = sorted(set(my_dataset['ORIGIN_CALL'].astype(int).unique()) | {0})
label_encoder.fit(OC)
my_dataset['ORIGIN_CALL'] = label_encoder.transform(my_dataset['ORIGIN_CALL'].astype(int))

In [None]:
# ORIGIN_STAND is encoded against the fixed integer vocabulary 0..63 rather
# than the values observed in the data (0 being the fillna placeholder).
my_dataset['ORIGIN_STAND'] = (
    label_encoder
    .fit(np.arange(64))
    .transform(my_dataset['ORIGIN_STAND'].astype(int))
)

In [None]:
# Vocabulary of taxi ids seen in training, with 0 placed in front as the
# reserved code that unknown ids are mapped to when the test set is encoded.
TI = [0] + sorted(my_dataset['TAXI_ID'].astype(int).unique())

my_dataset['TAXI_ID'] = (
    label_encoder
    .fit(TI)
    .transform(my_dataset['TAXI_ID'].astype(int))
)

In [None]:
# Preview the training frame after ORIGIN_CALL, ORIGIN_STAND and TAXI_ID were
# label-encoded.
my_dataset

In [None]:
# Persist the processed training frame; index=False omits the row index.
my_dataset.to_csv('processed_train.csv', index=False)

In [None]:
# Load the test table from the local Kaggle data directory.
test_df = pd.read_csv('kaggle_data/test_public.csv')

In [None]:
# Same calendar features as the training frame, computed by parse_time.
test_df[["QTRHR", "WK", "WKYR", "HOLIDAY"]] = test_df[["TIMESTAMP"]].apply(
    parse_time, axis=1, result_type="expand"
)

# Drop the columns that are not used as features and use 0 as the placeholder
# for a missing ORIGIN_CALL / ORIGIN_STAND.
test_df = (
    test_df
    .drop(columns=['MISSING_DATA', 'DAY_TYPE', 'TIMESTAMP'])
    .assign(
        ORIGIN_CALL=lambda frame: frame['ORIGIN_CALL'].fillna(0),
        ORIGIN_STAND=lambda frame: frame['ORIGIN_STAND'].fillna(0),
    )
)

In [None]:
# Preview the test frame after the calendar features were added and the
# missing ORIGIN_CALL / ORIGIN_STAND values were filled with 0.
test_df

In [None]:
# Re-fit on the training caller vocabulary, replace every caller id that is
# not part of it with 0, then encode.
label_encoder.fit(OC)
test_df['ORIGIN_CALL'] = label_encoder.transform(
    test_df['ORIGIN_CALL']
    .where(test_df['ORIGIN_CALL'].isin(label_encoder.classes_), 0)
    .astype(int)
)

In [None]:
# ORIGIN_STAND uses the same fixed integer vocabulary 0..63 as the training
# frame.
test_df['ORIGIN_STAND'] = (
    label_encoder
    .fit(np.arange(64))
    .transform(test_df['ORIGIN_STAND'].astype(int))
)

In [None]:
# Re-fit on the training taxi vocabulary, replace every taxi id that is not
# part of it with the reserved value 0, then encode.
label_encoder.fit(TI)
test_df['TAXI_ID'] = label_encoder.transform(
    test_df['TAXI_ID']
    .where(test_df['TAXI_ID'].isin(label_encoder.classes_), 0)
    .astype(int)
)

In [None]:
# Preview the test frame after ORIGIN_CALL, ORIGIN_STAND and TAXI_ID were
# label-encoded.
test_df

In [None]:
# Persist the processed test frame; index=False omits the row index.
test_df.to_csv('processed_test.csv', index=False)