### Weili Cao Kaggle

In [None]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import time
from tensorflow.keras.callbacks import Callback

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder


In [None]:
# Load the raw training data. Later cells read its POLYLINE, TIMESTAMP,
# MISSING_DATA, ORIGIN_CALL and ORIGIN_STAND columns (among others).
df_tr = pd.read_csv("train.csv")

# Derive a trip's duration from its raw POLYLINE string.
def polyline_to_trip_duration(polyline):
    """Return the trip duration implied by a POLYLINE string.

    The number of "[" characters is counted; presumably that is one for the
    outer list plus one per GPS point (assumes a "[[lon,lat],...]" layout --
    TODO confirm against the data description). Subtracting 2 leaves the
    number of intervals between consecutive points, floored at 0 so that
    empty and single-point polylines yield 0. Each interval is weighted by
    15 (presumably the sampling period in seconds).
    """
    open_brackets = polyline.count("[")
    intervals = open_brackets - 2
    if intervals < 0:
        intervals = 0
    return intervals * 15

# Compute the regression target (trip length) once per row. The original ran
# this exact full-column apply twice back to back, doubling the cost for an
# identical result.
df_tr["LEN"] = df_tr["POLYLINE"].apply(polyline_to_trip_duration)

from datetime import datetime
def parse_time(x):
    """Split a row's Unix timestamp into calendar features.

    Args:
        x: Row-like object holding epoch seconds under the "TIMESTAMP" key
            (a one-element pandas Series when called through
            DataFrame.apply(axis=1) on a single-column frame).

    Returns:
        A (year, month, day, hour, weekday) tuple; weekday follows
        datetime.weekday(), i.e. Monday == 0.
    """
    # Uses Python's builtin datetime library:
    # https://docs.python.org/3/library/datetime.html#datetime.datetime.fromtimestamp
    # NOTE(review): fromtimestamp() converts to the machine's local timezone,
    # so the derived features can differ between machines -- confirm intended.
    moment = datetime.fromtimestamp(x["TIMESTAMP"])
    calendar_parts = (moment.year, moment.month, moment.day, moment.hour)
    return calendar_parts + (moment.weekday(),)

# parse_time returns a (year, month, day, hour, weekday) tuple per row. Applying
# it row-wise (axis=1) with result_type="expand" spreads each tuple across
# columns, so the five values can be assigned to five new columns at once.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html
df_tr[["YR", "MON", "DAY", "HR", "WK"]] = df_tr[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

def mean_absolute_error(y_true, y_pred):
    """Return the mean absolute difference between predictions and targets.

    Args:
        y_true: Ground-truth values; cast to float32 before the subtraction
            so that non-float targets can be combined with the predictions.
        y_pred: Predicted values (assumed to already be float32 -- TODO
            confirm if mixed precision is ever enabled).

    Returns:
        The mean of |y_pred - y_true| taken over all elements.
    """
    targets = tf.cast(y_true, tf.float32)
    abs_errors = tf.abs(y_pred - targets)
    return tf.reduce_mean(abs_errors)

class EpochEndCallback(Callback):
    """Keras callback that prints each epoch's loss and wall-clock duration.

    Useful together with ``verbose=0`` in ``model.fit`` to get one compact
    line per epoch instead of the default progress bar.
    """

    def on_epoch_begin(self, epoch, logs=None):
        """Remember when the epoch started so its duration can be reported."""
        self.start_time = time.time()

    def on_epoch_end(self, epoch, logs=None):
        """Print the 1-based epoch number, its loss and its elapsed time.

        Args:
            epoch: Zero-based epoch index supplied by Keras.
            logs: Optional dict of metric values; may be None or lack 'loss'.
        """
        # Guard against on_epoch_end being invoked without a matching
        # on_epoch_begin (start_time would not exist yet).
        started = getattr(self, 'start_time', None)
        elapsed_time = 0.0 if started is None else time.time() - started
        # `logs` defaults to None, and 'loss' may be absent; formatting None
        # with ':.4f' would raise, so fall back to a placeholder instead.
        loss = (logs or {}).get('loss')
        loss_text = 'n/a' if loss is None else f'{loss:.4f}'
        print(f'Epoch {epoch + 1}: loss = {loss_text}, time = {elapsed_time:.2f} seconds')
        
def root_mean_squared_error(y_true, y_pred):
    """Return the root of the mean squared difference between predictions and targets.

    Args:
        y_true: Ground-truth values; cast to float32 before the subtraction
            so that non-float targets can be combined with the predictions.
        y_pred: Predicted values (assumed to already be float32 -- TODO
            confirm if mixed precision is ever enabled).

    Returns:
        sqrt(mean((y_pred - y_true) ** 2)) taken over all elements.
    """
    # NOTE(review): when used as a Keras loss/metric this is evaluated per
    # batch, so the reported value is presumably an aggregate of per-batch
    # RMSEs rather than the RMSE of the whole dataset -- confirm. The sqrt
    # also has an unbounded gradient at exactly zero error.
    targets = tf.cast(y_true, tf.float32)
    squared_errors = tf.square(y_pred - targets)
    mean_squared = tf.reduce_mean(squared_errors)
    return tf.sqrt(mean_squared)


In [None]:
# --- Feature preparation ---------------------------------------------------
# Work on a copy so the raw training frame stays untouched.
df = df_tr.copy()

# Fill missing origin ids with 0. The result is assigned back rather than
# calling fillna(inplace=True) on a column selection: that form is chained
# assignment, which warns in pandas 2.x and has no effect under Copy-on-Write.
df['ORIGIN_CALL'] = df['ORIGIN_CALL'].fillna(0)
df['ORIGIN_STAND'] = df['ORIGIN_STAND'].fillna(0)

# Drop rows flagged as having missing data. The explicit .copy() makes the
# filtered frame independent, so the column updates below do not trigger
# SettingWithCopyWarning.
df = df[df['MISSING_DATA'] != True].copy()

# Treat the calendar fields and origin ids as categorical by turning them
# into strings before one-hot encoding.
df = df.astype({
    name: str
    for name in ('YR', 'MON', 'DAY', 'HR', 'WK', 'ORIGIN_CALL', 'ORIGIN_STAND')
})

# One-hot encode every categorical feature. handle_unknown='ignore' encodes a
# category that was not seen during fit as all zeros instead of raising, which
# can otherwise happen when a rare value only lands in the hold-out split.
string_features = ['CALL_TYPE', 'ORIGIN_STAND', 'DAY_TYPE', 'YR', 'MON', 'DAY', 'HR', 'WK']
preprocessor = ColumnTransformer(
    transformers=[
        ('string', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), string_features)
    ])

# Features are everything except identifiers, the raw timestamp/polyline and
# the target itself; the target is the trip length LEN.
X = df.drop(columns=['TRIP_ID', 'ORIGIN_CALL', 'TAXI_ID', 'TIMESTAMP', 'MISSING_DATA', 'POLYLINE', 'LEN'])
y = df['LEN']

# Hold out 10% of the rows for evaluation. Fit the encoder on the training
# part only and reuse it for the hold-out part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# --- Model -----------------------------------------------------------------
# Small fully connected regressor: 32 -> 16 -> 16 -> 1 (linear output).
model = Sequential()
model.add(Dense(32, activation='relu', input_shape=(X_train_preprocessed.shape[1],)))
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1))
# RMSE is used both as the training loss and as the reported metric.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
              loss=root_mean_squared_error, metrics=[root_mean_squared_error])

In [None]:
# Train for 5 epochs. Keras' own progress output is silenced (verbose=0);
# the callback prints one line per epoch with the loss and elapsed time.
epoch_end_callback = EpochEndCallback()
model.fit(X_train_preprocessed, y_train, epochs=5, batch_size=32, verbose=0, callbacks=[epoch_end_callback])

In [None]:
# Score the model on the held-out split. evaluate() returns the loss followed
# by the compiled metric; both were set to root_mean_squared_error at compile
# time, so both printed values are RMSE figures (they may differ slightly
# depending on how Keras aggregates over batches -- TODO confirm).
predictions = model.predict(X_test_preprocessed)
test_loss, test_rmse = model.evaluate(X_test_preprocessed, y_test)
print(f"Test loss: {test_loss}, Test RMSE: {test_rmse}")

In [None]:
# --- Build the Kaggle submission -------------------------------------------
# Load the public test set and derive the same calendar features as for the
# training data.
df_test = pd.read_csv("test_public.csv")
df_test[["YR", "MON", "DAY", "HR", "WK"]] = df_test[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

# Fill missing origin ids with 0. Assign back instead of using
# fillna(inplace=True) on a column selection, which is chained assignment and
# has no effect under pandas Copy-on-Write.
df_test['ORIGIN_CALL'] = df_test['ORIGIN_CALL'].fillna(0)
df_test['ORIGIN_STAND'] = df_test['ORIGIN_STAND'].fillna(0)

# Rows are deliberately NOT filtered on MISSING_DATA here: every test row
# needs a prediction, and dropping any would make `predictions` shorter than
# the submission frame and break the assignment below.

# Cast to str exactly as the training frame was. The calendar columns must be
# strings too, otherwise the encoder (fitted on '2013', '7', ...) sees integers
# and treats every value as an unknown category.
df_test = df_test.astype({
    name: str
    for name in ('YR', 'MON', 'DAY', 'HR', 'WK', 'ORIGIN_CALL', 'ORIGIN_STAND')
})

# Refit the encoder on the training features so its output columns match the
# input layout the model was trained on. handle_unknown='ignore' encodes a
# category unseen in training as all zeros instead of aborting the run.
string_features = ['CALL_TYPE', 'ORIGIN_STAND', 'DAY_TYPE', 'YR', 'MON', 'DAY', 'HR', 'WK']
preprocessor = ColumnTransformer(
    transformers=[
        ('string', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), string_features)
    ])
X_test = df_test.drop(columns=['TRIP_ID', 'ORIGIN_CALL', 'TAXI_ID', 'TIMESTAMP', 'MISSING_DATA'])
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)
predictions = model.predict(X_test_preprocessed)

# Sample submission file that is given on kaggle
# (assumes its rows are in the same order as test_public.csv -- TODO confirm)
df_sample = pd.read_csv("sampleSubmission.csv")

# model.predict returns an (n, 1) array; flatten it to one value per row.
df_sample["TRAVEL_TIME"] = predictions.ravel()

# Reference scores noted by the author (presumably constant-prediction
# baselines -> leaderboard score):
# mean(716.43) -> 792.73593
# median(600) -> 784.74219
df_sample.to_csv("my_pred.csv", index=False)