In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

from keras.models import Model, Sequential
from keras.layers import Input, Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras import regularizers
from keras.models import model_from_json

In [None]:
# The minimum fare_amount is below zero and the minimum passenger_count is zero; rows with non-positive values in either column are discarded.
def normalize_fare_passenger(df):
    """Keep only rows with a strictly positive fare and passenger count.

    The fare filter runs only when a ``fare_amount`` column is present;
    the passenger filter always runs. The row count is printed after each
    stage.

    Returns:
        A filtered frame produced by boolean indexing; the frame passed in
        is not modified.
    """
    has_fare_column = 'fare_amount' in df.columns
    if has_fare_column:
        print("old lenght: %d" % len(df))
        positive_fare = df['fare_amount'] > 0
        df = df[positive_fare]
    print("length after fare_amount normalization: %d" % len(df))

    has_passengers = df['passenger_count'] > 0
    df = df[has_passengers]
    print("length after passenger_count normalization: %d" % len(df))
    return df

In [None]:
def convert_to_datetime(df):
    """Add a ``date_time`` column parsed from ``pickup_datetime``.

    Every ``pickup_datetime`` value is cast to ``str``, its last four
    characters are removed, and the remainder is parsed with the fixed
    format ``%Y%m%d %H:%M:%S``. The new column is written onto ``df``
    itself, and that same frame is returned.
    """
    # NOTE(review): the four-character strip presumably removes a trailing
    # " UTC" marker, and the format string has no date separators -- confirm
    # both against the raw data.
    as_text = df['pickup_datetime'].astype(str)
    without_suffix = as_text.str[:-4]
    parsed = pd.to_datetime(without_suffix, format='%Y%m%d %H:%M:%S')
    df['date_time'] = parsed
    return df

In [None]:
def extract_date(data):
    """Split ``date_time`` into calendar feature columns.

    Adds ``hour``, ``day``, ``month``, ``year`` and ``weekday`` columns (in
    that order) to ``data``, then returns a copy of the frame without the
    ``date_time`` and ``pickup_datetime`` columns. The frame passed in keeps
    the added columns as well as the two source columns.
    """
    stamps = data['date_time'].dt
    for part in ('hour', 'day', 'month', 'year', 'weekday'):
        data[part] = getattr(stamps, part)
    return data.drop(columns=['date_time', 'pickup_datetime'])

In [None]:
# Define haversine distance
def dist(pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, radius=6367):
    """Haversine great-circle distance between two points given in degrees.

    Inputs may be scalars, NumPy arrays or pandas Series; they are combined
    element-wise, so a scalar reference point can be paired with a whole
    column of coordinates.

    Args:
        pickup_longitude, pickup_latitude: first point, in degrees.
        dropoff_longitude, dropoff_latitude: second point, in degrees.
        radius: sphere radius the central angle is scaled by. The default,
            6367, is the value this function always used (presumably
            kilometres).

    Returns:
        The distance in the same unit as ``radius``, with the shape/type
        produced by broadcasting the inputs.
    """
    lon1, lat1, lon2, lat2 = map(
        np.radians,
        [pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude],
    )
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Floating-point rounding can push sqrt(a) marginally above 1 for
    # (near-)antipodal points, which would make arcsin return NaN. Only the
    # upper side is capped, so invalid inputs that already produced NaN
    # still do.
    half_chord = np.minimum(np.sqrt(a), 1.0)
    central_angle = 2 * np.arcsin(half_chord)
    return radius * central_angle

In [None]:
# Distances to nearby city center, airports, and other ny counties
def transform(data):
    """Add distance-to-landmark features and the trip distance to ``data``.

    For every reference point below, two columns are written using ``dist``:
    ``pickup_distance_to_<name>`` and ``dropoff_distance_to_<name>``. A final
    ``distance`` column holds ``dist`` between the pickup and dropoff points.
    Reporting distances to several fixed points lets a model roughly
    triangulate other locations of interest.

    The columns are written onto ``data`` itself, which is also returned.
    """
    # (longitude, latitude) of each reference point. The list order is the
    # order in which the columns are created.
    reference_points = [
        # city center
        ('center', (-74.0060, 40.7128)),
        # counties
        ('Nassau', (-73.5594, 40.6546)),
        ('Suffolk', (-72.6151, 40.9849)),
        ('Westchester', (-73.7949, 41.1220)),
        ('Rockland', (-73.9830, 41.1489)),
        ('Dutchess', (-73.7478, 41.7784)),
        ('Orange', (-74.3118, 41.3912)),
        ('Putnam', (-73.7949, 41.4351)),
        # airports
        ('jfk', (-73.7781, 40.6413)),
        ('ewr', (-74.1745, 40.6895)),
        ('lgr', (-73.8740, 40.7769)),
    ]

    for name, (ref_lon, ref_lat) in reference_points:
        for end in ('pickup', 'dropoff'):
            # The reference point is passed first, the trip end point second.
            data['%s_distance_to_%s' % (end, name)] = dist(
                ref_lon, ref_lat,
                data[end + '_longitude'], data[end + '_latitude'])

    # point-to-point distance of the trip itself
    data['distance'] = dist(data['pickup_longitude'], data['pickup_latitude'],
                            data['dropoff_longitude'], data['dropoff_latitude'])

    return data

In [None]:
def final_convert(df):
    """Add binary (0/1) surcharge-indicator columns derived from fare rules.

    Reads the ``*_distance_to_*`` columns together with ``hour`` and
    ``weekday`` and writes, in this order: ``county_dropoff_1``,
    ``county_dropoff_2``, ``night_hour``, ``peak_hour``, ``to_from_jfk``,
    ``jfk_rush_hour`` and ``ewr``. The columns are written onto ``df``
    itself, which is also returned.

    ``hour`` is treated as the integer hour of day, so a window that ends at
    8pm covers hours 16..19 and a window that ends at 6am covers hours up to
    and including 5.
    """
    hour = df['hour']
    pickup_near_center = df['pickup_distance_to_center'] <= 5
    dropoff_near_center = df['dropoff_distance_to_center'] <= 5

    # There is a 50-cent MTA State Surcharge for all trips that end in New York City or
    # Nassau, Suffolk, Westchester, Rockland, Dutchess, Orange or Putnam Counties.
    # The following two variables can be merged into one.
    # Only trips that start near the city center and end in a nearby county are
    # considered; the opposite direction could also be considered.
    df['county_dropoff_1'] = np.where(pickup_near_center &
                                      ((df['dropoff_distance_to_Nassau'] <= 21.3) |
                                       (df['dropoff_distance_to_Westchester'] <= 22.4)), 1, 0)

    df['county_dropoff_2'] = np.where(pickup_near_center &
                                      ((df['dropoff_distance_to_Suffolk'] <= 48.7) |
                                       (df['dropoff_distance_to_Rockland'] <= 14.1) |
                                       (df['dropoff_distance_to_Dutchess'] <= 28.7) |
                                       (df['dropoff_distance_to_Orange'] <= 29) |
                                       (df['dropoff_distance_to_Putnam'] <= 15.7)), 1, 0)

    # There is a daily 50-cent surcharge from 8pm to 6am.
    # The window ends AT 6am, so hour 6 (06:00-06:59) is excluded.
    df['night_hour'] = np.where((hour >= 20) | (hour < 6), 1, 0)

    # There is a $1 surcharge from 4pm to 8pm on weekdays, excluding holidays.
    # The window ends AT 8pm, so hour 20 belongs to the night window, not this one.
    rush_window = (hour >= 16) & (hour < 20)
    is_weekday = (df['weekday'] >= 0) & (df['weekday'] <= 4)
    df['peak_hour'] = np.where(rush_window & is_weekday, 1, 0)

    # There is a flat fare of $52 plus tolls, the 50-cent MTA State Surcharge and the
    # 30-cent Improvement Surcharge, to/from JFK and any location in Manhattan.
    df['to_from_jfk'] = np.where(((df['pickup_distance_to_jfk'] <= 2) & dropoff_near_center) |
                                 (pickup_near_center & (df['dropoff_distance_to_jfk'] <= 2)), 1, 0)

    # There is a $4.50 rush hour surcharge (4 PM to 8 PM weekdays, excluding legal
    # holidays) to/from JFK and any location in Manhattan. The weekday condition is
    # applied here too, so this flag matches the rule and the peak_hour window.
    df['jfk_rush_hour'] = np.where((df['to_from_jfk'] == 1) & rush_window & is_weekday, 1, 0)

    # There is a $17.50 Newark Surcharge to Newark Airport.
    df['ewr'] = np.where(pickup_near_center &
                         (df['dropoff_distance_to_ewr'] <= 1), 1, 0)

    return df

In [None]:
# Half-width of the accepted band around the column mean, in standard deviations.
STD_COEFFICIENT = 3
def outlier_analysis(df):
    """Drop rows lying outside mean +/- STD_COEFFICIENT * std, column by column.

    Columns are processed in frame order, and each column's statistics are
    computed on the rows that survived the previous columns. Both bounds are
    exclusive. If a column would remove every row, the frame is left as it
    is for that column and a message is printed instead. Progress and the
    current row count are printed after every column.

    Returns:
        The filtered frame.
    """
    for column in df.columns:
        values = df[column]
        spread = values.std()
        center = values.mean()
        lower_bound = center - STD_COEFFICIENT*spread
        upper_bound = center + STD_COEFFICIENT*spread
        kept = df[(values > lower_bound) & (values < upper_bound)]
        if len(kept) == 0:
            print("column %s  : all elements outlier"%column)
        else:
            df = kept
        print("column %s applied"%column)
        print("new lenght: %d" %len(df))
    return df

In [None]:
def draw_histogram(df, column_name, color, edge_color):
    """Draw a histogram of ``df[column_name]`` on a newly created figure.

    Args:
        df: frame holding the column to plot.
        column_name: column to plot; also used for the title and x label.
        color: bar fill colour.
        edge_color: bar edge colour.
    """
    _, axis = plt.subplots()
    # Draw onto the axes that was just created, with the grid turned off.
    df[column_name].hist(ax=axis, grid=False, color=color, edgecolor=edge_color)
    axis.set_title(column_name + ' Histogram', fontsize=12)
    for set_label, text in ((axis.set_xlabel, column_name),
                            (axis.set_ylabel, 'Frequency')):
        set_label(text, fontsize=12)

In [None]:
def create_bin_labels(df, column, quantile_list, quantile_labels):
    """Attach quantile-bin columns for ``column`` to ``df``.

    Two columns are added, both produced by ``pd.qcut`` with
    ``q=quantile_list``:

    * ``<column>_bin_custom_range`` -- the interval each value falls into;
    * ``<column>_bin_custom_label`` -- the matching entry of
      ``quantile_labels``.

    The columns are written onto ``df`` itself, which is also returned.
    """
    outputs = (
        ('_bin_custom_range', None),              # default interval labels
        ('_bin_custom_label', quantile_labels),   # caller-supplied labels
    )
    targets = [(column + suffix, labels) for suffix, labels in outputs]
    for target, labels in targets:
        df[target] = pd.qcut(df[column], q=quantile_list, labels=labels)
    return df

In [None]:
def run_sequential_model(X_train, X_test, y_train, y_test, batch_size, epochs, learning_rate, verbose):
    """Build, compile and fit a fully connected regression network.

    Architecture: Dense layers of 256, 128, 64, 32 and 8 ReLU units, each
    followed by batch normalisation, then a single linear output unit. The
    first layer takes its input width from ``X_train.shape[1]`` and carries
    an L1 activity regulariser (0.01). The model is compiled with MSE loss,
    an MSE metric and the Adam optimiser at ``learning_rate``.

    ``(X_test, y_test)`` is passed to ``fit`` as validation data.

    Returns:
        ``(train_history, model)`` -- the object returned by ``fit`` and the
        fitted model.
    """
    n_features = X_train.shape[1]

    network = Sequential()

    # The first hidden layer declares the input shape and is the only
    # regularised layer.
    network.add(Dense(256, activation='relu', input_shape=(n_features,),
                      activity_regularizer=regularizers.l1(0.01)))
    network.add(BatchNormalization())

    # Remaining hidden layers: progressively narrower Dense + BatchNorm pairs.
    for width in (128, 64, 32, 8):
        network.add(Dense(width, activation='relu'))
        network.add(BatchNormalization())

    network.add(Dense(1))

    optimizer = Adam(lr=learning_rate)
    network.compile(loss='mse', optimizer=optimizer, metrics=['mse'])

    fit_options = dict(
        epochs=epochs,
        batch_size=batch_size,
        verbose=verbose,
        validation_data=(X_test, y_test),
        shuffle=True,
    )
    history = network.fit(X_train, y_train, **fit_options)

    return history, network

In [None]:
def save_model_and_weights(model, model_json_name, model_weight_name):
    """Persist a model as a JSON architecture file plus a weights file.

    Args:
        model: object exposing ``to_json()`` and ``save_weights(path)``.
        model_json_name: path the JSON architecture is written to.
        model_weight_name: path handed to ``model.save_weights``.
    """
    # Serialise the architecture before opening the file, so a failure in
    # to_json() does not leave an empty file behind.
    architecture = model.to_json()
    with open(model_json_name, "w") as handle:
        handle.write(architecture)

    # The weights go to their own file.
    model.save_weights(model_weight_name)
    print("Saved model to disk")

In [None]:
def load_model_and_weights(model, model_json_name, model_weight_name):
    """Rebuild a model from its JSON architecture file and load its weights.

    Args:
        model: not used; kept so existing call sites keep working.
        model_json_name: path of the JSON architecture file.
        model_weight_name: path handed to ``load_weights`` on the rebuilt
            model.

    Returns:
        The model created by ``model_from_json`` with the weights loaded.
        Without this return value the loaded model would be unreachable
        for the caller.
    """
    # The context manager closes the file even if reading raises.
    with open(model_json_name, "r") as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)

    # load weights into the new model
    loaded_model.load_weights(model_weight_name)
    print("Loaded model from disk")
    return loaded_model

In [None]:
def plot_build(train_history):
    """Plot training loss (green) and validation loss (blue) per epoch.

    Draws on figure 0 with the y axis fixed to [0, 90], then shows the
    plot. Reads ``train_history.history['loss']`` and
    ``train_history.history['val_loss']``.
    """
    plt.figure(0)
    ax = plt.gca()
    ax.set_ylim([0,90])

    curves = train_history.history
    ax.plot(curves['loss'], 'g')
    ax.plot(curves['val_loss'], 'b')

    # Global default figure size. It is set after figure 0 has been
    # obtained, so presumably it only influences figures created later.
    plt.rcParams['figure.figsize'] = (8, 6)

    ax.set_xlabel("Num of Epochs")
    ax.set_ylabel("Loss")
    ax.set_title("Training Loss vs Validation Loss")
    ax.grid()
    ax.legend(['train','validation'])

    plt.show()

def plot_build_train_val_ratio(train_history):
    """Plot the per-epoch ratio of training loss to validation loss.

    Draws on figure 0 with the y axis fixed to [0, 5], then shows the plot.
    Reads ``train_history.history['loss']`` and
    ``train_history.history['val_loss']``.
    """
    plt.figure(0)
    ax = plt.gca()
    ax.set_ylim([0,5])

    curves = train_history.history
    paired = zip(curves['loss'], curves['val_loss'])
    ratios = [train_loss / val_loss for train_loss, val_loss in paired]
    ax.plot(ratios, 'g')

    # Global default figure size. It is set after figure 0 has been
    # obtained, so presumably it only influences figures created later.
    plt.rcParams['figure.figsize'] = (8, 6)

    ax.set_xlabel("Num of Epochs")
    ax.set_ylabel("Loss / Val_loss")
    ax.set_title("Training Loss and Validation Loss Ratio")
    ax.grid()
    ax.legend(['ratio'])

    plt.show()

def plot_build_mse(train_history):
    """Plot training MSE (green) and validation MSE (red) per epoch.

    Draws on figure 0 with the y axis fixed to [0, 100], then shows the
    plot.

    The metric is looked up under ``'mean_squared_error'`` first and
    ``'mse'`` second, with the validation series under the same name
    prefixed by ``'val_'``. Which spelling a history carries depends on the
    Keras release (presumably older releases expand the ``'mse'`` alias to
    the long name).

    Raises:
        KeyError: if neither spelling is present in the history.
    """
    history = train_history.history
    metric_key = next(
        (key for key in ('mean_squared_error', 'mse') if key in history), None)
    if metric_key is None:
        raise KeyError(
            "no MSE metric in training history; available keys: %s" % sorted(history))

    # plotting train_history
    plt.figure(0)
    axes = plt.gca()
    axes.set_ylim([0,100])
    plt.plot(history[metric_key],'g')
    plt.plot(history['val_' + metric_key],'r')
    # Global default figure size. It is set after figure 0 has been
    # obtained, so presumably it only influences figures created later.
    plt.rcParams['figure.figsize'] = (8, 6)
    plt.xlabel("Num of Epochs")
    plt.ylabel("mse")
    plt.title("MSE")
    plt.grid()
    plt.legend(['loss_mse', 'val_mse'])

    plt.show()