In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import math

from keras.models import Model, Sequential
from keras.layers import Input, Dense, Dropout, BatchNormalization
from keras.optimizers import Adadelta
from keras import regularizers

from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Only the first million training rows are read; the test file is read in full.
train_df = pd.read_csv('input/train.csv', nrows=1_000_000)
test_df = pd.read_csv('input/test.csv')

In [None]:
# Discard every training row that has at least one missing value,
# reporting the row count before and after.
print('Old size: %d' % train_df.shape[0])
train_df = train_df.dropna(axis=0, how='any')
print('New size: %d' % train_df.shape[0])

In [None]:
#Min value of the fare_amount is less than zero and min passenger count is zero. We should discard those values.
def normalize_fare_passenger(df):
    """Filter out rows with a non-positive fare or passenger count.

    The fare filter is applied only when the frame has a ``fare_amount``
    column, so the function can also be used on frames without a label.
    The input frame is not modified; a filtered frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``passenger_count``; ``fare_amount`` is optional.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` with ``fare_amount > 0`` (when present) and
        ``passenger_count > 0``.
    """
    if 'fare_amount' in df.columns:
        print("old length: %d" % len(df))
        df = df[df.fare_amount > 0]
        # Report the fare filter only when it actually ran; printing it for
        # frames without a fare column would be misleading.
        print("length after fare_amount normalization: %d" % len(df))
    df = df[df.passenger_count > 0]
    print("length after passenger_count normalization: %d" % len(df))
    return df

# Keep only training rows with a positive fare and a positive passenger count.
train_df = normalize_fare_passenger(train_df)

In [None]:
# Calc haversine distance
def calc_haversine(df):
    """Add coordinate-difference and haversine-distance columns to a trip frame.

    The computation runs on a copy, so the caller's frame is left untouched
    (this also avoids ``SettingWithCopyWarning`` when ``df`` is a filtered
    slice of another frame).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude``, in degrees
        (they are passed through ``np.radians``).

    Returns
    -------
    pandas.DataFrame
        A new frame with these columns appended, in this order:
        ``abs_diff_longitude``, ``abs_diff_latitude``, ``dlat``, ``dlon``,
        ``haversine_a``, ``haversine``. ``haversine`` is the distance scaled
        by ``R = 6371`` (Earth radius in km). The ``pickup_datetime`` column
        is removed when present.
    """
    R = 6371
    df = df.copy()

    lat_delta = df.dropoff_latitude - df.pickup_latitude
    lon_delta = df.dropoff_longitude - df.pickup_longitude

    df['abs_diff_longitude'] = lon_delta.abs()
    df['abs_diff_latitude'] = lat_delta.abs()

    df['dlat'] = np.radians(lat_delta)
    df['dlon'] = np.radians(lon_delta)
    df['haversine_a'] = np.sin(df.dlat / 2) * np.sin(df.dlat / 2) + np.cos(np.radians(df.pickup_latitude)) \
            * np.cos(np.radians(df.dropoff_latitude)) * np.sin(df.dlon / 2) * np.sin(df.dlon / 2)

    # The haversine term lies in [0, 1] mathematically, but floating-point
    # rounding can push it marginally outside, which would make the square
    # roots below return NaN. Clamp only the value used for the distance and
    # leave the stored ``haversine_a`` feature as computed.
    a = df['haversine_a'].clip(lower=0, upper=1)
    df['haversine'] = R * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # errors='ignore' keeps the function usable on a frame that has already
    # been processed (e.g. when the notebook cell is re-run).
    return df.drop(columns=['pickup_datetime'], errors='ignore')

# Add the coordinate-difference and haversine columns to both frames;
# calc_haversine also removes pickup_datetime from the returned frame.
train_df = calc_haversine(train_df)
test_df = calc_haversine(test_df)

In [None]:
# Split the training frame into the regression target (fare_amount) and the
# feature matrix (everything except the target and the row key).
train_y = np.array(train_df.fare_amount)
train_X = train_df.drop(columns=['fare_amount', 'key'])

print("Shape for X:", train_X.shape, sep="\n")
print("Shape for Y:", train_y.shape, sep="\n")

# The test frame has no target column here; only the row key is removed.
test_X = test_df.drop(columns=['key'])
print("Shape for test X:", test_X.shape, sep="\n")

In [None]:
def run_model(X, Y, dnn_layers_size, dropout_value, batch_size, epochs):
    """Build, compile and fit a fully connected regression network.

    One ``Dense`` layer (SELU activation, 'normal' kernel initializer)
    followed by a ``Dropout`` layer is added per entry in
    ``dnn_layers_size``; a single linear output unit closes the stack.
    The model is compiled with mean-squared-error loss and the 'adam'
    optimizer, then fitted with ``validation_split=0.1`` and shuffling on.

    Parameters
    ----------
    X : feature matrix; ``X.shape[1]`` is used as the input width.
    Y : regression targets passed straight to ``model.fit``.
    dnn_layers_size : iterable of hidden-layer widths.
    dropout_value : rate passed to every ``Dropout`` layer.
    batch_size, epochs : forwarded to ``model.fit``.

    Returns
    -------
    tuple
        ``(train_history, model)`` - the object returned by ``model.fit``
        and the fitted model.
    """
    model = Sequential()

    # Width feeding the next Dense layer: starts at the feature count and
    # then follows the width of the layer just added.
    incoming_width = X.shape[1]
    for width in dnn_layers_size:
        model.add(
            Dense(
                width,
                input_dim=incoming_width,
                kernel_initializer='normal',
                activation='selu',
            )
        )
        model.add(Dropout(dropout_value))
        incoming_width = width

    # Single linear unit for the regression output.
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')

    fit_options = dict(
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.1,
        shuffle=True,
    )
    train_history = model.fit([X], Y, **fit_options)

    return train_history, model

def build_layers(layers, n_features):
    """Extend ``layers`` with a shrinking sequence of layer widths.

    While ``layers`` is empty the next width is ``int(n_features * 2.5)``;
    once it holds at least one entry each further width is the integer
    square root of the previous one. Generation stops as soon as a
    candidate width drops below 3 (that candidate is not added).

    ``layers`` is appended to in place and the same list is returned.
    """
    while True:
        if len(layers) > 0:
            n_features = int(math.sqrt(n_features))
        else:
            n_features = int(n_features * 2.5)

        if n_features < 3:
            return layers

        layers.append(n_features)
    
def plot_build(train_history):
    """Plot training loss (green) against validation loss (blue) per epoch.

    Parameters
    ----------
    train_history
        Object whose ``history`` mapping holds the ``'loss'`` and
        ``'val_loss'`` sequences (as returned by ``run_model``).

    The y-axis is fixed to the range [0, 90].
    """
    # 'figure.figsize' is only consulted when a figure is created, so it must
    # be set before plt.figure(); setting it afterwards (as before) left the
    # current plot at the old size. The rcParams assignment is kept so later
    # figures in the session get the same default as they did previously.
    plt.rcParams['figure.figsize'] = (8, 6)
    fig = plt.figure(0)
    # Also resize explicitly in case figure 0 already existed and was reused.
    fig.set_size_inches(8, 6)

    axes = fig.gca()
    axes.set_ylim([0, 90])
    plt.plot(train_history.history['loss'], 'g')
    plt.plot(train_history.history['val_loss'], 'b')
    plt.xlabel("Num of Epochs")
    plt.ylabel("Loss")
    plt.title("Training Loss vs Validation Loss")
    plt.grid()
    plt.legend(['train', 'validation'])

    plt.show()

def plot_build_train_val_ratio(train_history):
    """Plot the per-epoch ratio of training loss to validation loss.

    Parameters
    ----------
    train_history
        Object whose ``history`` mapping holds the ``'loss'`` and
        ``'val_loss'`` sequences (as returned by ``run_model``).

    The y-axis is fixed to the range [0, 5].
    """
    # 'figure.figsize' is only consulted when a figure is created, so it must
    # be set before plt.figure(); setting it afterwards (as before) left the
    # current plot at the old size. The rcParams assignment is kept so later
    # figures in the session get the same default as they did previously.
    plt.rcParams['figure.figsize'] = (8, 6)
    fig = plt.figure(0)
    # Also resize explicitly in case figure 0 already existed and was reused.
    fig.set_size_inches(8, 6)

    axes = fig.gca()
    axes.set_ylim([0, 5])

    train_loss = train_history.history['loss']
    val_loss = train_history.history['val_loss']
    plt.plot([x / y for x, y in zip(train_loss, val_loss)], 'g')
    plt.xlabel("Num of Epochs")
    plt.ylabel("Loss / Val_loss")
    plt.title("Training Loss and Validation Loss Ratio")
    plt.grid()
    plt.legend(['ratio'])

    plt.show()

In [None]:
# Derive the hidden-layer widths from the number of feature columns.
layers = build_layers([], len(train_X.columns))
print('Layers:', layers)
print('-' * 15)

# Optional experiment (disabled): train without the passenger_count feature.
#train_X.drop(columns=['passenger_count'], axis=1, inplace=True)

train_history, model = run_model(
    train_X,
    train_y,
    layers,
    0.2,
    batch_size=32,
    epochs=1000,
)

In [None]:
# Training vs. validation loss for each epoch of the run above.
plot_build(train_history)

In [None]:
# Per-epoch ratio of training loss to validation loss.
plot_build_train_val_ratio(train_history)

In [None]:
# Generating DNN submission: predict on the test features, keep the
# predictions on test_df, and write the key / fare_amount pairs to CSV.
pred_y = model.predict([test_X])
test_df['pred'] = pred_y

submission = test_df[['key', 'pred']].rename(columns={'pred': 'fare_amount'})
submission.to_csv('submission_dnn.csv', index=False)

# List the working directory so the written file shows up in the cell output.
print(os.listdir('.'))