In [None]:
import os

# import_ipynb has to be imported before kernel_functions is: it is expected to install the
# hook that lets a notebook be imported as a module (NOTE(review): confirm kernel_functions
# is a notebook) — keep this ordering.
import import_ipynb
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split

from kernel_functions import *

In [None]:
# Run configuration (tune here, then re-run the notebook from the top)
ROWS = 1000000          # how many rows of input/train.csv are read
BATCH = 1024            # handed to run_sequential_model
EPOCH = 100             # handed to run_sequential_model
LEARNING_RATE = 1e-3    # handed to run_sequential_model

In [None]:
# Load the inputs: the complete test set, and only the first ROWS rows of the training set
test_df = pd.read_csv('input/test.csv')
train_df = pd.read_csv('input/train.csv', nrows=ROWS)

In [None]:
# Number of missing values in each test column
print(test_df.isna().sum())
# Number of entries equal to zero in each test column
print(test_df.eq(0).astype(int).sum())
# Author's observation from the output: the test data is very clean,
# containing neither null values nor zero values.

In [None]:
# Summary statistics of the test set (shown as this cell's output)
test_df.describe()
# The description of the test data shows the min and max value of each feature, so we can
# choose to clean the train data based on these values. In other words, values in the train
# data that fall outside these boundaries can be deleted, because the train data is only used
# to fit a model that predicts on the test data.

In [None]:
# Number of missing values in each training column
print(train_df.isna().sum())
# Number of entries equal to zero in each training column
print(train_df.eq(0).astype(int).sum())
# Author's observation from the output: the training data contains some null and zero values.
# This check is very important, as such values can influence the training result significantly.

In [None]:
# Summary statistics of the training set (shown as this cell's output)
train_df.describe()
# Some values are apparently wrong, e.g. the min of fare_amount is negative, which cannot be.
# The max passenger count is 208, which is far too large. These values have to be deleted, but
# that is not a problem: the useless values will be removed based on the value boundaries
# seen in the test data.

In [None]:
# Remove every row that has at least one missing value; the row count is
# printed before and after so the amount of deleted data can be tracked.
print("old: %d" % len(train_df))
train_df = train_df.dropna(axis=0, how='any')
print("new: %d" % len(train_df))

# Keep only the rows in which no column is equal to zero (same before/after tracking).
print("old: %d" % len(train_df))
train_df = train_df[(train_df != 0).all(axis=1)]
print("new: %d" % len(train_df))

In [None]:
# The minimum fare_amount is below zero and the minimum passenger count is zero; such rows should be discarded.
# NOTE(review): normalize_fare_passenger is expected to do that filtering, but the helper is not
# visible in this notebook — confirm what it actually removes.
train_df = normalize_fare_passenger(train_df)

In [None]:
# Run the convert_to_datetime helper over the training and the test frame
# (author's note: it "strips" the 'pickup_datetime' column — presumably parsing it; see helper).
train_df, test_df = map(convert_to_datetime, (train_df, test_df))

# Shapes after the step: test frame first, then training frame
print(test_df.shape, train_df.shape, sep='\n')

In [None]:
# Run the extract_date helper over the training and the test frame
# (author's note: it extracts date attributes and then drops the pickup_datetime column).
train_df, test_df = map(extract_date, (train_df, test_df))

# Shapes after the step: test frame first, then training frame
print(test_df.shape, train_df.shape, sep='\n')

In [None]:
# Extra charges apply if a trip ends at one of 3 nearby airports or in one of 7 nearby counties
# around the NYC center, so these locations and their distances to the pickup and dropoff
# points are key factors.
# The transform helper is run over the training and the test frame.
train_df, test_df = map(transform, (train_df, test_df))

# Shapes after the step: test frame first, then training frame
print(test_df.shape, train_df.shape, sep='\n')

In [None]:
# Account for extra charges: the final_convert helper is run over the training and the test frame.
train_df, test_df = map(final_convert, (train_df, test_df))

# Shapes after the step: test frame first, then training frame
print(test_df.shape, train_df.shape, sep='\n')

In [None]:
# Drop the 'key' column from the training data (the test frame keeps it).
# Reassigning with errors='ignore' keeps this cell re-runnable: the previous in-place drop
# raised a KeyError when the cell was executed a second time.
train_df = train_df.drop(columns=['key'], errors='ignore')

In [None]:
#OUTLIER DETECTION (Mean-Std)
# Run the outlier_analysis helper on the training data and report the row count before
# and after, in the same way the null/zero clean-up cell tracks its deletions.

print("old length: %d" % len(train_df))
train_df = outlier_analysis(train_df)
print("new length: %d" % len(train_df))

In [None]:
# Columns picked for removal after the outlier detection step; per the author's note these
# columns "return 0 rows" after that analysis. Dropped from the training frame in place.
train_df.drop(
    columns=[
        'county_dropoff_1',
        'county_dropoff_2',
        'night_hour',
        'peak_hour',
        'to_from_jfk',
        'jfk_rush_hour',
        'ewr',
    ],
    inplace=True,
)

In [None]:
# Histogram of the fare_amount column, drawn by the draw_histogram helper
# with the given fill colour and edge colour.
draw_histogram(train_df, 'fare_amount', color='#A9C5D3', edge_color='black')

In [None]:
# Quartile boundaries, and one label per interval between consecutive boundaries
quantile_list = [0, 0.25, 0.5, 0.75, 1.0]
quantile_labels = ['%d-%dQ' % (lower, lower + 25) for lower in range(0, 100, 25)]

In [None]:
# Bin fare_amount through the create_bin_labels helper, using quantile_list and
# quantile_labels (author's note: adaptive binning can be used).
train_df = create_bin_labels(
    train_df,
    'fare_amount',
    quantile_list,
    quantile_labels,
)
# Preview: the fare next to its bin range and bin label, first rows only
train_df.head()[['fare_amount', 'fare_amount_bin_custom_range', 'fare_amount_bin_custom_label']]

In [None]:
#FEATURE SELECTION
# Rank the candidate features by fitting an ExtraTreesClassifier that predicts the fare bin
# label from every column except the fare itself and its two derived bin columns.
fare_drop_df = train_df.drop(
    columns=['fare_amount', 'fare_amount_bin_custom_range', 'fare_amount_bin_custom_label'])

# A fixed random_state makes the importances identical from run to run.
model = ExtraTreesClassifier(random_state=42)
# Only the first 10000 rows are used for the fit. .iloc takes them by position on both sides,
# so features and labels stay aligned even though the index has gaps after the row filtering.
model.fit(fare_drop_df.iloc[:10000], train_df['fare_amount_bin_custom_label'].iloc[:10000])

# One line per feature: name -> importance rounded to 5 decimals
for feature_name, importance in zip(fare_drop_df.columns, model.feature_importances_):
    print("%s -> %s" % (feature_name, round(importance, 5)))

In [None]:
# Show the fare bin label column as this cell's output
train_df['fare_amount_bin_custom_label']

In [None]:
# Keep only the selected features: the training frame keeps the target (fare_amount) plus
# six features; the test frame keeps its 'key' plus exactly the same six features.
train_df = train_df[[
    'fare_amount',
    'pickup_longitude',
    'dropoff_distance_to_center',
    'pickup_distance_to_Suffolk',
    'dropoff_distance_to_Dutchess',
    'pickup_distance_to_lgr',
    'dropoff_distance_to_lgr',
]]
# Feature columns are taken from the training selection just made, so the two frames cannot drift apart
test_df = test_df[['key'] + train_df.columns.drop('fare_amount').tolist()]

In [None]:
#Split data
# Hold out 20% of the training rows; fare_amount is the target, all other columns are features.
# random_state is fixed so that the same split is produced on every run.
# train_test_split is imported in the imports cell at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop('fare_amount', axis=1),
    train_df['fare_amount'],
    test_size=0.2,
    random_state=42,
)

# Check shape
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Hand the split data and the BATCH / EPOCH / LEARNING_RATE settings to the run_sequential_model
# helper (verbose=2 is forwarded as well) and unpack the returned (history, model) pair.
# X_test / y_test are passed too — presumably as validation data; confirm in the helper.
# Note: `model` is rebound here; it previously held the ExtraTreesClassifier used for feature selection.
train_history, model = run_sequential_model(X_train, X_test, y_train, y_test, BATCH, EPOCH, LEARNING_RATE, verbose=2)

In [None]:
#SAVE MODEL AND WEIGHTS
# Passes the model to the save_model_and_weights helper with two file names
# (presumably "model.json" for the architecture and "model.h5" for the weights — confirm in the helper).
save_model_and_weights(model, "model.json", "model.h5")

In [None]:
#LOAD MODEL AND WEIGHTS
# Calls the load_model_and_weights helper with the current model and the "model.json" / "model.h5" file names.
# NOTE(review): the return value is not assigned. If the helper returns a newly loaded model instead of
# updating `model` in place, that loaded model is not the one used by the following cells — confirm.
load_model_and_weights(model, "model.json", "model.h5")

In [None]:
# Visualise the training history with the plot_build helper (which curves are drawn is decided by the helper)
plot_build(train_history)

In [None]:
# Visualise the training history with the plot_build_train_val_ratio helper
# (presumably a train-vs-validation comparison, going by its name — confirm in the helper)
plot_build_train_val_ratio(train_history)

In [None]:
# Visualise the training history with the plot_build_mse helper
# (presumably mean squared error curves, going by its name — confirm in the helper)
plot_build_mse(train_history)

In [None]:
# Generating DNN submission
# The feature matrix is every test column except the identifier. 'pred' is excluded as well:
# this cell stores its predictions in test_df['pred'], so without the exclusion a second run
# of the cell would feed that column back into the model as an extra feature.
pred_y = model.predict([test_df.drop(columns=['key', 'pred'], errors='ignore')])
test_df['pred'] = pred_y

# Two-column submission file: the row key and the predicted fare
submission = pd.DataFrame(
    {'key': test_df['key'], 'fare_amount': test_df['pred']},
    columns=['key', 'fare_amount'])
submission.to_csv('submission_dnn.csv', index=False)

# List the working directory so the written file can be seen
print(os.listdir('.'))