In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [12]:
# In order to get reproducible results

# Seed value (can actually be different for each attribution step)
seed_value= 0

# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)

# 3. Set `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)

In [13]:
def load_data():
    
    def dummie_and_drop(df, name):
        # Creates a dummy variable, concatenates it and finally drops the original categorical variable.
        # In order not to have redundant variables, one of the dummy variables is dropped too
        dummies = pd.get_dummies(df[name]).rename(columns = lambda x: name + '_' + str(x))
        dummies = dummies.drop(dummies.columns[-1], axis = 1)
        df = pd.concat([df, dummies], axis = 1)
        df.drop(columns = [name], inplace=True, axis=1)

        return df
    
    def convert_to_categorical(df, categorical_variables, categories, need_pickup = True):
        """ 
        The dataframe's selected variables are converted to categorical, and each variable's categories are also specified.
        It is also specified if the "pickup community area" has to be converted into categorical or no. If it is not 
        converted into categorical it is because it's not going to be used in the model.            
        """
        
        if need_pickup:
            begin = 0
        else:
            df.drop(columns = ['pickup_community_area'], inplace = True, axis = 1)
            begin = 1
        
        for i in range(begin, len(categorical_variables)):
            df[categorical_variables[i]] = df[categorical_variables[i]].astype('category').cat.set_categories(categories[i])
        return df
    
    
    def load(name, need_pickup = False, drop_correlated = False):
    
        # This parameter has to be set to True if the "pickup_community_area" variable is needed in the model
        

        # Load needed dataset and choose the useful columns
        df = pd.read_csv(name) #'dataset_train.csv')
        x = df[['pickup_community_area' ,'temperature', 'relative_humidity', 'wind_direction', 'wind_speed', 'precipitation_cat', 
                'sky_level', 'daytype', 'Day Name', 'Month', 'Hour', 'Fare Last Month', 'Trips Last Hour',
                'Trips Last Week (Same Hour)', 'Trips 2 Weeks Ago (Same Hour)', 'Quarter', 'Year', 'trip_start_timestamp']]

        # Convert the categorical variables
        categorical_variables = ['pickup_community_area', 'daytype', 'sky_level', 'Day Name', 'Month','Hour', 'Year']
        categories = [[*(range(1,78))], ['U', 'W', 'A'], ['OVC', 'BKN', 'SCT', 'FEW', 'CLR', 'VV '], 
                      ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], 
                      [*(range(1,13))], [*(range(0, 24))], ['2017', '2018', '2019']]

        x = convert_to_categorical(x, categorical_variables, categories, need_pickup = need_pickup)

        
        # Make dummy variables with the categorical ones
        if need_pickup:
            begin = 0
        else:
            begin = 1
        for i in range(begin, len(categorical_variables)):
            x = dummie_and_drop(x, name = categorical_variables[i])

        y = df['Trips'].to_numpy()

        if need_pickup == False:
            # If we don't need the pickup, it means this is Neural Network case. Therefore we have to modify Y, in order
            # to have "n_areas" outputs per input (because there are "n_areas" regressions per input)
            x = x.groupby(by = 'trip_start_timestamp').mean()
            n_areas = 77
            y = np.reshape(y, [-1, n_areas]) # If 
        else:
            x.drop(columns = ['trip_start_timestamp'], inplace = True, axis = 1)
        
        if drop_correlated:
            x.drop(columns = ['Trips Last Week (Same Hour)'], inplace = True, axis = 1)
            x.drop(columns = ['Trips 2 Weeks Ago (Same Hour)'], inplace = True, axis = 1)

        x = x.to_numpy()
        
        return (x,y)   
    

    need_pickup = False 
    drop_correlated = False
    
    
    name_train = 'dataset_train.csv'
    name_test = 'dataset_test.csv'
    x, y = load(name_train, need_pickup, drop_correlated)
    x_test, y_test = load(name_test, need_pickup, drop_correlated)
    
    
    return (x, x_test, y, y_test)

In [7]:
x, x_test, y, y_test = load_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
x.shape

(1864632, 59)

In [9]:
y.shape

(24216, 77)

In [22]:
def load_data_taxi():
    
    def dummie_and_drop(df, name):
        # Creates a dummy variable, concatenates it and finally drops the original categorical variable.
        # In order not to have redundant variables, one of the dummy variables is dropped too
        dummies = pd.get_dummies(df[name]).rename(columns = lambda x: name + '_' + str(x))
        dummies = dummies.drop(dummies.columns[-1], axis = 1)
        df = pd.concat([df, dummies], axis = 1)
        df.drop(columns = [name], inplace=True, axis=1)

        return df
    
    def convert_to_categorical(df, categorical_variables, categories, need_pickup = True):
        """ 
        The dataframe's selected variables are converted to categorical, and each variable's categories are also specified.
        It is also specified if the "pickup community area" has to be converted into categorical or no. If it is not 
        converted into categorical it is because it's not going to be used in the model.            
        """
        
        if need_pickup:
            begin = 0
        else:
            df.drop(columns = ['pickup_community_area'], inplace = True, axis = 1)
            begin = 1
        
        for i in range(begin, len(categorical_variables)):
            df[categorical_variables[i]] = df[categorical_variables[i]].astype('category').cat.set_categories(categories[i])
        return df
    
    
    def load(name, need_pickup = False, drop_correlated = False):
    
        # This parameter has to be set to True if the "pickup_community_area" variable is needed in the model
        need_pickup = False 

        # Load needed dataset and choose the useful columns
        df = pd.read_csv(name) #'dataset_train.csv')
        x = df[['pickup_community_area', 'Day Name', 'Month', 'Hour', 'Fare Last Month', 'Tips Last Month', 
                'Trips Last Hour', 'Trips Last Week (Same Hour)', 'Trips 2 Weeks Ago (Same Hour)', 'Year']]

        # Convert the categorical variables
        categorical_variables = ['pickup_community_area', 'Day Name', 'Month','Hour', 'Year']
        categories = [[*(range(1,78))], ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], 
                      [*(range(1,13))], [*(range(0, 24))], ['2017', '2018', '2019']]

        x = convert_to_categorical(x, categorical_variables, categories, need_pickup = need_pickup)

        
        # Make dummy variables with the categorical ones
        if need_pickup:
            begin = 0
        else:
            begin = 1
        for i in range(begin, len(categorical_variables)):
            x = dummie_and_drop(x, name = categorical_variables[i])

        y = df['Trips'].to_numpy()

        if need_pickup == False:
            # If we don't need the pickup, it means this is Neural Network case. Therefore we have to modify Y, in order
            # to have "n_areas" outputs per input (because there are "n_areas" regressions per input)
            n_areas = 77
            y = np.reshape(y, [-1, n_areas]) 
            
        if drop_correlated:
            x.drop(columns = ['Trips Last Week (Same Hour)'], inplace = True, axis = 1)
            x.drop(columns = ['Trips 2 Weeks Ago (Same Hour)'], inplace = True, axis = 1)

        x = x.to_numpy()
        
        return (x,y)

    need_pickup = True 
    drop_correlated = True
    
    name_train = 'dataset_train.csv'
    name_test = 'dataset_test.csv'
    x_train, y_train = load(name_train, need_pickup, drop_correlated)
    x_test, y_test = load(name_test, need_pickup, drop_correlated)
    
    
    return (x_train, x_test, y_train, y_test)

In [11]:
# x.shape

In [14]:
def plot_results(history):
    import matplotlib.pyplot as plt

    # Plot training & validation accuracy values
    plt.plot(history.history['mae'])
    plt.plot(history.history['val_mae'])
    plt.title('Model mean absolute error')
    plt.ylabel('MAE')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

    # Plot training & validation loss values
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

In [15]:
def create_model(x, x_test, y, y_test): #n_areas, features, x_train, y_train):
    from keras.models import Sequential
    from keras.layers import Dense
    
    # In order to get reproducible results
    import tensorflow as tf
    tf.random.set_seed(2020)
    from numpy.random import seed
    seed(1)
    
    n_areas = y.shape[1]
    features = x.shape[1]
    
    act = {{choice(['relu', 'sigmoid', 'tanh'])}} # Choose the activation function
    
    model = Sequential()
    # Choose the architecture
    architecture = {{choice(['arc_0', 'arc_1', 'arc_2', 'arc_3', 'arc_4'])}}
    
    if architecture == 'arc_0':
        model.add(Dense(32, activation = act, input_shape = (features,)))
        model.add(Dense(64, activation= act))
        model.add(Dense(32, activation= act))
    
    elif architecture == 'arc_1':
        model.add(Dense(64, activation = act, input_shape = (features,)))
        model.add(Dense(128, activation= act))
        model.add(Dense(256, activation= act))
        model.add(Dense(128, activation= act))
    
    elif architecture == 'arc_2':
        model.add(Dense(128, activation = act, input_shape = (features,)))
        model.add(Dense(128, activation= act))
        model.add(Dense(256, activation= act))
        model.add(Dense(512, activation= act))
        model.add(Dense(256, activation= act))
        
    elif architecture == 'arc_3':
        model.add(Dense(128, activation = act, input_shape = (features,)))
        model.add(Dense(256, activation= act))
        model.add(Dense(512, activation= act))
        model.add(Dense(1024, activation= act))
        model.add(Dense(512, activation= act))
        model.add(Dense(256, activation= act))
        model.add(Dense(128, activation= act))
        
    elif architecture == 'arc_4':
        model.add(Dense(256, activation = act, input_shape = (features,)))
        model.add(Dense(256, activation= act))
        model.add(Dense(512, activation= act))
        model.add(Dense(512, activation= act))
        model.add(Dense(1024, activation= act))
        model.add(Dense(1024, activation= act))
        model.add(Dense(512, activation= act))
        model.add(Dense(256, activation= act))

    model.add(Dense(n_areas))
    
    
    model.compile(optimizer = {{choice(['adam', 'rmsprop' , 'sgd'])}}, loss = 'mse', metrics = ['mae'])
    model.summary()
    
    # checkpoint
#     filepath="weights-improvement-{epoch:02d}-{val_mae:.2f}.hdf5"
#     checkpoint = ModelCheckpoint(filepath, monitor='val_mae', verbose=1, save_best_only=True, mode='min')
#     callbacks_list = [checkpoint]
    
#     model.fit(X, Y, validation_split=0.33, epochs=150, batch_size=10, verbose = 0) #callbacks=callbacks_list, verbose=0)
    
    result = model.fit(x = x, y = y, validation_split = 0.15, 
                        batch_size = {{choice([32, 64, 128])}},
                        epochs = 200, verbose = 0)
    
    validation_mae = np.amin(result.history['val_mae']) 
    print('Best validation mae of epoch:', validation_mae)
    return {'loss': validation_mae, 'status': STATUS_OK, 'model': model}


In [17]:
exec('from __future__ import absolute_import, division, print_function')
import numpy as np
from hyperas import optim
from hyperas.distributions import choice, uniform
from hyperopt import Trials, STATUS_OK, tpe

best_run, best_model = optim.minimize(model = create_model,
                                      data = load_data,
                                      algo = tpe.suggest, 
                                      max_evals = 8,
                                      trials=Trials(),
                                      notebook_name = 'Model_Neural_Network')
x, x_test, y, y_test = load_data()
print("Evalutation of best performing model:")
print(best_model.evaluate(x_test, y_test))
print("Best performing model chosen hyper-parameters:")
print(best_run)

# plot_results(history)

>>> Imports:
#coding=utf-8

try:
    import pandas as pd
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    from sklearn.model_selection import train_test_split
except:
    pass

try:
    import os
except:
    pass

try:
    import random
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    import matplotlib.pyplot as plt
except:
    pass

try:
    from keras.models import Sequential
except:
    pass

try:
    from keras.layers import Dense
except:
    pass

try:
    import tensorflow as tf
except:
    pass

try:
    from numpy.random import seed
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    from hyperas import optim
except:
    pass

try:
    from hyperas.distributions import choice, uniform
except:
    pass

try:
    from hyperopt import Trials, STATUS_OK, tpe
except:
    pass

try:
    from keras.models import model_from_json
except:
    pass

>>> Hyperas search space:

def get_space():
    return {
        'act': 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[categorical_variables[i]] = df[categorical_variables[i]].astype('category').cat.set_categories(categories[i])


Model: "sequential_2"                                
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 32)                1920      
_________________________________________________________________
dense_6 (Dense)              (None, 64)                2112      
_________________________________________________________________
dense_7 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_8 (Dense)              (None, 77)                2541      
Total params: 8,653                                  
Trainable params: 8,653                              
Non-trainable params: 0                              
_________________________________________________________________
Best validation mae of epoch:                        
1.9999483823776245                                   
Model: "sequenti

Total params: 1,396,557                                                         
Trainable params: 1,396,557                                                     
Non-trainable params: 0                                                         
_________________________________________________________________               
Best validation mae of epoch:                                                   
1.9460952281951904                                                              
Model: "sequential_7"                                                           
_________________________________________________________________               
Layer (type)                 Output Shape              Param #                  
dense_36 (Dense)             (None, 128)               7680                     
_________________________________________________________________               
dense_37 (Dense)             (None, 256)               33024                    
____________________________

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Evalutation of best performing model:
[16.298441229926215, 1.2575715780258179]
Best performing model chosen hyper-parameters:
{'act': 0, 'architecture': 2, 'batch_size': 0, 'optimizer': 0}


In [18]:
np.round(best_model.predict(x[0:1])).astype('int')

array([[ 2,  2,  3,  2,  1,  9,  5, 34,  0,  0,  1,  0,  0,  1,  1,  2,
         0,  0,  0,  0,  1,  2,  0,  3,  0,  0,  0,  8,  0,  0,  0, 16,
         1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  6,  3]])

In [19]:
y[0]

array([ 1,  0,  4,  3,  2, 33, 11, 58,  0,  0,  1,  0,  0,  4,  0,  0,  0,
        0,  0,  0,  2, 11,  0,  7,  3,  0,  0,  9,  0,  0,  1,  5,  1,  0,
        0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  7,  4], dtype=int64)

In [22]:
best_model.evaluate(x_test[0:1], y_test[0:1])



[6.302816390991211, 0.8905684351921082]

In [17]:
x_test[0:1].shape

(1, 66)

In [18]:
best_model.predict(x_test[0:1])

array([[ 2.37974119e+00,  2.24573636e+00,  3.78148508e+00,
         2.38664103e+00,  2.00838447e+00,  1.27829924e+01,
         6.88590622e+00,  5.11086655e+01, -2.03334481e-01,
         4.65536833e-01,  6.12488925e-01, -2.29467943e-01,
         3.05308819e-01,  1.66380143e+00,  6.24546885e-01,
         1.87013221e+00,  1.96223632e-01, -1.15499526e-01,
         8.94508213e-02,  1.47701412e-01,  6.10566974e-01,
         2.25085664e+00,  1.65941089e-01,  5.27327442e+00,
         3.54929894e-01,  8.10648575e-02,  1.00148916e-02,
         1.21229258e+01,  2.68596530e-01, -4.55120504e-02,
         6.19136810e-01,  2.08747215e+01,  2.68774939e+00,
         1.59481615e-01,  1.83791831e-01,  2.04227477e-01,
         6.04685321e-02,  1.46532863e-01,  1.84166983e-01,
        -3.69561613e-01,  3.91467571e-01, -2.94932425e-02,
         8.51429939e-01,  1.13504231e-01,  4.07641351e-01,
         1.56543374e-01,  5.24115264e-02,  1.81086332e-01,
         7.37116933e-02, -7.44217634e-02,  2.51282096e-0

# Save the model

In [23]:
from keras.models import model_from_json
# serialize model to JSON
model_json = best_model.to_json()
with open("model_neural_network.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
best_model.save_weights("model_neural_network.h5")
print("Saved model to disk")

Saved model to disk
