In [2]:
import pandas as pd
import numpy as np

In [6]:
from keras.models import model_from_json
# load json and create model
json_file = open('model_neural_network.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
# load weights into new model
model.load_weights("model_neural_network.h5")
print("Loaded model from disk")

Using TensorFlow backend.


Loaded model from disk


In [7]:
def load_data(date):
    
    def dummie_and_drop(df, name):
        # Creates a dummy variable, concatenates it and finally drops the original categorical variable.
        # In order not to have redundant variables, one of the dummy variables is dropped too
        dummies = pd.get_dummies(df[name]).rename(columns = lambda x: name + '_' + str(x))
        dummies = dummies.drop(dummies.columns[-1], axis = 1)
        df = pd.concat([df, dummies], axis = 1)
        df.drop(columns = [name], inplace=True, axis=1)

        return df
    
    def convert_to_categorical(df, categorical_variables, categories, need_pickup = True):
        """ 
        The dataframe's selected variables are converted to categorical, and each variable's categories are also specified.
        It is also specified if the "pickup community area" has to be converted into categorical or no. If it is not 
        converted into categorical it is because it's not going to be used in the model.            
        """
        
        if need_pickup:
            begin = 0
        else:
            df.drop(columns = ['pickup_community_area'], inplace = True, axis = 1)
            begin = 1
        
        for i in range(begin, len(categorical_variables)):
            df[categorical_variables[i]] = df[categorical_variables[i]].astype('category').cat.set_categories(categories[i])
        return df
    
    
    def load(name, date, need_pickup = False, drop_correlated = False):
    
        # This parameter has to be set to True if the "pickup_community_area" variable is needed in the model
        

        # Load needed dataset and choose the useful columns
        df = pd.read_csv(name) #'dataset_train.csv')
        df = df[df['trip_start_timestamp'].str.slice(start = 0, stop = 13) == date]
        x = df[['pickup_community_area' ,'temperature', 'relative_humidity', 'wind_direction', 'wind_speed', 'precipitation_cat', 
                'sky_level', 'daytype', 'Day Name', 'Month', 'Hour', 'Fare Last Month', 'Trips Last Hour',
                'Trips Last Week (Same Hour)', 'Trips 2 Weeks Ago (Same Hour)', 'Quarter', 'Year', 'trip_start_timestamp']]

        # Convert the categorical variables
        categorical_variables = ['pickup_community_area', 'daytype', 'sky_level', 'Day Name', 'Month','Hour', 'Year']
        categories = [[*(range(1,78))], ['U', 'W', 'A'], ['OVC', 'BKN', 'SCT', 'FEW', 'CLR', 'VV '], 
                      ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], 
                      [*(range(1,13))], [*(range(0, 24))], ['2017', '2018', '2019']]

        x = convert_to_categorical(x, categorical_variables, categories, need_pickup = need_pickup)

        
        # Make dummy variables with the categorical ones
        if need_pickup:
            begin = 0
        else:
            begin = 1
        for i in range(begin, len(categorical_variables)):
            x = dummie_and_drop(x, name = categorical_variables[i])
        
        
        
        y = df['Trips'].to_numpy()

        if need_pickup == False:
            # If we don't need the pickup, it means this is Neural Network case. Therefore we have to modify Y, in order
            # to have "n_areas" outputs per input (because there are "n_areas" regressions per input)
            x = x.groupby(by = 'trip_start_timestamp').mean()
            n_areas = 77
            y = np.reshape(y, [-1, n_areas]) # If 
        else:
            x.drop(columns = ['trip_start_timestamp'], inplace = True, axis = 1)
        
        if drop_correlated:
            x.drop(columns = ['Trips Last Week (Same Hour)'], inplace = True, axis = 1)
            x.drop(columns = ['Trips 2 Weeks Ago (Same Hour)'], inplace = True, axis = 1)

        x = x.to_numpy()
        
        return (x,y)   
    

    need_pickup = False 
    drop_correlated = False
    

    name_test = 'dataset_test.csv'
    x_test, y_test = load(name_test, date, need_pickup, drop_correlated)
    
    
    return (x_test, y_test)

In [4]:
date = '2019-11-12 14'
date_test = '2019-11-12 15' # The next hour
x_test, y_test = load_data(date)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
x_test[0].shape

(59,)

In [6]:
model.compile(optimizer = 'adam', loss = 'mse')


In [None]:
model.predict(x_test)

# Previous Disposition

In [3]:
df_taxi = pd.read_csv('may-nov19.csv')
date = '2019-11-12 14'
date_test = '2019-11-12 15'

In [4]:
# Take only the columns needed
df_taxi = df_taxi[['taxi_id', 'trip_end_timestamp','dropoff_community_area', 'trip_start_timestamp']]
# Modify Trip_end_timestamp and sort dataframe
# df_taxi['trip_end_timestamp'] = df_taxi['trip_end_timestamp'].str.slice(start= 0, stop = 19)
# df_taxi['trip_start_timestamp'] = df_taxi['trip_start_timestamp'].str.slice(start= 0, stop = 19)

df_busy = df_taxi[df_taxi['trip_start_timestamp'].str.slice(start = 0, stop = 13) == date]
df_arrive = df_taxi[(df_taxi['trip_start_timestamp'].str.slice(start = 0, stop = 16) == date_test + ':00') | (df_taxi['trip_start_timestamp'].str.slice(start = 0, stop = 16) == date_test + ':15')]
df_date = df_taxi[df_taxi['trip_end_timestamp'].str.slice(start = 0, stop = 13) == date]

df_date = df_date.sort_values(by = ['trip_end_timestamp'])


In [5]:
print('number of trips: ', df_date.shape[0])
print('number of different taxis: ', len(df_date['taxi_id'].unique()))
print('number of missing values: ', sum(df_date['dropoff_community_area'].isnull())) # Even if it is null, in a real case we would have the data
print()
print('------ Deleting missing values ------')
df_date = df_date.dropna()
print()
print('number of trips: ', df_date.shape[0])
print('number of different taxis: ', len(df_date['taxi_id'].unique()))
print('number of missing values: ', sum(df_date['dropoff_community_area'].isnull()))
print()
print('------ Deleting duplicated values ------')
print()
df_date.drop_duplicates(subset = 'taxi_id', keep = 'last', inplace = True)
print('number of trips: ', df_date.shape[0])
print('number of different taxis: ', len(df_date['taxi_id'].unique()))
print('number of missing values: ', sum(df_date['dropoff_community_area'].isnull()))

number of trips:  464
number of different taxis:  277
number of missing values:  77

------ Deleting missing values ------

number of trips:  387
number of different taxis:  228
number of missing values:  0

------ Deleting duplicated values ------

number of trips:  228
number of different taxis:  228
number of missing values:  0


In [6]:
# df_date.drop_duplicates(subset = 'taxi_id', keep = 'last', inplace = True)
df_busy.drop_duplicates(subset = 'taxi_id', keep = 'last', inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
index_delete = []

for i in range(0, len(df_date)):
    taxi = df_date[['taxi_id', 'trip_end_timestamp']].iloc[i]
#     print(taxi)
    for j in range(0, len(df_busy)):
        taxi_busy = df_busy[['taxi_id', 'trip_start_timestamp']].iloc[j]
        if (str(taxi_busy['taxi_id']) == str(taxi['taxi_id']) and (str(taxi_busy['trip_start_timestamp']) > str(taxi['trip_end_timestamp']))):
            index = taxi.name
            index_delete.append(index)
            break

In [8]:
df_date.drop(index_delete, inplace = True)
df_date

Unnamed: 0,taxi_id,trip_end_timestamp,dropoff_community_area,trip_start_timestamp
622029,180309cc29892cd16348ef74d7dd6f33af87e585a29b62...,2019-11-12 14:00:00 UTC,8.0,2019-11-12 13:45:00 UTC
621788,41296974f291ee1728c72308233ea977759ee4314b08c9...,2019-11-12 14:00:00 UTC,11.0,2019-11-12 13:45:00 UTC
1410653,1d6029b251dadc15c2cfb884ba055cef1b39e07432f06f...,2019-11-12 14:00:00 UTC,8.0,2019-11-12 14:00:00 UTC
616802,8d7a84d700f39f9b122b67d40075873e4de4a5f792b95d...,2019-11-12 14:00:00 UTC,32.0,2019-11-12 14:00:00 UTC
1427139,3cde16a0a5352e50a25619d54c0c3bb6343327bc47c97a...,2019-11-12 14:00:00 UTC,8.0,2019-11-12 13:45:00 UTC
...,...,...,...,...
749153,22b403ba1c4de25cb19049e9c08b6d1339a9eea3ec06f9...,2019-11-12 14:45:00 UTC,37.0,2019-11-12 14:30:00 UTC
747330,237a0889e04b6ab9609738d8b35485bce00cf600493d4e...,2019-11-12 14:45:00 UTC,33.0,2019-11-12 14:30:00 UTC
1390355,f184219b3bc8cf9ef5f1417bc53dad6e4e6babe987f67b...,2019-11-12 14:45:00 UTC,2.0,2019-11-12 14:00:00 UTC
1432785,937aef1d267e19d4c5760039dc79f4df2c977744679d14...,2019-11-12 14:45:00 UTC,32.0,2019-11-12 14:30:00 UTC


# Dataset df_taxi clean!

## Now we have to find the taxis that end

In [9]:
df_arrive.dropna(inplace = True)
df_arrive.drop_duplicates(subset = 'taxi_id', keep = 'first', inplace = True)

df_arrive.shape
# df_date = df_date.drop_duplicates(subset = 'taxi_id', keep = 'last')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(175, 4)

In [10]:
print('Ammount of taxis not moving: ', df_date['dropoff_community_area'].shape[0])
print('Ammount of taxis arriving: ', df_arrive['dropoff_community_area'].shape[0])
df_total = pd.concat([df_date, df_arrive])
df_total.groupby(by = 'dropoff_community_area').count()

Ammount of taxis not moving:  202
Ammount of taxis arriving:  175


Unnamed: 0_level_0,taxi_id,trip_end_timestamp,trip_start_timestamp
dropoff_community_area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,3,3,3
2.0,11,11,11
3.0,7,7,7
4.0,3,3,3
5.0,4,4,4
6.0,23,23,23
7.0,16,16,16
8.0,78,78,78
9.0,1,1,1
10.0,4,4,4


# TODO
- Take out from df_date all the taxis that are actually not available. How to do this:
    - Take all the taxis that are in the df_busy dataset.
    - If there are duplicates: Check if taxi gets busy after being free for the last time
- Put taxis that arrive to the community area before the half of the next hour

In [27]:
'2019-11-12 10:45' > '2019-11-12 10:45' # It recognizes this kind of notation!

False