#### Read CSV file into DataFrame and extract time components

In [1]:
def readCSV(infile): 
    """
    Description: 
        1. Reads the input CSV file into a Pandas DataFrame (all columns as object dtype),
           silently skipping malformed lines.
        2. Parses the ACC_DATE and ACC_TIME columns and extracts the calendar date,
           the hour of the day and the day of the week for each observation.
    Arguments:
        infile: String value of input CSV file name.
    Returns:
        dataframe: Pandas DataFrame of observations with three extracted columns:
            NEW_DATE (date part of ACC_DATE), NEW_TIME (hour part of ACC_TIME)
            & DAY_WEEK (day of week of ACC_DATE; Monday=0, Sunday=6).
    """
    
    import pandas as pd
    
    try:
        # on_bad_lines='skip' (pandas >= 1.3) drops malformed lines without warning.
        dataframe = pd.read_csv(infile, dtype=object, on_bad_lines='skip', low_memory=False)
    except TypeError:
        # Older pandas rejects the unknown keyword; fall back to the legacy flags,
        # which were removed in pandas 2.0.
        dataframe = pd.read_csv(infile, dtype=object, error_bad_lines=False, 
                                warn_bad_lines=False, low_memory=False)
    
    # Parse each source column only once.
    acc_date = pd.DatetimeIndex(dataframe['ACC_DATE'])
    acc_time = pd.DatetimeIndex(dataframe['ACC_TIME'])
    
    # Attach the derived values positionally by reusing the frame's own index.
    dataframe['NEW_DATE'] = pd.Series(acc_date.date, index=dataframe.index)
    dataframe['NEW_TIME'] = pd.Series(acc_time.hour, index=dataframe.index)
    dataframe['DAY_WEEK'] = pd.Series(acc_date.dayofweek, index=dataframe.index) # Monday=0, Sunday=6
    
    return dataframe

#### Generate accidents count data set by grid cell

In [2]:
def countData(dataframe): 
    """
    Description: 
        Generates an array of cell id, time of the day (0-23) and count of accidents per cell.
        Every cell that appears in the input gets one row for each of the 24 hours
        (count 0 when no accident was observed), in ascending hour order.
    Arguments:
        dataframe: Input Pandas DataFrame of observations; must contain the columns
            'id' (cell id) and 'NEW_TIME' (hour of the observation).
    Returns:
        arr: NumPy object array with 3 columns of cell id, time of the day & count 
            (of accidents per cell). Cells are ordered by first appearance in the input.
    """
    
    import numpy as np 
    
    counts = {}
    seen_cells = set()
    
    # Count from the DataFrame passed in, not from a global.
    for cell, hour in zip(dataframe['id'], dataframe['NEW_TIME']):
        if cell not in seen_cells:
            # First sighting of this cell: create its 24 hourly slots once, in hour order.
            seen_cells.add(cell)
            for slot in range(24):
                counts[(cell, slot)] = 0
        
        # An hour outside 0-23 (e.g. a missing value) gets its own extra slot.
        counts[(cell, hour)] = counts.get((cell, hour), 0) + 1
    
    # convert dictionary to array (dicts preserve insertion order)
    arr = np.empty((len(counts), 3), dtype=object)
    
    for idx, ((cell, hour), count) in enumerate(counts.items()):
        arr[idx, 0] = cell
        arr[idx, 1] = hour
        arr[idx, 2] = count
    
    return arr

#### Build time series and prepare dataset for supervised learning

In [3]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Description: 
        Frames a time series as a supervised learning dataset by placing lagged copies
        of the series (inputs) next to current/future copies (outputs).
    Arguments:
        data: Sequence of observations as a list or NumPy array (1-D for a single
            variable, 2-D with one column per variable).
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning, with columns named
        'var<j>(t-<i>)', 'var<j>(t)' and 'var<j>(t+<i>)'.
    """
    
    from pandas import DataFrame
    from pandas import concat
    
    df = DataFrame(data)
    # Count variables on the constructed frame so that flat lists, nested lists,
    # 1-D arrays and 2-D arrays are all handled the same way.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # shifting introduces NaN at the edges; optionally drop those incomplete rows
    if dropnan:
        agg = agg.dropna()
    return agg

#### Train and select a Random Forest model for a grid cell using walk-forward validation

In [4]:
def walkForwardValidate(dataframe): 
    """
    Description: 
        Performs walk-forward validation on a supervised time series data set: for every
        step i in the last two thirds of the records, Random Forest models are trained on
        rows 0..i-1 and evaluated on row i, over a small grid of hyperparameters. 
        The model with the lowest RMSE over all steps is kept. 
    Arguments:
        dataframe: Pandas DataFrame of observations; the last column is the target (y)
            and all preceding columns are the inputs (X).
    Returns:
        model: A dictionary containing the estimator object (RF) under 'estimator', its
            RMSE under 'RMSE' and the size of its training dataset under 'train_size'.
            Empty if the DataFrame has no rows to evaluate.
    """
    
    import numpy as np 
    from math import sqrt
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error
    
    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on old and new versions.
    # Converting once also avoids re-materialising the matrix at every step.
    values = dataframe.values
    
    # The first third of the records is always used for training
    n_records = values.shape[0]
    n_train = max(n_records // 3, 1)  # never fit on an empty training set
    
    # initialize: infinity guarantees that the first evaluated model is recorded
    min_rmse = float('inf')
    model = {}
    
    for i in range(n_train, n_records):
        X_train, y_train = values[:i, :-1], values[:i, -1]
        X_test, y_test = values[i:i + 1, :-1], values[i:i + 1, -1]
        
        # tune hyperparameters
        for n_estimators in np.arange(10, 21): # changed upper limit from 31 to 21
            for max_depth in np.arange(1, 5):  # changed upper limit from 7 to 5
                
                # train Random Forest models
                estimator = RandomForestRegressor(n_estimators=n_estimators, 
                                                  max_depth=max_depth, 
                                                  random_state=313, 
                                                  n_jobs=-1)
                estimator.fit(X_train, y_train)
                
                # calculate performance metric (RMSE on the single held-out row)
                y_pred = estimator.predict(X_test)
                rmse = sqrt(mean_squared_error(y_test, y_pred))
                
                # compare model RMSE with existing minimum and save the best model
                # (strict '<' keeps the earliest model on ties)
                if rmse < min_rmse: 
                    min_rmse = rmse
                    model['estimator'] = estimator
                    model['RMSE'] = min_rmse
                    model['train_size'] = int(len(X_train))
                    
    return model    

#### Generate predictor objects for the selected subset of grid cells

In [5]:
def cellEstimator(observationDict, outfile='cellModels.txt'): 
    """
    Description: 
        Fits an estimator model to the time series dataset of every grid cell and 
        stores the results in a binary (pickle) file. 
    Arguments:
        observationDict: Dictionary containing observation DataFrames for all cells 
            (cell id as key). 
        outfile: String value of the file the results are written to 
            (default='cellModels.txt'). 
    Returns: 
        None. As a side effect, one pickled tuple (cell id, model dictionary) per cell is 
        appended to outfile, where the model dictionary is the result of the 
        walkForwardValidate function. 
    """
    
    import pickle

    # The 'with' block closes the file; no explicit close() is needed.
    with open(outfile, 'wb') as fout:

        for cell, tsDataFrame in observationDict.items(): 

            # build the estimator
            cmodel = walkForwardValidate(tsDataFrame)

            # save results to file, one record per cell so the file can be read back
            # record by record
            pickle.dump((cell, cmodel), fout)


#### Read cell estimator file into a dictionary

In [6]:
def readCellModels(infile='cellModels.txt'): 
    """
    Description: 
        Reads binary file storing best estimator model for each cell into a dictionary. 
        The file is expected to hold a sequence of pickled records of the form 
        (cell id, model dictionary), as written by cellEstimator(). 
    Arguments: 
        infile: String value of file name where results from cellEstimator() are stored (default='cellModels.txt'). 
    Returns: 
        cellModels: Dictionary with grid cell id as key and RF model as value. 
    """
    
    import pickle
    
    cellModels = {}

    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(infile, 'rb') as fin:
        while True:
            try:
                # unpickle (load) the next record from the input file
                record = pickle.load(fin)
            except EOFError:
                # end of file reached: all records have been read
                break

            # save estimator to new dictionary with cell id as key
            cellModels[record[0]] = record[1]['estimator']
    
    return cellModels

#### Build prediction output data set

In [7]:
def gridPredictor(dataDict, estimatatorDict, hour):
    """
    Description:
        Predicts number of accidents in each grid for a given hour and writes the results
        to 'Predicted_Counts.csv' in the current working directory. 
    Arguments:
        dataDict: A Dictionary with grid cell id as key and observations DataFrame as value.
            For each cell, the row whose index label equals hour is selected and its first
            3 columns are passed to the model as input.
        estimatatorDict: A Dictionary with grid cell id as key and fitted model (any object 
            with a predict() method, e.g. the RF model) as value. 
        hour: Hour of the day (0-23) at which prediction will be made.
    Returns:
        None. As a side effect, a CSV file with the columns 'cell' and 'pred_count' 
        (prediction rounded up to the next integer) is written. ValueError and MemoryError
        are reported on standard output instead of being raised; rows written before the
        error remain in the file.
    """
    
    import numpy as np
    import csv 
    
    # newline='' lets the csv module control line endings (avoids blank rows on Windows);
    # the 'with' block closes the file.
    with open('Predicted_Counts.csv', 'w', newline='') as outfile: 
        # create writer object
        writer = csv.writer(outfile)
        
        # write column names
        writer.writerow(['cell', 'pred_count'])
        
        try: 
            if hour not in range(24): 
                raise ValueError('Input hour value is out of range (0 to 23).')

            for cell, frame in dataDict.items():
                # extract input data set from grid dictionary
                observed = frame.iloc[frame.index == hour, :3]

                # run prediction algorithm
                model = estimatatorDict[cell]
                predicted = np.ceil(model.predict(observed)) 

                # save results to outfile; predict() returns an array, so take its
                # first element explicitly rather than calling int() on the array
                writer.writerow([str(cell), int(predicted[0])])

        # Python 3 exceptions have no .message attribute; format the exception itself
        except ValueError as e:
            print('ValueError: %s' % e)
        except MemoryError as e:
            print('MemoryError: %s' % e)


#### Generate observations DataFrame

In [8]:
# Load the raw observations and derive the NEW_DATE, NEW_TIME and DAY_WEEK columns
df = readCSV(infile='ALL_JOIN_FIPS1900_Join_GRID.csv')

# Keep only the observations inside the municipal boundary of Baltimore (MUNI_CODE = '999');
# MUNI_CODE is compared as a string because readCSV loads every column as object dtype
df = df[df['MUNI_CODE'] == '999']

#### Generate accidents count array & save to CSV file

In [9]:
# csv is used further down, when the observed counts are written to a CSV file
import csv 

# Aggregate the filtered observations into an array of (cell id, hour, count) rows
data = countData(df)

#### Build a dictionary of all cells in the grid with associated supervised learning data sets

In [10]:
# Map every cell id to its supervised-learning frame: 3 lagged counts as inputs
# and the current count as output
grid_ts = {}

# data holds many rows per cell, so visit each distinct cell id only once
# (dict.fromkeys keeps the order of first appearance)
for cell in dict.fromkeys(data[:, 0]):
    cell_counts = list(data[data[:, 0] == cell, 2])
    grid_ts[cell] = series_to_supervised(data=cell_counts, n_in=3, n_out=1)


### <font color=red>SKIP THIS BLOCK</font>
#### <font color=green>Results are provided in cellModels.txt file and loaded in the next block.</font>

In [17]:
# train estimator models by cell
# cellEstimator(observationDict=grid_ts)

#### Read cell estimator file into a dictionary

In [11]:
# Load the pre-trained estimators: dictionary with cell id as key and model as value
cellDict = readCellModels(infile='cellModels.txt')

#### Predict number of accidents for each cell at a specific hour

In [24]:
# Predict the accident count of every cell in grid_ts for hour 17;
# the results are written to Predicted_Counts.csv
gridPredictor(dataDict=grid_ts, estimatatorDict=cellDict, hour=17)

#### Extract observed counts for hour = 17

In [None]:
# newline='' lets the csv module control line endings (avoids blank rows on Windows)
with open("Observed_Counts_17.csv", "w", newline="") as outfile: 
    # writer object
    writer = csv.writer(outfile)
    
    # write column names
    writer.writerow(['cell','hour','count'])
    
    # save the rows of the counts array whose hour column equals 17 to .CSV file
    writer.writerows(data[data[:,1] == 17, :])