In [58]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import geohash
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.model_selection import train_test_split

def zero_padding_timestamp(x):
    h,m = map(str,x.split(':'))
    h = h.zfill(2)
    m = m.rjust(2,'0')
    return h+":"+m
def feature_hour(x):
    h,m = map(str,x.split(':'))
    return float(h)
def feature_minute(x):
    h,m = map(str,x.split(':'))
    return float(m)
def feature_lat(x):
    return geohash.decode(x)[0]
def feature_long(x):
    return geohash.decode(x)[1]
def feature_weekday_weekend(x):
    if(x == 5 or x == 6):
        return 1
    return 0
def decomposeMinute(time):
    day = time // (24 * 60)
    time = time % (24 * 60)
    hour = time // 60
    minutes = time % 60
    return (int(day), int(hour), int(minutes))

"""
@type  trainingsetPath: string
@param trainingsetPath: absolute path of the input file
@type  outputsetPath: string
@param outputsetPath: absolute path of the output file
@type  debug: string
@param debug: print debugging statement if set to true
"""
def predictTestSet(trainingsetPath, outputsetPath, debug=True):
    #create the features for training data
    traffic_train = pd.read_csv(trainingsetPath)
    traffic_train = traffic_train.dropna()
    traffic_train['hour'] = traffic_train['timestamp'].apply(feature_hour)
    traffic_train['minute'] = traffic_train['timestamp'].apply(feature_minute)
    traffic_train['timestamp'] = traffic_train['timestamp'].apply(zero_padding_timestamp)
    traffic_train['lat'] = traffic_train['geohash6'].apply(feature_lat)
    traffic_train['long'] = traffic_train['geohash6'].apply(feature_long)
    traffic_train["day_of_week"] = traffic_train.apply(lambda x: x.day%7, axis=1)
    traffic_train['weekend'] = traffic_train['day_of_week'].apply(feature_weekday_weekend)
    traffic_train['datetime_in_mins'] = 24*60*(traffic_train['day']-1) + 60*traffic_train['hour'] + traffic_train['minute']
    
    T_train_max = traffic_train['datetime_in_mins'].max()
    T_train_max_day = traffic_train['day'].max()
    all_geohash = traffic_train['geohash6'].unique()
    
    if debug:
        print("datetime max is: ", T_train_max)
        print("day max is: ", T_train_max_day)
        print ("unique Geohash count:", len(all_geohash))
    
    #drop na rows
    traffic_train = traffic_train.dropna()
    #rearrange the columns
    traffic_train = traffic_train[['geohash6','day','timestamp','datetime_in_mins', 'hour','lat','long','day_of_week','weekend','demand']]
    traffic_train.head()
    
    #We take hour, lat, long, day_of_week and weekend for the training data
    traffic_data = traffic_train.iloc[:, 4:-1].values
    traffic_label = traffic_train.iloc[:, -1].values
    #create the model using the best result from our exploration
    rf_regressor = RandomForestRegressor(max_depth=40,  n_estimators=300)
    rf_regressor.fit(X=traffic_data,y=traffic_label)
    
    #Create prediction set from T+n, where n is 1 to 5 and each n is 15 minutes range.
    geohash6_arr = []
    day_arr = []
    timestamp_arr = []
    demand_arr = []
    
    #create T+15 to T+75(in minutes) for all geohash6 location
    T_plus_n = [T_train_max+15, T_train_max+30, T_train_max+45, T_train_max+60, T_train_max+75 ]
    for t in T_plus_n:
        print(t)
        for geohash in all_geohash:
            day, hour, minutes = decomposeMinute(t)
            geohash6_arr.append(geohash)
            day_arr.append(day)
            timestamp_arr.append(str(hour)+":"+str(minutes))
    
    traffic_test_set = pd.DataFrame({'geohash6': geohash6_arr, 'day': day_arr, 'timestamp': timestamp_arr})
    if debug:
        #print(geohash6_arr)
        #print(day_arr)
        #print(timestamp_arr)
        print()
    
    #create features for the test set
    traffic_test_set['hour'] = traffic_test_set['timestamp'].apply(feature_hour)
    traffic_test_set['minute'] = traffic_test_set['timestamp'].apply(feature_minute)
    traffic_test_set['timestamp'] = traffic_test_set['timestamp'].apply(zero_padding_timestamp)
    traffic_test_set['lat'] = traffic_test_set['geohash6'].apply(feature_lat)
    traffic_test_set['long'] = traffic_test_set['geohash6'].apply(feature_long)
    traffic_test_set["day_of_week"] = traffic_test_set.apply(lambda x: x.day%7, axis=1)
    traffic_test_set['weekend'] = traffic_test_set['day_of_week'].apply(feature_weekday_weekend)
    traffic_test_set['datetime_in_mins'] = 24*60*(traffic_test_set['day']-1) + 60*traffic_test_set['hour'] + traffic_test_set['minute']
    
    xTest = traffic_test_set.iloc[:, 4:-1].values
    
    y_reg_test_predict = rf_regressor.predict(xTest)
    print(y_reg_test_predict)
    
    #put the prediction result into the Panda dataframe
    traffic_test_set['demand'] = y_reg_test_predict
    
    #output the original dataset format to CSV
    traffic_test_set[['geohash6','day','timestamp','demand']].to_csv(outputsetPath, index=False)
    #print("RandomForestRegressor MSE:", mean_squared_error(yTest, y_reg_test_predict))


predictTestSet("/Users/andika/Desktop/PROJECT/MACHINELEARNING/datasets/housing/grab_train_toy.csv",
              "/Users/andika/Desktop/PROJECT/MACHINELEARNING/datasets/housing/predicted_output.csv")

#traffic_train = pd.read_csv("/Users/andika/Desktop/PROJECT/MACHINELEARNING/datasets/housing/grab_train_toy.csv")
#print (traffic_train['timestamp'].unique() )
#traffic_train = traffic_train.dropna()
#traffic_train['hour'] = traffic_train['timestamp'].apply(feature_hour)

datetime max is:  87825.0
day max is:  61
unique Geohash count: 1193
87840.0
87855.0
87870.0
87885.0
87900.0

[0.20478973 0.0330705  0.05785042 ... 0.02056688 0.01480476 0.03272401]


(60, 2, 15)
