In [1]:
import pylab
import pandas as pd
# Load the training events from a tab-separated file; "-" is read as a missing value.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0
# (its replacement is on_bad_lines="skip"); it is kept here unchanged.
df = pd.read_csv(
    "events_train.tsv",
    sep="\t",
    na_values=["-"],
    error_bad_lines=False,
)

In [2]:
# Load the prediction trials from a tab-separated file; "-" is read as a missing value.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0
# (its replacement is on_bad_lines="skip"); it is kept here unchanged.
df_prediction_trial = pd.read_csv(
    "prediction_trials.tsv",
    sep="\t",
    na_values=["-"],
    error_bad_lines=False,
)

In [3]:
def set_Start_Timestamp(row):
    """Pick the timestamp used for prediction from one event row.

    The columns are tried in priority order -- ``start_tstamp``, then
    ``confirmed_tstamp``, then ``created_tstamp`` -- and the first value that
    is not null is returned.  ``None`` is returned when all three are null.
    """
    for column in ("start_tstamp", "confirmed_tstamp", "created_tstamp"):
        candidate = row[column]
        if pd.notnull(candidate):
            return candidate
    return None

# Derive the prediction timestamp row by row, then mirror it into "count".
df["for_prediction_timestamp"] = df.apply(set_Start_Timestamp, axis=1)
df["count"] = df["for_prediction_timestamp"]

In [4]:
# Keep only the rows that have a prediction timestamp.
df = df[df["for_prediction_timestamp"].notnull()]

In [5]:
# Maps each supported event type name to its feature code, the strings "1" to "6".
event_types = {
    name: str(code)
    for code, name in enumerate(
        [
            "accidentsAndIncidents",
            "roadwork",
            "precipitation",
            "deviceStatus",
            "obstruction",
            "trafficConditions",
        ],
        start=1,
    )
}

In [6]:
def assign_Feature_Event_Type(row):
    """Return the feature code for the row's ``event_type``.

    The code is looked up in the module-level ``event_types`` mapping;
    ``None`` is returned for an event type that is not a key of that mapping.
    """
    return event_types.get(row["event_type"])
# Attach the event-type feature code to every row (None for unknown event types).
df["feature_event_type"] = df.apply(assign_Feature_Event_Type, axis=1)

In [7]:
# Keep only the rows whose event type received a feature code.
df = df[df["feature_event_type"].notnull()]

In [8]:
# Order the events by latitude, then longitude.
df = df.sort_values(by=["latitude", "longitude"])

In [9]:
# Order the prediction trials by their bounding-box corner coordinates.
df_prediction_trial = df_prediction_trial.sort_values(
    by=["se_lat", "nw_lon", "nw_lat", "se_lon"]
)

In [10]:
# Parse the trial window bounds into pandas datetimes so the `.dt` accessor can be used on them.
df_prediction_trial["start"] = pd.to_datetime(df_prediction_trial["start"])
df_prediction_trial["end"] = pd.to_datetime(df_prediction_trial["end"])

In [11]:
# Parse the event timestamps into pandas datetimes so the `.dt` accessor can be used on them.
df["for_prediction_timestamp"] = pd.to_datetime(df["for_prediction_timestamp"])
df["closed_tstamp"] = pd.to_datetime(df["closed_tstamp"])

In [12]:
# One filtered frame per event type.
df_acci_inci = df.loc[df["event_type"] == "accidentsAndIncidents"]
df_road_work = df.loc[df["event_type"] == "roadwork"]
df_precipitation = df.loc[df["event_type"] == "precipitation"]
df_deviceStatus = df.loc[df["event_type"] == "deviceStatus"]
df_obstruction = df.loc[df["event_type"] == "obstruction"]
df_trafficConditions = df.loc[df["event_type"] == "trafficConditions"]

# Lookup from event type name to its filtered frame.
event_types_df = dict(
    accidentsAndIncidents=df_acci_inci,
    roadwork=df_road_work,
    precipitation=df_precipitation,
    deviceStatus=df_deviceStatus,
    obstruction=df_obstruction,
    trafficConditions=df_trafficConditions,
)

In [13]:
# Split the prediction timestamp into the month and year columns.
df['month'] = df['for_prediction_timestamp'].dt.month
df['year'] = df['for_prediction_timestamp'].dt.year

In [17]:
import time
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
import numpy as np

# Counter that train_regression_model increments on every call that gets past
# its input checks; it starts at 1.
reg_iter = 1
# Mean squared errors appended by train_regression_model, one list per model.
total_error_reg, total_error_rbf, total_error_poly, total_error_svr = [], [], [], []

def train_regression_model(x_train, y_train, x_test, y_test, x_predict):
    """Fit four regressors, log their test errors, and predict with the SVR.

    A LinearRegression, two KernelRidge models (``rbf`` and ``sigmoid``
    kernels) and an SVR are fitted on ``x_train``/``y_train``.  The mean
    squared error of each on ``x_test``/``y_test`` is appended to the
    module-level lists ``total_error_reg``, ``total_error_rbf``,
    ``total_error_poly`` and ``total_error_svr`` respectively, and the
    module-level counter ``reg_iter`` is incremented.

    Note that the errors logged in ``total_error_poly`` come from the
    sigmoid-kernel model.

    Returns:
        The SVR's predictions for ``x_predict``; or ``np.zeros(2)``, without
        fitting anything, when there are fewer than two training rows, no
        test rows, or the feature and target lengths disagree.
    """
    global reg_iter

    inputs_usable = (
        len(x_train) >= 2
        and len(x_test) != 0
        and len(x_train) == len(y_train)
        and len(x_test) == len(y_test)
    )
    if not inputs_usable:
        return np.zeros(2)

    reg_iter += 1

    # Order matters below: it pairs each model with its error log, and the
    # SVR (last) is the model used for the returned prediction.
    models = [
        LinearRegression(),
        KernelRidge(alpha=0.05, kernel='rbf'),
        KernelRidge(alpha=0.05, kernel='sigmoid'),
        SVR(C=2000.0, epsilon=0.2),
    ]
    for model in models:
        model.fit(x_train, y_train)

    test_predictions = [model.predict(x_test) for model in models]

    error_logs = (total_error_reg, total_error_rbf, total_error_poly, total_error_svr)
    for predicted, error_log in zip(test_predictions, error_logs):
        error_log.append(np.mean((predicted - y_test) ** 2))

    return models[-1].predict(x_predict)

In [15]:
# `df_event = df` binds a second name to the same DataFrame (no copy is made),
# so every `del` below also removes that column from `df`.
# NOTE(review): because the columns are deleted in place, running this cell a
# second time raises KeyError -- confirm whether a copy / df.drop(...) was intended.
df_event = df
del df_event["event_id"]
del df_event["event_description"]
del df_event["start_tstamp"]
del df_event["confirmed_tstamp"]
del df_event["created_tstamp"]
del df_event["event_subtype"]
del df_event["location"]
del df_event["number_of_responders"]
del df_event["lanes_affected"]
del df_event["feature_event_type"]

In [16]:
# Month of the trial window's start ("month1") and end ("month2").
df_prediction_trial['month1'] = df_prediction_trial['start'].dt.month
df_prediction_trial['month2'] = df_prediction_trial['end'].dt.month

# Year of the trial window's start ("year1") and end ("year2").
df_prediction_trial['year1'] = df_prediction_trial['start'].dt.year
df_prediction_trial['year2'] = df_prediction_trial['end'].dt.year


In [20]:
import time
import math
from numpy import array

# Module-level state for the prediction run in this cell.
lat_Long_Dict = dict()
# Rows processed so far, and the wall-clock time at which the current timing window began.
count, start = 0, time.time()
# One list of predicted counts per processed prediction-trial row.
output_arr = list()
# Start the per-model error logs from empty for this run.
total_error_reg, total_error_rbf, total_error_poly, total_error_svr = [], [], [], []

def create_Lat_Long_Dict(row):
    """Predict per-event-type counts for one prediction-trial row.

    The events in ``df_event`` that lie strictly inside the row's bounding box
    (``se_lat`` < latitude < ``nw_lat`` and ``nw_lon`` < longitude < ``se_lon``)
    are split by event type and counted per (year, month).  For each event
    type the counts from 2014 are used as the test set and all other years as
    the training set for ``train_regression_model``.  That model is asked to
    predict for the row's (month1, year1) and (month2, year2); the mean of the
    two predictions, rounded up, is the predicted count (0 when the mean is
    not positive or when the event type has no data).

    Side effects:
        Appends one list of six counts (in the order accidentsAndIncidents,
        roadwork, precipitation, deviceStatus, obstruction, trafficConditions)
        to the module-level ``output_arr``; increments the module-level
        ``count`` and, after every 100th row, prints the row count and the
        elapsed seconds and restarts the module-level ``start`` timer.

    Changes from the previous version:
        * The leftover debug dump to "test.tsv" was removed; its filter
          expression was incomplete and made the whole cell a SyntaxError.
        * ``x_predict`` is built directly, without ``DataFrame.append``
          (removed in pandas 2.0), and uses the same column names as the
          training features.
        * A bounding box that contains no events now yields six zeros instead
          of an empty list, so every row of ``output_arr`` has the same length.

    Returns:
        None.
    """
    global count
    global start

    event_type_names = (
        "accidentsAndIncidents",
        "roadwork",
        "precipitation",
        "deviceStatus",
        "obstruction",
        "trafficConditions",
    )
    feature_columns = ['month', 'year']

    # Two rows to predict for: the start and the end of the trial window.
    x_predict = pd.DataFrame(
        [[row["month1"], row["year1"]], [row["month2"], row["year2"]]],
        columns=feature_columns,
    )

    inside_box = (
        (row["se_lat"] < df_event["latitude"])
        & (row["nw_lat"] > df_event["latitude"])
        & (row["se_lon"] > df_event["longitude"])
        & (row["nw_lon"] < df_event["longitude"])
    )
    m = df_event[inside_box]

    if len(m) == 0:
        # Nothing to learn from in this box: predict zero for every event type.
        intermediate_output = [0] * len(event_type_names)
    else:
        intermediate_output = []
        for event_type_name in event_type_names:
            monthly_counts = (
                m[m["event_type"] == event_type_name]
                .groupby(["year", "month"])["count"]
                .count()
                .reset_index()
            )
            if len(monthly_counts) == 0:
                intermediate_output.append(0)
                continue

            # 2014 is held out as the test year; every other year is training data.
            data_test = monthly_counts[monthly_counts['year'] == 2014]
            data_train = monthly_counts[monthly_counts['year'] != 2014]
            y = train_regression_model(
                data_train[feature_columns],
                data_train['count'],
                data_test[feature_columns],
                data_test['count'],
                x_predict,
            )
            mean_y = np.mean(y, axis=0)
            intermediate_output.append(math.ceil(mean_y) if mean_y > 0 else 0)

    output_arr.append(intermediate_output)

    count = count + 1
    if count % 100 == 0:
        end = time.time()
        print(str(count) + '\t' + str(end - start))
        start = time.time()

# Run the prediction; each call appends its result to `output_arr`.
# NOTE(review): `.head(1)` means only the first prediction-trial row is processed
# here -- confirm whether the full frame was intended.
test = df_prediction_trial.head(1).apply(lambda row:create_Lat_Long_Dict(row), axis= 1)
# Write the collected predictions as a tab-separated file without index or header.
output_arr_final = array(output_arr) 
df_final_output = pd.DataFrame(output_arr_final)
df_final_output.to_csv("svr_model_RegressionOutput.tsv",sep="\t", mode='w', index=False, header=None)

# Each printed value is the square root of the mean of the per-call mean squared
# errors logged by train_regression_model, although the labels say "mse".
# The 'poly' line reports the KernelRidge model fitted with kernel='sigmoid'.
print('linear reg mse : ' + str(math.sqrt(np.mean(total_error_reg,axis=0))))
print('rbf reg mse : ' + str(math.sqrt(np.mean(total_error_rbf,axis=0))))
print('poly reg mse : ' + str(math.sqrt(np.mean(total_error_poly,axis=0))))
print('svm reg mse : ' + str(math.sqrt(np.mean(total_error_svr,axis=0))))

linear reg mse : 6.944399081371148
rbf reg mse : 8.44736268498726
poly reg mse : 8.489400909696874
svm reg mse : 7.785465494681817


In [None]:
import keras

In [None]:
def larger_model():
    """Build and compile a small fully connected regression network.

    Layers: Dense(13, relu) on 13 inputs -> Dense(6, relu) -> Dense(1), each
    with 'normal' initialisation.  The model is compiled with the
    mean-squared-error loss and the Adam optimizer.

    Returns:
        The compiled Sequential model.
    """
    # `Sequential` and `Dense` are not imported as bare names in the visible
    # notebook, so they are reached through the imported `keras` package.
    # NOTE(review): `init=` is the Keras 1 spelling of `kernel_initializer=` --
    # confirm that the installed Keras version still accepts it.
    model = keras.models.Sequential()
    model.add(keras.layers.Dense(13, input_dim=13, init='normal', activation='relu'))
    model.add(keras.layers.Dense(6, init='normal', activation='relu'))
    model.add(keras.layers.Dense(1, init='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

def keras_model():
    """Cross-validate the ``larger_model`` network inside a scaling pipeline.

    Builds a pipeline of StandardScaler followed by a KerasRegressor wrapping
    ``larger_model``, scores it with 2-fold cross-validation on the
    module-level ``X`` and ``Y``, and prints the mean and standard deviation
    of the fold scores.

    NOTE(review): ``X`` and ``Y`` are not defined anywhere in the visible
    notebook; they must exist as globals before this function is called.
    """
    # Imported locally because none of these names is imported in the visible notebook.
    from sklearn.model_selection import KFold, cross_val_score
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from keras.wrappers.scikit_learn import KerasRegressor

    # `seed` was referenced below without ever being defined, and the seeding
    # call used `numpy`, while the notebook imports the module as `np`.
    seed = 7
    np.random.seed(seed)
    estimators = []
    estimators.append(('standardize', StandardScaler()))
    estimators.append(('mlp', KerasRegressor(build_fn=larger_model, nb_epoch=50, batch_size=5, verbose=0)))
    pipeline = Pipeline(estimators)
    # NOTE(review): newer scikit-learn releases reject `random_state` when
    # `shuffle` is False -- confirm against the installed version.
    kfold = KFold(n_splits=2, random_state=seed)
    results = cross_val_score(pipeline, X, Y, cv=kfold)
    print("Larger: %.2f (%.2f) MSE" % (results.mean(), results.std()))

In [23]:
from datetime import date, datetime, timedelta

def perdelta(start, end, delta):
    """Yield ``start``, ``start + delta``, ``start + 2 * delta``, ... while the value is below ``end``.

    ``end`` itself is never yielded; nothing is yielded when ``start`` is not
    below ``end``.
    """
    position = start
    while True:
        if not position < end:
            return
        yield position
        position += delta

# Print every hourly timestamp from 2011-10-10 00:00 up to, but not including, 2011-12-12 00:00.
for result in perdelta(
    datetime(2011, 10, 10),
    datetime(2011, 12, 12),
    timedelta(hours=1),
):
    print(result)

2011-10-10 00:00:00
2011-10-10 01:00:00
2011-10-10 02:00:00
2011-10-10 03:00:00
2011-10-10 04:00:00
2011-10-10 05:00:00
2011-10-10 06:00:00
2011-10-10 07:00:00
2011-10-10 08:00:00
2011-10-10 09:00:00
2011-10-10 10:00:00
2011-10-10 11:00:00
2011-10-10 12:00:00
2011-10-10 13:00:00
2011-10-10 14:00:00
2011-10-10 15:00:00
2011-10-10 16:00:00
2011-10-10 17:00:00
2011-10-10 18:00:00
2011-10-10 19:00:00
2011-10-10 20:00:00
2011-10-10 21:00:00
2011-10-10 22:00:00
2011-10-10 23:00:00
2011-10-11 00:00:00
2011-10-11 01:00:00
2011-10-11 02:00:00
2011-10-11 03:00:00
2011-10-11 04:00:00
2011-10-11 05:00:00
2011-10-11 06:00:00
2011-10-11 07:00:00
2011-10-11 08:00:00
2011-10-11 09:00:00
2011-10-11 10:00:00
2011-10-11 11:00:00
2011-10-11 12:00:00
2011-10-11 13:00:00
2011-10-11 14:00:00
2011-10-11 15:00:00
2011-10-11 16:00:00
2011-10-11 17:00:00
2011-10-11 18:00:00
2011-10-11 19:00:00
2011-10-11 20:00:00
2011-10-11 21:00:00
2011-10-11 22:00:00
2011-10-11 23:00:00
2011-10-12 00:00:00
2011-10-12 01:00:00
