In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)

In [26]:

def load_data():
    accidents_original = pd.read_csv("datasets/dftRoadSafety_Accidents_2016.csv",
                                     #index_col = "Accident_Index",
                                     low_memory = False)
    
    accidents_original = accidents_original.reindex(np.random.permutation(accidents_original.shape[0]))
    print("dataset shape : " + str(accidents_original.shape))
    #print(accidents_original.head())
    return accidents_original


In [27]:
def preprocess_features(accidents_original):
    features = accidents_original.filter(
        ['Day_of_Week', 'Road_Type', 'Speed_limit', 'Junction_Detail', 'Junction_Control', 'Light_Conditions',
         'Weather_Conditions', 'Road_Surface_Conditions'], axis=1)
    
    features['Speed_limit'] = features['Speed_limit']/10

    cols_to_transform = ['Day_of_Week', 'Road_Type', 'Speed_limit', 'Junction_Detail', 'Junction_Control',
                         'Light_Conditions', 'Weather_Conditions', 'Road_Surface_Conditions']

    features = pd.get_dummies(features, columns = cols_to_transform)
    return features

def preprocess_targets(accidents_original):
    targets = accidents_original.filter(['Accident_Severity'], axis=1)
    targets = targets - 1
    print("unique valus in target : " + str(targets['Accident_Severity'].unique()))
    return targets

In [28]:
def my_input_fn(features, targets, batch_size=1000, shuffle=None, num_epochs=None):
    features = {key: np.array(value) for key, value in dict(features).items()}

    ds = Dataset.from_tensor_slices((features, targets))
    
    ds = ds.batch(batch_size).repeat(num_epochs)
    
    if shuffle:
        ds = ds.shuffle(buffer_size=100000)

    features, labels = ds.make_one_shot_iterator().get_next()
    
    return features, labels

In [29]:
def split_into_train_test(accidents_features, accidents_targets, test_size):
    features_train, fetures_test, targets_train, targets_test = train_test_split(accidents_features, accidents_targets, test_size=test_size)
    print("features-training shape : " + str(features_train.shape))
    print("targets-training shape : " + str(targets_train.shape))
    print("features-test shape : " + str(fetures_test.shape))
    print("targets-test shape : " + str(targets_test.shape))
    return features_train, fetures_test, targets_train, targets_test 


In [30]:
def construct_feature_column(features):
    return set([tf.feature_column.numeric_column(f) for f in features])


In [31]:

def get_optimizer(optimizer, learning_rate):
    if optimizer == "GD":
        print("initilizing GD Optimizer")
        my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    elif optimizer == "ADAGRAD":
        print("initializing ADAGRAD")
        my_optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate)
    elif optimizer == "ADAM":
        print("initializing ADAM")
        my_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

    return tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)


def construct_model(features_train, hidden_units, model, my_optimizer):
    feature_columns = construct_feature_column(features_train)
    if model == "linear":
        print("initializing linear classfier")
        classifier = tf.estimator.LinearClassifier(feature_columns=feature_columns,
                                                   n_classes=3,
                                                   optimizer=my_optimizer)
    elif model == "DNN":
        print("initializing DNN classifier")
        classifier = tf.estimator.DNNClassifier(feature_columns=feature_columns,
                                                n_classes=3,
                                                hidden_units=hidden_units,
                                                optimizer=my_optimizer)
    return classifier


def plot_log_loss(training_log_losses, validation_log_losses):
    plt.title("LogLoss vs Iterations")
    plt.xlabel("Iterations")
    plt.ylabel("LogLoss")
    plt.tight_layout()
    plt.plot(training_log_losses, label="training")
    plt.plot(validation_log_losses, label="validation")
    
    
def train_model(training_features, training_targets,
                validation_features, validation_targets,
                model="linear", optimizer="GD", hidden_units=None,
                learning_rate=None, steps=None, batch_size=None):
    iterations = 10
    steps_per_iteration = steps / iterations
    print("steps per iteration : " + str(steps_per_iteration))
    
    my_optimizer = get_optimizer(optimizer, learning_rate)

    classifier = construct_model(training_features, hidden_units, model, my_optimizer)

    training_input_fn = lambda : my_input_fn(training_features, training_targets['Accident_Severity'], 
                                             batch_size=batch_size, shuffle=False)
    
    predict_training_input_fn = lambda  : my_input_fn(training_features, training_targets['Accident_Severity'], 
                                                      num_epochs=1, shuffle=False)
    
    predict_validation_input_fn = lambda : my_input_fn(validation_features, validation_targets['Accident_Severity'], 
                                                       num_epochs=1, shuffle=False)
    
    print("training model...")
    training_log_losses = []
    validation_log_losses = []
    for iteration in range(iterations):
        classifier.train(input_fn=training_input_fn, steps=steps_per_iteration)
        
        training_predictions = classifier.predict(input_fn=predict_training_input_fn)
        training_predictions = np.array([item['probabilities'] for item in training_predictions])
        training_log_loss = metrics.log_loss(training_targets, training_predictions)
        
        # validation_predictions = classifier.predict(input_fn=predict_validation_input_fn)
        # validation_predictions = np.array([item['probabilities'] for item in validation_predictions])
        # validation_log_loss = metrics.log_loss(validation_targets, validation_predictions)

        print("  iteration %02d : %0.2f" % (iteration, training_log_loss))
        # # Add the loss metrics from this period to our list.
        # training_log_losses.append(training_log_loss)
        # validation_log_losses.append(validation_log_loss)

    print("training model finished")
    plot_log_loss(training_log_losses, validation_log_losses)
    return classifier


In [32]:

#entry point
accidents_original = load_data()

accidents_features = preprocess_features(accidents_original)

accidents_targets = preprocess_targets(accidents_original)

print("features : " + str(list(accidents_features.columns.values)))
print("target : " + str(list(accidents_targets.columns.values)))

training_features, validation_features, training_targets, validation_targets = \
    split_into_train_test(accidents_features, accidents_targets, 0.2)


dataset shape : (136621, 32)
unique valus in target : [2 1 0]
features : ['Day_of_Week_1', 'Day_of_Week_2', 'Day_of_Week_3', 'Day_of_Week_4', 'Day_of_Week_5', 'Day_of_Week_6', 'Day_of_Week_7', 'Road_Type_-1', 'Road_Type_1', 'Road_Type_2', 'Road_Type_3', 'Road_Type_6', 'Road_Type_7', 'Road_Type_9', 'Speed_limit_2.0', 'Speed_limit_3.0', 'Speed_limit_4.0', 'Speed_limit_5.0', 'Speed_limit_6.0', 'Speed_limit_7.0', 'Junction_Detail_-1', 'Junction_Detail_0', 'Junction_Detail_1', 'Junction_Detail_2', 'Junction_Detail_3', 'Junction_Detail_5', 'Junction_Detail_6', 'Junction_Detail_7', 'Junction_Detail_8', 'Junction_Detail_9', 'Junction_Control_-1', 'Junction_Control_0', 'Junction_Control_1', 'Junction_Control_2', 'Junction_Control_3', 'Junction_Control_4', 'Light_Conditions_-1', 'Light_Conditions_1', 'Light_Conditions_4', 'Light_Conditions_5', 'Light_Conditions_6', 'Light_Conditions_7', 'Weather_Conditions_-1', 'Weather_Conditions_1', 'Weather_Conditions_2', 'Weather_Conditions_3', 'Weather_Cond

In [21]:
print(accidents_features.describe())


       Day_of_Week_1  Day_of_Week_2  Day_of_Week_3  Day_of_Week_4  \
count  136621.000000  136621.000000  136621.000000  136621.000000   
mean        0.109968       0.141091       0.147152       0.152766   
std         0.312852       0.348117       0.354258       0.359763   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       0.000000       0.000000   
50%         0.000000       0.000000       0.000000       0.000000   
75%         0.000000       0.000000       0.000000       0.000000   
max         1.000000       1.000000       1.000000       1.000000   

       Day_of_Week_5  Day_of_Week_6  Day_of_Week_7   Road_Type_-1  \
count  136621.000000  136621.000000  136621.000000  136621.000000   
mean        0.153337       0.165267       0.130419       0.000007   
std         0.360313       0.371423       0.336765       0.002705   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.0000

In [34]:

linear_classifier = train_model(training_features, training_targets,
                                validation_features, validation_targets,
                                model="linear", hidden_units= [20,10], optimizer="ADAM",
                                learning_rate=0.01, steps=1000, batch_size=1000)

#predict_measure(linear_classifier, fetures_test, targets_test)


steps per iteration : 100.0
initializing ADAM
initializing linear classfier
training model...


  iteration 00 : 0.49


  iteration 01 : 0.49


  iteration 02 : 0.49


  iteration 03 : 0.49


  iteration 04 : 0.49


  iteration 05 : 0.49


  iteration 06 : 0.49


  iteration 07 : 0.49


  iteration 08 : 0.49


  iteration 09 : 0.49
training model finished


In [37]:
predict_validation_input_fn = lambda : my_input_fn(validation_features, validation_targets['Accident_Severity'],
                                                   num_epochs=1, shuffle=False)

evaluation_metrics = linear_classifier.evaluate(input_fn=predict_validation_input_fn)
print(evaluation_metrics)
#print("AUC on the validation set: %0.2f" % evaluation_metrics['auc'])
print("Accuracy on the validation set: %0.2f" % evaluation_metrics['accuracy'])

{'accuracy': 0.8277402, 'average_loss': 0.4958493, 'loss': 483.89578, 'global_step': 1000}


KeyError: 'auc'

In [None]:

dnn_classifier = train_model(training_features, training_targets,
                                validation_features, validation_targets,
                                model="DNN", hidden_units= [20,10], optimizer="ADAM",
                                learning_rate=0.01, steps=1000, batch_size=1000)


{'accuracy': 0.8277402, 'average_loss': 0.49768254, 'loss': 485.68484, 'global_step': 1000}


KeyError: 'auc'

In [39]:

predict_validation_input_fn = lambda : my_input_fn(validation_features, validation_targets['Accident_Severity'],
                                                   num_epochs=1, shuffle=False)

evaluation_metrics = dnn_classifier.evaluate(input_fn=predict_validation_input_fn)
print(evaluation_metrics)
print("AUC on the validation set: %0.2f" % evaluation_metrics['auc'])
print("Accuracy on the validation set: %0.2f" % evaluation_metrics['accuracy'])

{'accuracy': 0.8277402, 'average_loss': 0.49768254, 'loss': 485.68484, 'global_step': 1000}


KeyError: 'auc'