In [1]:
import argparse

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, mean_absolute_error
from sklearn import cross_validation, model_selection, preprocessing
from sklearn.ensemble import ExtraTreesRegressor

from utilities import visualize_classifier



In [2]:
# Load data: the input file holds one comma-separated record per line.
input_file = 'traffic_data.txt'
data = []
with open(input_file, 'r') as fp:
    # Iterate the file lazily instead of materialising every line first.
    for line in fp:
        # Strip only the newline; slicing off the last character (line[:-1])
        # would corrupt the final record when the file has no trailing newline.
        record = line.rstrip('\n')
        if not record:
            # A blank line would yield a one-field row and make the array ragged.
            continue
        items = record.split(',')
        data.append(items)

# 2-D array of strings: one row per record, one column per field.
data = np.array(data)

In [15]:
# Convert the string table into numbers, column by column.
# The first row decides how a column is treated: an all-digit value means the
# column is copied through as numbers; anything else gets its own LabelEncoder.
# Encoders are collected in column order so they can be reused on new data.
label_encoder = []
X_encoded = np.empty(data.shape)
for col, first_value in enumerate(data[0]):
    if not first_value.isdigit():
        encoder = preprocessing.LabelEncoder()
        label_encoder.append(encoder)
        X_encoded[:, col] = encoder.fit_transform(data[:, col])
    else:
        X_encoded[:, col] = data[:, col]

# Every column but the last is a feature; the last column is the target.
X = X_encoded[:, :-1].astype(int)
Y = X_encoded[:, -1].astype(int)

In [16]:
# Split into training (75%) and test (25%) sets; the fixed seed keeps the
# split reproducible. train_test_split is taken from sklearn.model_selection
# because sklearn.cross_validation was deprecated in scikit-learn 0.18 and
# removed in 0.20.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.25, random_state=5)

In [17]:
# Extremely randomized trees regressor: 100 trees, depth capped at 4,
# seeded so repeated runs build the same forest.
params = dict(n_estimators=100, max_depth=4, random_state=0)
regressor = ExtraTreesRegressor(**params)
# Left as the cell's last expression so the fitted estimator is displayed.
regressor.fit(X_train, Y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=4,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=1, oob_score=False, random_state=0,
          verbose=0, warm_start=False)

In [18]:
# Evaluate on the held-out test split using mean absolute error.
Y_pred = regressor.predict(X_test)
mae = mean_absolute_error(Y_test, Y_pred)
print("Mean absolute error:", round(mae, 2))

Mean absolute error: 7.42


In [19]:
# Encode a single unseen datapoint the same way the training columns were
# encoded: digit strings become ints, every other field is mapped through the
# next fitted encoder in `label_encoder` (encoders are stored in column order).
test_datapoint = ["Saturday", "10:20", "Atlanta", "no"]

encoded_values = []
count = 0  # index of the next encoder to use
for item in test_datapoint:
    if item.isdigit():
        encoded_values.append(int(item))
    else:
        # transform() returns a length-1 array; take element 0 explicitly,
        # since calling int() on a non-0-d array is deprecated in NumPy >= 1.25.
        encoded_values.append(int(label_encoder[count].transform([item])[0]))
        count += 1

test_datapoint_encoded = np.array(encoded_values)

In [20]:
# Predict traffic for the encoded datapoint; predict() expects a 2-D batch,
# so the single sample is wrapped in a list and the first result is taken.
predicted_traffic = regressor.predict([test_datapoint_encoded])[0]
print("Predicted traffic:", int(predicted_traffic))

Predicted traffic: 26
