# Approach to predicting traffic volume

In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt

In [144]:
train = pd.read_csv("train.csv")

In [145]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48120 entries, 0 to 48119
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   DateTime  48120 non-null  object
 1   Junction  48120 non-null  int64 
 2   Vehicles  48120 non-null  int64 
 3   ID        48120 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.5+ MB


In [146]:
train['DateTime'] = pd.to_datetime(train['DateTime'])

In [147]:
#train['ID'] = train.ID.astype('str')
# From here on Junction is an object (string) column in `train`, so describe()
# no longer summarizes it. NOTE(review): test.csv is loaded with Junction left
# as int64, so the train and test feature frames differ in dtype.
train['Junction'] = train.Junction.astype('str')

In [148]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48120 entries, 0 to 48119
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   DateTime  48120 non-null  datetime64[ns]
 1   Junction  48120 non-null  object        
 2   Vehicles  48120 non-null  int64         
 3   ID        48120 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 1.5+ MB


In [149]:
train.describe()

Unnamed: 0,Vehicles,ID
count,48120.0,48120.0
mean,22.791334,20163300000.0
std,20.750063,5944854.0
min,1.0,20151100000.0
25%,9.0,20160420000.0
50%,15.0,20160930000.0
75%,29.0,20170230000.0
max,180.0,20170630000.0


In [150]:
train.head()

Unnamed: 0,DateTime,Junction,Vehicles,ID
0,2015-11-01 00:00:00,1,15,20151101001
1,2015-11-01 01:00:00,1,13,20151101011
2,2015-11-01 02:00:00,1,10,20151101021
3,2015-11-01 03:00:00,1,7,20151101031
4,2015-11-01 04:00:00,1,9,20151101041


In [151]:
#train = train.set_index('DateTime')

In [152]:
# Feature matrix and target.
# Junction was cast to str earlier; coerce the feature columns to int64 so the
# estimators receive numeric input (scikit-learn no longer converts string
# arrays to numbers implicitly) and so X has the same dtypes as X_test.
X = train[['Junction', 'ID']].astype('int64')
y = train.Vehicles

In [153]:
X,y

(      Junction           ID
 0            1  20151101001
 1            1  20151101011
 2            1  20151101021
 3            1  20151101031
 4            1  20151101041
 ...        ...          ...
 48115        4  20170630194
 48116        4  20170630204
 48117        4  20170630214
 48118        4  20170630224
 48119        4  20170630234
 
 [48120 rows x 2 columns],
 0        15
 1        13
 2        10
 3         7
 4         9
          ..
 48115    11
 48116    30
 48117    16
 48118    22
 48119    12
 Name: Vehicles, Length: 48120, dtype: int64)

In [154]:
test = pd.read_csv('test.csv')

In [155]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11808 entries, 0 to 11807
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   DateTime  11808 non-null  object
 1   Junction  11808 non-null  int64 
 2   ID        11808 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 276.9+ KB


In [156]:

test.head()

Unnamed: 0,DateTime,Junction,ID
0,7/1/17 0:00,1,20170701001
1,7/1/17 1:00,1,20170701011
2,7/1/17 2:00,1,20170701021
3,7/1/17 3:00,1,20170701031
4,7/1/17 4:00,1,20170701041


In [157]:
X_test = test[["Junction", "ID"]]

## Modeling

### Decision Tree

In [158]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [159]:
# Untuned decision tree, scored per fold with 10-fold cross-validation.
# No `scoring` is passed, so the estimator's own default scorer is used.
reg = DecisionTreeRegressor(random_state=101)
cross_val_score(estimator=reg, X=X, y=y, cv=10)

array([-5.58028625, -0.25456281, -0.24685501,  0.47663231, -1.31800917,
        0.09311527,  0.25390948, -0.04225175, -0.15623799, -2.47540336])

In [160]:
cross_validate(reg, X, y, cv = 10)

{'fit_time': array([0.03639293, 0.0358882 , 0.02615023, 0.02374911, 0.02341986,
        0.0246501 , 0.02451801, 0.02837729, 0.02736306, 0.02395892]),
 'score_time': array([0.00299597, 0.00211191, 0.00192404, 0.00184298, 0.00176191,
        0.00175095, 0.00174379, 0.00178385, 0.00187016, 0.00184917]),
 'test_score': array([-5.58028625, -0.25456281, -0.24685501,  0.47663231, -1.31800917,
         0.09311527,  0.25390948, -0.04225175, -0.15623799, -2.47540336])}

In [243]:
# Refit the tree on the full training set, then predict the held-out test rows.
dt_prediction = reg.fit(X, y).predict(X_test)

In [244]:
pred_dt_df = pd.DataFrame({"ID":X_test["ID"], "Vehicles": dt_prediction})

In [245]:
pred_dt_df.to_csv("prediction_2.csv",index=False)

### Linear Regression (naive)

In [194]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split

In [162]:
# Ordinary least squares baseline: 10-fold cross-validation returning fit time,
# score time and the per-fold test score.
lin_reg = LinearRegression()
cross_validate(estimator=lin_reg, X=X, y=y, cv=10)

{'fit_time': array([0.01675129, 0.01712608, 0.0183599 , 0.01554036, 0.0147512 ,
        0.0100472 , 0.01185799, 0.01072121, 0.00996876, 0.00996614]),
 'score_time': array([0.0025506 , 0.00246286, 0.00272918, 0.00254679, 0.00167298,
        0.00163507, 0.00169396, 0.00165915, 0.00157619, 0.00164819]),
 'test_score': array([-0.52392203, -0.32145083, -1.2259157 ,  0.06428272, -6.16981927,
        -4.33694396, -1.29240807, -0.62120739, -0.12800758, -0.09686078])}

In [163]:
lin_reg.fit(X,y).score(X,y)

0.5162125504761439

In [164]:
pred = lin_reg.predict(X_test)

In [165]:
pred_df = pd.DataFrame({"Vehicles":pred})

In [166]:
pred_df["ID"] = test.ID

In [195]:
trainX, testX, trainY, testY = train_test_split(X,y, test_size = 0.3, random_state = 38)

In [197]:
lin_reg.fit(trainX, trainY).score(trainX,trainY)

0.5181487898841317

In [198]:
lin_reg.fit(trainX, trainY).predict(testX)

array([19.9990691 ,  7.71350541, 37.21159822, ..., 22.12288661,
       34.54320037,  6.39327788])

In [201]:
# Hold-out MSE for linear regression; sklearn metrics take (y_true, y_pred).
mean_squared_error(testY, lin_reg.fit(trainX, trainY).predict(testX))

212.30762572920628

In [200]:
# Hold-out MAE for linear regression; sklearn metrics take (y_true, y_pred).
mean_absolute_error(testY, lin_reg.fit(trainX, trainY).predict(testX))

10.52135978265506

### Lasso


In [185]:
# Lasso with default settings; score it on the same data it was fit on.
lin_lasso = Lasso()
lin_lasso.fit(X, y)
lin_lasso.score(X, y)

0.5135997972051176

In [202]:
# Hold-out MSE for Lasso; sklearn metrics take (y_true, y_pred).
mean_squared_error(testY, lin_lasso.fit(trainX, trainY).predict(testX))

213.71908462187866

In [203]:
# Hold-out MAE for Lasso; sklearn metrics take (y_true, y_pred).
mean_absolute_error(testY, lin_lasso.fit(trainX, trainY).predict(testX))

10.395333545900082

### Ridge

In [183]:
# Ridge with default settings; score it on the same data it was fit on.
lin_ridge = Ridge()
lin_ridge.fit(X, y)
lin_ridge.score(X, y)

0.516212550222449

In [204]:
# Hold-out MSE for Ridge; sklearn metrics take (y_true, y_pred).
mean_squared_error(testY, lin_ridge.fit(trainX, trainY).predict(testX))

212.3077594912651

In [205]:
# Hold-out MAE for Ridge; sklearn metrics take (y_true, y_pred).
mean_absolute_error(testY, lin_ridge.fit(trainX, trainY).predict(testX))

10.521273928777983

### ElasticNet

In [229]:
# Elastic net with alpha = 0.01; score it on the same data it was fit on.
lin_elastic = ElasticNet(alpha=0.01)
lin_elastic.fit(X, y)
lin_elastic.score(X, y)

0.5161960221853487

In [230]:
# Hold-out MSE for ElasticNet; sklearn metrics take (y_true, y_pred).
mean_squared_error(testY, lin_elastic.fit(trainX, trainY).predict(testX))

212.33847717676767

In [231]:
# Hold-out MAE for ElasticNet; sklearn metrics take (y_true, y_pred).
mean_absolute_error(testY, lin_elastic.fit(trainX, trainY).predict(testX))

10.506352839792045

In [233]:
prediction = lin_elastic.fit(X,y).predict(X_test)

In [234]:
pred_df = pd.DataFrame({"ID":X_test["ID"], "Vehicles": prediction})

In [235]:
pred_df

Unnamed: 0,ID,Vehicles
0,20170701001,50.255056
1,20170701011,50.255069
2,20170701021,50.255083
3,20170701031,50.255096
4,20170701041,50.255109
...,...,...
11803,20171031194,5.978844
11804,20171031204,5.978857
11805,20171031214,5.978871
11806,20171031224,5.978884


In [236]:
pred_df.to_csv("submission1.csv", index=False)

## Neural Networks

In [237]:
### Forecasting models

In [240]:

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [239]:
#conda install keras

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: done

## Package Plan ##

  environment location: /usr/local/anaconda3

  added / updated specs:
    - keras


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _tflow_select-2.3.0        |              mkl           3 KB
    absl-py-0.9.0              |           py37_0         164 KB
    astor-0.8.0                |           py37_0          46 KB
    c-ares-1.15.0              |    h1de35cc_1001          73 KB
    conda-4.8.3                |           py37_0         2.8 MB
    gast-0.3.

In [250]:
# define the model
def larger_model():
    """Build and compile the feed-forward regression network.

    Two inputs feed ReLU layers of 4, 20, 10 and 20 units, followed by a
    single linear output unit. The network is compiled with mean squared
    error loss and the Adam optimizer.

    Returns:
        The compiled Sequential model.
    """
    network = Sequential([
        Dense(4, input_dim=2, kernel_initializer='normal', activation='relu'),
        Dense(20, kernel_initializer='normal', activation='relu'),
        Dense(10, kernel_initializer='normal', activation='relu'),
        Dense(20, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    ])
    network.compile(loss='mean_squared_error', optimizer='adam')
    return network


# Evaluate the network with 10-fold cross-validation. The features are fed in
# unscaled: the pipeline holds only the Keras regressor.
estimators = [
    ('mlp', KerasRegressor(build_fn=larger_model, epochs=10, batch_size=20, verbose=2)),
]
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10)
results = cross_val_score(pipeline, X, y, cv=kfold)
# KerasRegressor.score returns the negated loss (so that higher is better, as
# scikit-learn expects); flip the sign so the printed figure is a real MSE.
mse_per_fold = -results
print("Larger: %.2f (%.2f) MSE" % (mse_per_fold.mean(), mse_per_fold.std()))

Epoch 1/10
 - 5s - loss: 230.2649
Epoch 2/10
 - 4s - loss: 140.9574
Epoch 3/10
 - 4s - loss: 137.9913
Epoch 4/10
 - 4s - loss: 137.4890
Epoch 5/10
 - 5s - loss: 137.2069
Epoch 6/10
 - 4s - loss: 136.8326
Epoch 7/10
 - 4s - loss: 136.7601
Epoch 8/10
 - 4s - loss: 136.8356
Epoch 9/10
 - 4s - loss: 136.5841
Epoch 10/10
 - 4s - loss: 136.6900
Epoch 1/10
 - 5s - loss: 214.5929
Epoch 2/10
 - 4s - loss: 131.0442
Epoch 3/10
 - 4s - loss: 121.3044
Epoch 4/10
 - 4s - loss: 120.7792
Epoch 5/10
 - 4s - loss: 120.5200
Epoch 6/10
 - 4s - loss: 120.5242
Epoch 7/10
 - 4s - loss: 120.4289
Epoch 8/10
 - 5s - loss: 120.5228
Epoch 9/10
 - 4s - loss: 120.3839
Epoch 10/10
 - 4s - loss: 120.4406
Epoch 1/10
 - 5s - loss: 152.8459
Epoch 2/10
 - 5s - loss: 100.4097
Epoch 3/10
 - 5s - loss: 97.6344
Epoch 4/10
 - 4s - loss: 96.1987
Epoch 5/10
 - 4s - loss: 94.2705
Epoch 6/10
 - 4s - loss: 91.8215
Epoch 7/10
 - 5s - loss: 88.8282
Epoch 8/10
 - 4s - loss: 86.7618
Epoch 9/10
 - 4s - loss: 86.1447
Epoch 10/10
 - 4s -

In [258]:
nn_prediction = pipeline.fit(X,y).predict(X_test)

Epoch 1/10
 - 6s - loss: 206.1492
Epoch 2/10
 - 5s - loss: 143.0825
Epoch 3/10
 - 5s - loss: 141.6469
Epoch 4/10
 - 5s - loss: 141.4112
Epoch 5/10
 - 5s - loss: 141.5023
Epoch 6/10
 - 5s - loss: 141.2622
Epoch 7/10
 - 5s - loss: 141.4958
Epoch 8/10
 - 6s - loss: 141.3571
Epoch 9/10
 - 5s - loss: 141.3594
Epoch 10/10
 - 5s - loss: 141.3815


In [289]:
def nn_model():
    """Build and compile the feed-forward regression network (RMSprop variant).

    Two inputs feed ReLU layers of 4, 20, 10 and 20 units, followed by a
    single linear output unit. The network is compiled with mean squared
    error loss and the RMSprop optimizer.

    Returns:
        The compiled Sequential model.
    """
    network = Sequential()
    # Input layer: two features in, four ReLU units out.
    network.add(Dense(4, input_dim=2, kernel_initializer='normal', activation='relu'))
    # Hidden stack.
    for width in (20, 10, 20):
        network.add(Dense(width, kernel_initializer='normal', activation='relu'))
    # Single linear output for the regression target.
    network.add(Dense(1, kernel_initializer='normal'))
    network.compile(loss='mean_squared_error', optimizer='rmsprop')
    return network

# Pipeline that standardizes the features and then trains the Keras regressor;
# 20% of the training rows are held back by Keras as a validation split.
estimators2 = [
    ('standardize', StandardScaler()),
    ('mlp', KerasRegressor(build_fn=nn_model, epochs=30, batch_size=1, verbose=2, validation_split=0.20)),
]
pipeline2 = Pipeline(estimators2)


In [290]:
nn_prediction = pipeline2.fit(X,y, mlp__shuffle = True).predict(X_test)

Train on 38496 samples, validate on 9624 samples
Epoch 1/30
 - 89s - loss: 179.2683 - val_loss: 103.0725
Epoch 2/30
 - 87s - loss: 158.8929 - val_loss: 102.6515
Epoch 3/30
 - 86s - loss: 158.9395 - val_loss: 103.3399
Epoch 4/30
 - 96s - loss: 159.4183 - val_loss: 102.2754
Epoch 5/30
 - 95s - loss: 160.4980 - val_loss: 101.9758
Epoch 6/30
 - 86s - loss: 161.2708 - val_loss: 103.4045
Epoch 7/30
 - 86s - loss: 162.9724 - val_loss: 99.6534
Epoch 8/30
 - 86s - loss: 163.7270 - val_loss: 111.1190
Epoch 9/30
 - 100s - loss: 164.0456 - val_loss: 105.8191
Epoch 10/30
 - 101s - loss: 164.2689 - val_loss: 99.9730
Epoch 11/30
 - 101s - loss: 163.9355 - val_loss: 100.3633
Epoch 12/30
 - 101s - loss: 164.1809 - val_loss: 103.2956
Epoch 13/30
 - 101s - loss: 165.0479 - val_loss: 104.2694
Epoch 14/30
 - 100s - loss: 163.7242 - val_loss: 99.6447
Epoch 15/30
 - 89s - loss: 163.6224 - val_loss: 103.2162
Epoch 16/30
 - 86s - loss: 163.6986 - val_loss: 99.7123
Epoch 17/30
 - 87s - loss: 164.1646 - val_loss

In [291]:
pred_df_nn = pd.DataFrame({"ID":X_test["ID"], "Vehicles": nn_prediction})

In [292]:
pred_df_nn.to_csv("prediction_3.csv", index=False)

<b>In the end, the best RMSE obtained for this problem, using the neural network, was 11.24.</b>