## Exponential Smoothing Model

Using seasonality because the time of year/month/day may have an impact on the number of trips. 

In [107]:
import pandas as pd
from statsmodels.tsa.api import ExponentialSmoothing
## Imported for testing
import numpy as np

# Get data.
data = pd.read_csv("https://github.com/dustywhite7/econ8310-assignment1/raw/main/assignment_data_train.csv")
data['Timestamp'] = pd.to_datetime(data['Timestamp'])
#data['trips'] = data['trips'].astype(int)

trips = data['trips']
trips.index = data['Timestamp']
trips.index.freq = trips.index.inferred_freq

# Linear trend
##trend = ExponentialSmoothing(trips, trend='add', seasonal='add').fit()
# Linear trend with damping
model = ExponentialSmoothing(trips, trend='add', seasonal='add', use_boxcox=True)
modelFit = model.fit(use_brute=True)

#forecast_t = trend.forecast(744)
pred = modelFit.forecast(744)

print(pred)

# calculate RMSE (root mean squared error)
dataTest = pd.read_csv("tests/testData.csv")['trips']
rmse = sum([(np.squeeze(pred)[i]-dataTest[i])**2 for i in range(len(np.squeeze(pred)))])
rmse = np.sqrt(rmse)*1/744

print(rmse)

2019-01-01 00:00:00     5496.086057
2019-01-01 01:00:00     1883.100376
2019-01-01 02:00:00             NaN
2019-01-01 03:00:00             NaN
2019-01-01 04:00:00             NaN
                           ...     
2019-01-31 19:00:00    14000.697788
2019-01-31 20:00:00    12917.473248
2019-01-31 21:00:00    12950.523437
2019-01-31 22:00:00    12093.011998
2019-01-31 23:00:00    10128.158398
Freq: h, Length: 744, dtype: float64
nan



Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



## Prophet Model

Using Prophet because we only have one variable we need to forecast.

In [None]:
import pandas as pd
from prophet import Prophet
## Imported for testing
import numpy as np

# Get data.
data = pd.read_csv("https://github.com/dustywhite7/econ8310-assignment1/raw/main/assignment_data_train.csv")
data['Timestamp'] = pd.to_datetime(data['Timestamp'])

# Get only timestamp and time series data from the dataframe.
data_p = data[['Timestamp', 'trips']]
data_p.columns = ['ds', 'y'] # Renaming the columns per Prophet's requirements.

model = Prophet()
modelFit = model.fit(data_p)

# Get future data to make predictions.
data_test = pd.read_csv("https://github.com/dustywhite7/econ8310-assignment1/raw/main/assignment_data_test.csv")
data_f = data_test[['Timestamp']]
data_f.columns = ['ds']
# Create an empty dataframe with dates for future periods
pred = modelFit.predict(data_f)
pred = pred[['trend']]
pred['trend'] = pred['trend'].astype(int)

print(pred)

#greebo

# calculate RMSE (root mean squared error)
dataTest = pd.read_csv("tests/testData.csv")['trips']
rmse = sum([(np.squeeze(pred)[i]-dataTest[i])**2 for i in range(len(np.squeeze(pred)))])
rmse = np.sqrt(rmse)*1/744

print(rmse)


15:41:44 - cmdstanpy - INFO - Chain [1] start processing
15:41:45 - cmdstanpy - INFO - Chain [1] done processing


     trend
0    10425
1    10424
2    10423
3    10422
4    10421
..     ...
739   9732
740   9731
741   9730
742   9729
743   9728

[744 rows x 1 columns]
190.74211136516297


## GAM Model

Trying a Generalized Additive Model using pyGAM.

In [123]:
from pygam import LinearGAM, s, f
import pandas as pd

# Get data.
data = pd.read_csv("https://github.com/dustywhite7/econ8310-assignment1/raw/main/assignment_data_train.csv")
data['Timestamp'] = pd.to_datetime(data['Timestamp'])

# Grab timestamp columns and data.
x = data[['year', 'month', 'day', 'hour']]
y = data['trips']

# Create the model.
model = LinearGAM(f(0) + f(1) + s(2) + s(3))
# Fit the model.
modelFit = model.fit(x, y)

# Make predictions using the fitted model.
pred = modelFit.predict(x)
pred = pd.DataFrame({'trend': pred}) # Convert to DF.
pred = pred.tail(744)
pred = pred.reset_index(drop=True)

print(pred)

# calculate RMSE (root mean squared error)
dataTest = pd.read_csv("tests/testData.csv")['trips']
rmse = sum([(np.squeeze(pred)[i]-dataTest[i])**2 for i in range(len(np.squeeze(pred)))])
rmse = np.sqrt(rmse)*1/744

print(rmse)

            trend
0     8025.177700
1     5368.187486
2     3590.291426
3     2399.485453
4     1616.298643
..            ...
739  16444.960644
740  15039.224364
741  14757.761343
742  13988.169849
743  11275.778804

[744 rows x 1 columns]
103.40046624045357
