In [1]:
%matplotlib notebook

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import datetime

## Step One: Define the time series data

In [2]:
df = pd.read_csv('./data/data.csv', parse_dates=['Date'], index_col='Date')
print(df.shape)
df.head(10)

(398, 3)


Unnamed: 0_level_0,Number of Joints,Mass (grams),Time of First Use
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-03-28,6.0,2.1,02:00 PM
2021-03-29,3.0,1.05,02:00 PM
2021-03-30,6.0,2.1,10:00 AM
2021-03-31,6.0,2.1,10:00 AM
2021-04-01,5.0,1.75,10:00 AM
2021-04-02,5.0,1.75,02:00 PM
2021-04-03,5.0,1.75,02:00 PM
2021-04-04,4.0,1.4,10:00 AM
2021-04-05,6.0,2.1,10:00 AM
2021-04-06,4.0,1.4,02:00 PM


In [3]:
df['Date'] = pd.to_datetime(df['Date']) 
df.sort_values(by=['Date'], inplace=True, ascending=True)
print(df)

KeyError: 'Date'

In [172]:
mass = df['Mass (grams)']
print(mass)

Date
2021-03-28    2.10
2021-03-29    1.05
2021-03-30    2.10
2021-03-31    2.10
2021-04-01    1.75
              ... 
2022-05-25    0.35
2022-05-26    0.70
2022-05-27    0.35
2022-05-28    0.70
2022-05-29    0.70
Name: Mass (grams), Length: 398, dtype: float64


In [173]:
time = df.index
print(time)

DatetimeIndex(['2021-03-28', '2021-03-29', '2021-03-30', '2021-03-31',
               '2021-04-01', '2021-04-02', '2021-04-03', '2021-04-04',
               '2021-04-05', '2021-04-06',
               ...
               '2022-05-20', '2022-05-21', '2022-05-22', '2022-05-23',
               '2022-05-24', '2022-05-25', '2022-05-26', '2022-05-27',
               '2022-05-28', '2022-05-29'],
              dtype='datetime64[ns]', name='Date', length=398, freq=None)


In [174]:
print(time.shape, mass.shape)

(398,) (398,)


## Step Two: Plot the time series data

In [175]:
# Create a figure and axis
fig, ax = plt.subplots()

# Plot the time series data as a scatter plot
ax.scatter(time, mass, marker='o', c='#067211')

# Add a title and grid to the plot
ax.set_title('Mass of Substance Use Over Time')

# Set the range of values shown on the x-axis and y-axis
print(df.index[0])
print(df.index[-1])

start = mdates.date2num(df.index[0])
end = mdates.date2num(df.index[-1])

ax.set_xlim(start, end)
ax.set_ylim(0, df['Mass (grams)'].max()+0.5)

# Label the axes
ax.set_xlabel('Date')
ax.set_ylabel('Mass (grams)')

plt.ion()


# Show the plot
plt.show()

<IPython.core.display.Javascript object>

2021-03-28 00:00:00
2022-05-29 00:00:00


## Step Three: Build a Model

In [180]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

In [187]:
# Fit an ARIMA model to the data
model = ARIMA(time.values, order=(1, 0, 1))
model_fit = model.fit()

# Make predictions using the model
predictions = model_fit.predict(start=len(df), end=len(df)+10, dynamic=False)

# Calculate the RMSE
rmse = mean_squared_error(predictions, df[-10:], squared=False)
print(f'RMSE: {rmse}')

UFuncTypeError: ufunc 'subtract' cannot use operands with types dtype('<M8[ns]') and dtype('O')