In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.tsa.seasonal import STL
import random

In [None]:
# Render floats with three fixed decimals instead of scientific notation
# (keeps a float-typed 'imei' column readable when frames are displayed)
pd.set_option('display.float_format', '{:.3f}'.format)

# Read the raw CSV export
df = pd.read_csv(r"../data/first_1000_JSON_files.csv", low_memory=True)

# Work on a copy so the freshly loaded frame stays untouched
df_copy = df.copy()

In [None]:
# Store the device identifier as a 64-bit integer
df_copy = df_copy.astype({'imei': 'int64'})

In [None]:
# Restrict the data to the single device whose 'imei' equals 868500050064237
df_first_iot = df_copy.loc[df_copy['imei'].eq(868500050064237)]

In [None]:
# Keep only the columns the rest of the notebook works with: the device id, the
# timestamp, the two metrics that are inspected below and the event key.
# 'device.metric.dactualsp' must be part of the selection because every later
# cell (cleaning, plots, STL decomposition) reads it.
# `.copy()` makes the subset an independent frame so later column assignments
# do not act on a slice of `df_first_iot`.
df_first_iot_btemp = df_first_iot[
    ['imei', 'dtd', 'tracker.metric.bbatp', 'device.metric.dactualsp', 'event.key']
].copy()

In [None]:
# Display the column subset for the selected device
df_first_iot_btemp

In [None]:
# show the value counts of the column 'tracker.metric.bbatp'
df_first_iot_btemp['tracker.metric.bbatp'].value_counts()

In [None]:
# Show the number of rows where 'device.metric.dactualsp' is not null.
# It is printed explicitly because a notebook cell only renders its last expression.
print(df_first_iot_btemp['device.metric.dactualsp'].count())

# Keep only those rows; `.copy()` gives an independent frame so the column
# assignments below do not trigger SettingWithCopyWarning
df_first_iot_btemp = df_first_iot_btemp[
    df_first_iot_btemp['device.metric.dactualsp'].notnull()
].copy()

# Change the data type of the column 'device.metric.dactualsp' to int64
# NOTE(review): if the raw values are floats, this cast drops the fractional part — confirm that is intended
df_first_iot_btemp['device.metric.dactualsp'] = df_first_iot_btemp['device.metric.dactualsp'].astype('int64')

# Change the data type of the column 'dtd' to a datetime
df_first_iot_btemp['dtd'] = pd.to_datetime(df_first_iot_btemp['dtd'])

In [None]:
# Inspect the column dtypes of the working frame
df_first_iot_btemp.dtypes

In [None]:
# Display the working frame
df_first_iot_btemp

In [None]:
# Time elapsed since the earliest 'dtd' value in the frame.
# Despite its name, 'days' holds this elapsed time as a Timedelta (not a number
# of days); 'hours' is the same duration expressed as a float number of hours.
elapsed = df_first_iot_btemp['dtd'] - df_first_iot_btemp['dtd'].min()

# `assign` returns a new frame, so no column is written into a filtered slice
# (avoids SettingWithCopyWarning) and re-running the cell gives the same result.
df_first_iot_btemp = df_first_iot_btemp.assign(
    days=elapsed,
    hours=elapsed.dt.total_seconds() / 3600,
)

In [None]:
# When a 'dtd' timestamp appears more than once, keep only its first row
is_repeated_dtd = df_first_iot_btemp.duplicated(subset='dtd', keep='first')
df_first_iot_btemp = df_first_iot_btemp[~is_repeated_dtd]

In [None]:
# Only keep the rows where 'device.metric.dactualsp' is greater than 0
is_positive = df_first_iot_btemp['device.metric.dactualsp'].gt(0)
df_first_iot_btemp = df_first_iot_btemp.loc[is_positive]

In [None]:
# only show some rows where event.key is not nan
# df_first_iot_btemp = df_first_iot_btemp[df_first_iot_btemp['event.key'].notnull()]

# # show the value counts of the device.metric.dactualsp column
# df_first_iot_btemp['device.metric.dactualsp'].value_counts()

In [None]:
# Line plot of 'device.metric.dactualsp' against the time elapsed since the first reading
plt.rc('font', size=12)
fig, ax = plt.subplots(figsize=(15, 6))

# The line carries a label so that ax.legend() has an entry to show
# (calling legend() with no labelled artists yields an empty legend and a warning)
ax.plot(
    df_first_iot_btemp['days'],
    df_first_iot_btemp['device.metric.dactualsp'],
    linewidth=1,
    label='device.metric.dactualsp',
)

# NOTE(review): the x values are elapsed time ('days' column), not calendar dates, and the
# y label mentions battery state of charge while the plotted column is
# 'device.metric.dactualsp' — confirm both axis labels
ax.set_xlabel('Date')
ax.set_ylabel('Battery State of Charge Percentage')
ax.grid(True)
ax.legend(loc='upper left');

In [None]:
# Plot the first 73 rows of the series against elapsed hours
# NOTE(review): this cell was originally described as "the first 96 hours", but the
# slice selects rows, not a time window — confirm which one is intended
fig, ax = plt.subplots(figsize=(15, 6))

# The line carries a label so that ax.legend() has an entry to show
ax.plot(
    df_first_iot_btemp['hours'][:73],
    df_first_iot_btemp['device.metric.dactualsp'][:73],
    linewidth=1,
    label='device.metric.dactualsp',
)

# NOTE(review): the y label mentions battery state of charge while the plotted column is
# 'device.metric.dactualsp' — confirm the label
ax.set_xlabel('Hours')
ax.set_ylabel('Battery State of Charge Percentage')
ax.grid(True)
ax.legend(loc='upper left');

In [None]:
# LOWESS smoothing of 'device.metric.dactualsp' over the 'days' column (frac=0.1)
lowess = sm.nonparametric.lowess(
    df_first_iot_btemp['device.metric.dactualsp'],
    df_first_iot_btemp['days'],
    frac=0.1,
)

# Transpose the (x, fitted y) rows into one tuple of x values and one of fitted values
lowess_x, lowess_y = zip(*lowess)

In [None]:
# LOWESS smoothing restricted to the first 73 rows, with elapsed hours on the x axis
# NOTE(review): this cell was originally described as "the first 96 hours", but the
# slice selects rows, not a time window — confirm which one is intended
lowess = sm.nonparametric.lowess(
    df_first_iot_btemp['device.metric.dactualsp'][:73],
    df_first_iot_btemp['hours'][:73],
    frac=0.1,
)

# Transpose the (x, fitted y) rows once instead of zipping the result twice
lowess_x, lowess_y = zip(*lowess)

plt.rc('font', size=12)
fig, ax = plt.subplots(figsize=(15, 6))

# Both lines carry labels so that ax.legend() can tell them apart
# (calling legend() with no labelled artists yields an empty legend and a warning)
ax.plot(
    df_first_iot_btemp['hours'][:73],
    df_first_iot_btemp['device.metric.dactualsp'][:73],
    label='device.metric.dactualsp',
)
ax.plot(lowess_x, lowess_y, label='LOWESS (frac=0.1)')

# NOTE(review): the y label mentions battery state of charge while the plotted column is
# 'device.metric.dactualsp' — confirm the label
ax.set_xlabel('Hours')
ax.set_ylabel('Battery State of Charge Percentage')
ax.grid(True)
ax.legend(loc='upper left');

In [None]:
# STL decomposition of the 'device.metric.dactualsp' values
# NOTE(review): period=96 presumably means 96 observations per seasonal cycle — confirm
# against the sampling interval of the data
stl = STL(
    df_first_iot_btemp['device.metric.dactualsp'].to_numpy(),
    period=96,
)
res = stl.fit()

In [None]:
# Pull the three fitted components out of the STL result
trend = res.trend
seasonal = res.seasonal
resid = res.resid

# The series without its residual component
cleaned_data = trend + seasonal

In [None]:
# One stacked panel per STL component: seasonal, trend, residuals
plt.figure(figsize=(15, 12))

stl_panels = (
    ('Seasonal', seasonal),
    ('Trend', trend),
    ('Residuals', resid),
)
for panel_number, (panel_title, component) in enumerate(stl_panels, start=1):
    plt.subplot(3, 1, panel_number)
    plt.plot(component)
    plt.title(panel_title)

plt.show()

In [None]:
# Spread and centre of the STL residuals
resid_std = resid.std()
res_mean = resid.mean()

# Residuals further than two standard deviations from their mean fall outside these bounds
upper_bound = res_mean + 2 * resid_std
lower_bound = res_mean - 2 * resid_std

In [None]:
# Plot the residual-free series (seasonal + trend) with the band inside which an
# observation is NOT treated as an anomaly.
plt.rc('font', size=12)
fig, ax = plt.subplots(figsize=(15, 6))

# x positions are derived from the data length instead of a hard-coded row count,
# so the cell keeps working when the number of observations changes
positions = np.arange(len(cleaned_data))

ax.plot(positions, cleaned_data)

# A residual lies within [lower_bound, upper_bound] exactly when the observation lies
# within [cleaned_data + lower_bound, cleaned_data + upper_bound], so the shaded band
# uses both bounds and matches the anomaly rule
ax.fill_between(
    positions,
    cleaned_data + lower_bound,
    cleaned_data + upper_bound,
    color='g',
    alpha=0.15,
)

# NOTE(review): the x values are observation positions (0..n-1), not dates — confirm the label
ax.set_xlabel('Date')
ax.set_ylabel('speed')

# plt.ylim(bottom=0)

In [None]:
# Rows whose STL residual falls below lower_bound or above upper_bound
outside_bounds = np.logical_or(resid < lower_bound, resid > upper_bound)
anomalies = df_first_iot_btemp.loc[outside_bounds]

In [None]:
# anomalies[['date', 'meantemp']].rename(columns={'meantemp': 'anomaly_value'}).join(
#     meantemp_data[(resid < lower_bound) | (resid > upper_bound)][['date', 'meantemp']].set_index('date'),
#     on = 'date'
# )


# Adaptation of the template above to this dataset: list each anomalous timestamp with
# its value (renamed to 'anomaly_value') next to the original column, matched on 'dtd'
anomaly_mask = (resid < lower_bound) | (resid > upper_bound)

anomaly_values = anomalies[['dtd', 'device.metric.dactualsp']].rename(
    columns={'device.metric.dactualsp': 'anomaly_value'}
)
flagged_by_dtd = df_first_iot_btemp[anomaly_mask][['dtd', 'device.metric.dactualsp']].set_index('dtd')

anomaly_values.join(flagged_by_dtd, on='dtd')