<a href="https://colab.research.google.com/github/adiisharma0001/Aditya-s_Portfolio/blob/master/Time_Series_of_Price_Anomaly_Detection_with_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The Data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from tensorflow import keras

# Seed NumPy's and TensorFlow's global random generators with a fixed value
# so that repeated runs start from the same random state.
np.random.seed(1)
tf.random.set_seed(1)

# Keras model container and layer classes used to build the LSTM autoencoder.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, RepeatVector, TimeDistributed

# Print the TensorFlow version so the environment is recorded with the output.
print('Tensorflow version: ', tf.__version__)

In [None]:
# Load the raw data from 'JNJ.csv'; the relative path means the file must be
# in the notebook's working directory.
df = pd.read_csv('JNJ.csv')

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Keep only the two columns the rest of the notebook uses: the date and the
# closing price.
df = df[['Date', 'Close']]

In [None]:
# Inspect column dtypes before 'Date' is converted to datetime.
df.dtypes

In [None]:
# Parse 'Date' into pandas datetimes so the date comparisons used for the
# train/test split and the time-axis plots operate on real timestamps.
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# Re-check dtypes to confirm the 'Date' conversion took effect.
df.dtypes

In [None]:
# Earliest and latest dates present in the data.
df['Date'].min(), df['Date'].max()

# Visualize the timeseries

In [None]:
# Interactive line chart of the closing price over the full date range.
close_trace = go.Scatter(x=df['Date'], y=df['Close'], name='Close price')
fig = go.Figure(data=[close_trace])
fig.update_layout(
    showlegend=True,
    title='Johnson and Johnson Stock Price 1985-2020',
)
fig.show()

# Preprocessing
**Train test split**

In [None]:
# Chronological split: rows up to and including the cutoff date form the
# training set, rows strictly after it form the test set.
split_date = '2013-09-03'
train = df.loc[df['Date'] <= split_date]
test = df.loc[df['Date'] > split_date]

In [None]:
# Last rows of the training set, to check where the split boundary falls.
train.tail()

In [None]:
# First rows of the test set, which should start right after the training set ends.
test.head()

In [None]:
# (rows, columns) of each split.
train.shape,test.shape

**Standardize the data**

In [None]:
# Fit the scaler on the training closes only, so no statistics from the test
# period leak into preprocessing, then apply the same transform to both splits.
scaler = StandardScaler().fit(train[['Close']])

# `train` and `test` are row selections of `df`; writing a column into them in
# place triggers pandas' SettingWithCopyWarning. `assign` returns a new frame
# instead, which is then rebound to the same name.
# `transform` returns an (n, 1) array, so it is flattened to one value per row.
train = train.assign(Close=scaler.transform(train[['Close']]).ravel())
test = test.assign(Close=scaler.transform(test[['Close']]).ravel())

# Create sequences
Convert the input data into a 3-D array of sliding windows, each TIME_STEPS long. The array must have shape [samples, TIME_STEPS, features], as required by the LSTM network.

We want our network to have memory of 30 days, so we set TIME_STEPS=30.

In [None]:

TIME_STEPS = 30


def create_sequences(X, y, time_steps=TIME_STEPS):
    """Slice a series into overlapping fixed-length windows.

    For every start position ``i`` in ``range(len(X) - time_steps)`` this
    collects the window ``X.iloc[i:i + time_steps].values`` together with the
    value of ``y`` that immediately follows it, ``y.iloc[i + time_steps]``.

    Args:
        X: pandas object supporting positional ``.iloc`` slicing and
            ``.values`` (called below with a one-column DataFrame).
        y: pandas object supporting positional ``.iloc`` indexing.
        time_steps: number of consecutive rows in each window.

    Returns:
        Tuple ``(windows, targets)`` of NumPy arrays, one entry per window.
        When ``len(X) <= time_steps`` both arrays are empty.
    """
    window_count = len(X) - time_steps
    windows = [X.iloc[start:start + time_steps].values for start in range(window_count)]
    targets = [y.iloc[start + time_steps] for start in range(window_count)]
    return np.array(windows), np.array(targets)

# Window both splits. The double-bracket selection keeps a one-column
# DataFrame, so each window carries an explicit feature axis.
X_train, y_train = create_sequences(X=train[['Close']], y=train['Close'])
X_test, y_test = create_sequences(X=test[['Close']], y=test['Close'])

In [None]:
# Report the shapes of the windowed arrays produced by create_sequences.
print(f'Training shape: {X_train.shape}')
print(f'Testing shape: {X_test.shape}')

# Build the Model
We define a reconstruction LSTM autoencoder that takes input sequences of 30 time steps with one feature and outputs sequences of the same shape.
RepeatVector() repeats the encoder's output vector 30 times (once per time step) so that the decoder receives a sequence as input.
The decoder LSTM sets return_sequences=True so that it emits an output at every time step rather than only at the last one.
TimeDistributed(Dense(X_train.shape[2])) is added at the end to produce the output, where X_train.shape[2] is the number of features in the input data.

In [None]:
# Window length and feature count are read from the training array so the
# network always matches the data it is given.
n_steps, n_features = X_train.shape[1], X_train.shape[2]

model = Sequential([
    # Encoder: compress each window into a single 128-unit vector.
    LSTM(128, input_shape=(n_steps, n_features)),
    Dropout(rate=0.2),
    # Repeat that vector once per time step to feed the decoder a sequence.
    RepeatVector(n_steps),
    # Decoder: emit an output at every time step.
    LSTM(128, return_sequences=True),
    Dropout(rate=0.2),
    # Map each decoder step back to the original number of features.
    TimeDistributed(Dense(n_features)),
])
model.compile(optimizer='adam', loss='mae')
model.summary()

# Train the Model

In [None]:
# Train the autoencoder to reconstruct its own input, so the target is
# X_train itself. The model emits one value per time step, which cannot line
# up with the one-value-per-window `y_train`; fitting against `y_train` would
# not optimise the reconstruction error that the anomaly threshold relies on.
# shuffle=False keeps the windows in chronological order, so the last 10%
# held out by validation_split is the most recent part of the training data.
# Early stopping ends training once val_loss fails to improve for 3 epochs.
history = model.fit(
    X_train, X_train,
    epochs=100,
    batch_size=32,
    validation_split=0.1,
    callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, mode='min')],
    shuffle=False,
)

In [None]:
# Reconstruction MAE on the training windows. The target is the input itself
# because the model outputs a full sequence per window.
train_loss = model.evaluate(X_train, X_train, verbose=0)
# NOTE: this is 1 - MAE on standardized data, a rough closeness score rather
# than a classification accuracy; it can be negative when the MAE exceeds 1.
train_acc = 1 - train_loss
print('Training accuracy:', train_acc)

In [None]:
# Training vs. validation loss per epoch, read from the History object
# returned by model.fit. The model is compiled with loss='mae'.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(history.history['loss'], label='Training loss')
loss_ax.plot(history.history['val_loss'], label='Validation loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('MAE loss')
loss_ax.set_title('Training and validation loss')
loss_ax.legend();

In [None]:
# Reconstruction MAE on the test windows. The target is the input itself
# because the model outputs a full sequence per window.
test_loss = model.evaluate(X_test, X_test, verbose=0)
# NOTE: this is 1 - MAE on standardized data, a rough closeness score rather
# than a classification accuracy; it can be negative when the MAE exceeds 1.
test_acc = 1 - test_loss
# This cell reports the test split, so it is labelled as such.
print('Test accuracy:', test_acc)

# Determine Anomalies
* Find MAE loss on the training data.
* Use the maximum MAE loss on the training data as the reconstruction error threshold.
* If the reconstruction loss for a data point in the test set is greater than this threshold, we label that data point as an anomaly.

In [None]:
# Reconstruct the training windows and measure, per window, the mean absolute
# error averaged over the time-step axis (axis=1).
X_train_pred = model.predict(X_train, verbose=0)
train_mae_loss = np.mean(np.abs(X_train_pred - X_train), axis=1)

# The largest reconstruction error seen on training data becomes the anomaly
# threshold: anything reconstructed worse than that is flagged later on.
threshold = np.max(train_mae_loss)
print(f'Reconstruction error threshold: {threshold}')

# Distribution of training reconstruction errors.
sns.histplot(train_mae_loss, kde=True)
plt.xlabel('Train MAE loss')
plt.ylabel('Number of samples');

In [None]:
# Reconstruct the test windows and compute the per-window mean absolute
# error, averaged over the time-step axis.
X_test_pred = model.predict(X_test, verbose=0)
test_abs_error = np.abs(X_test_pred - X_test)
test_mae_loss = np.mean(test_abs_error, axis=1)

# Distribution of test reconstruction errors.
test_hist_ax = sns.histplot(test_mae_loss, kde=True)
test_hist_ax.set_xlabel('Test MAE loss')
test_hist_ax.set_ylabel('Number of samples');

In [None]:
# The first TIME_STEPS test rows have no complete window ending at them, so
# scores start at row TIME_STEPS. An explicit positional slice plus copy
# yields an independent frame that is safe to add columns to.
test_score_df = test.iloc[TIME_STEPS:].copy()
# test_mae_loss has one row per window with a trailing feature axis; flatten
# it to a plain 1-D column.
test_score_df['loss'] = np.ravel(test_mae_loss)
test_score_df['threshold'] = threshold
test_score_df['anomaly'] = test_score_df['loss'] > test_score_df['threshold']
# 'Close' is already carried over from `test`, so it needs no re-assignment.

In [None]:
# Test reconstruction loss against the fixed threshold over time. Plotting
# against 'Date' rather than the integer row index puts calendar dates on the
# x-axis, which is what the rotated tick labels are for.
plt.plot(test_score_df['Date'], test_score_df['loss'], label='loss')
plt.plot(test_score_df['Date'], test_score_df['threshold'], label='threshold')
plt.xticks(rotation=25)
plt.xlabel('Date')
plt.ylabel('MAE loss')
plt.legend();

In [None]:
# Keep only the rows whose reconstruction loss exceeded the threshold.
is_anomaly = test_score_df['anomaly']
anomalies = test_score_df.loc[is_anomaly]
anomalies.head()

In [None]:
# (number of flagged rows, number of columns).
anomalies.shape

# Visualize Anomalies

In [None]:
# 'Close' in test_score_df holds standardized values. Undo the scaling so the
# axis labelled 'Price' shows real prices. The conversion is done once on the
# full frame and indexed by row label, which also works when no anomalies
# were found.
close_price = pd.Series(
    scaler.inverse_transform(test_score_df[['Close']]).ravel(),
    index=test_score_df.index,
)

fig, ax = plt.subplots(figsize=(15, 10))
# Close price as a line, anomalies as red markers on top of it.
ax.plot(test_score_df['Date'], close_price, label='Close price')
ax.plot(anomalies['Date'], close_price.loc[anomalies.index], 'ro', label='Anomaly')

ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.set_title('Detected anomalies')
ax.legend()

plt.show()


In [None]:
# 'Close' in test_score_df holds standardized values. Undo the scaling so the
# chart shows real prices. The conversion is done once on the full frame and
# indexed by row label, which also works when no anomalies were found.
close_price = pd.Series(
    scaler.inverse_transform(test_score_df[['Close']]).ravel(),
    index=test_score_df.index,
)

fig = go.Figure()
fig.add_trace(go.Scatter(x=test_score_df['Date'], y=close_price, name="Close price"))
# Anomalies are isolated points, so draw them as markers; the default line
# mode would join unrelated anomalies with straight segments.
fig.add_trace(go.Scatter(x=anomalies['Date'], y=close_price.loc[anomalies.index],
                         mode='markers', name="Anomaly"))
fig.update_layout(showlegend=True, title='Detected anomalies')
fig.show()