In [None]:
import pandas as pd
import numpy as np
import math
import os
import sys
import datetime

from scipy.spatial.distance import pdist, squareform
from scipy.fftpack import fft
from sklearn.manifold import MDS
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.dates as dates
plt.style.use('seaborn-whitegrid')
%matplotlib inline
plt.rcParams['figure.figsize'] = (6.0, 4.0) # set default size of plots
import matplotlib.pyplot as plt

In [None]:
## Choose the data folder: a local relative path by default, or the mounted
## Google Drive project folder when the notebook is running inside Colab
if 'google.colab' not in sys.modules:
    DATA_DIR = 'Data/'
else:
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    DIR = '/content/drive/MyDrive/Colab Notebooks/MAHE/MSIS Coursework/Miniprojects/Adil'
    DATA_DIR = f'{DIR}/Data/'
    # Work from the project folder so relative paths resolve inside it
    os.chdir(DIR)

In [None]:
## Read data
FILE = DATA_DIR + 'data.csv'
df = pd.read_csv(FILE, sep = ",", header = 0)
# Timestamps are parsed as month-day-year hour.minute and become the index
df['time'] = pd.to_datetime(df['time'], format='%m-%d-%Y %H.%M')
df = df.set_index('time')
# Coerce every remaining column to numeric; entries that cannot be parsed become NaN.
# The converted frame is reassigned rather than written back through df.loc[:, mask],
# because pandas >= 2.0 performs such .loc assignments in place and would keep the
# original object dtype of any column that contained non-numeric text.
df = df.apply(pd.to_numeric, errors = 'coerce')
df.head()

In [None]:
## Bar chart of the percentage of missing values (NaNs) per feature;
## bars are coloured by whether they are within the cutoff, which is drawn as a dashed green line
cutoff = 30
fig = plt.figure(figsize=(6, 6))
n_rows, n_features = df.shape
percent_missing = (df.isna().sum() / n_rows) * 100
# Two-colour palette indexed by 0 (above cutoff) / 1 (at or below cutoff)
two_colours = cm.rainbow(np.linspace(0, 1, 2))
within_cutoff = (percent_missing <= cutoff).values.astype(int)
percent_missing.plot.bar(color = two_colours[within_cutoff])
plt.plot(np.arange(n_features), np.full(n_features, cutoff), 'g--')
fig.suptitle('Percentage Missing Values Across All Features', fontsize = 20)
plt.xlabel('Feature', fontsize = 16)
plt.ylabel('% Missing Values', fontsize = 16)

In [None]:
## Linear interpolation of every column other than 'time',
## followed by the percentage of NaNs still present in each feature
not_time = df.columns != 'time'
df.loc[:, not_time] = df.loc[:, not_time].interpolate(method = 'linear')
df.isna().sum() / df.shape[0] * 100

In [None]:
## Data preparation for anomaly detection
feature = df.columns[0] # "Cyclone_Inlet_Gas_Temp"
sampling_period = 5*60 # in seconds of the dataset as provided
time_period = 12*60*60 # time duration in seconds corresponding to each sample, in this case 12 hours
# Number of rows that make up one complete window (144 for 12 hours at 5-minute sampling).
# Plain integer division replaces the earlier round-trip through pd.Timedelta(str(...)+'S'),
# whose upper-case 'S' unit alias is deprecated in recent pandas releases.
samples_per_window = time_period // sampling_period
scaler = {'identity': FunctionTransformer(lambda x: x), 'standard': StandardScaler()}
df_transformed = pd.DataFrame(scaler['identity'].fit_transform(df))
df_transformed.index = df.index.copy()
df_transformed.columns = df.columns.copy()
# Group rows into consecutive windows of `time_period` seconds; a Timedelta is used as the
# frequency so that no version-dependent frequency alias string is needed.
window = pd.Grouper(freq = pd.Timedelta(seconds = time_period))
# Each complete window becomes one array of the feature's values; incomplete windows
# are marked NaN and dropped on the next line.
df_anomaly = df_transformed.groupby(window).apply(lambda x: x[feature].values if len(x[feature].values) == samples_per_window else np.nan)
df_anomaly = df_anomaly.dropna()
df_anomaly.head()

In [None]:
## Data preparation for autoregression
feature = df.columns[0] # "Cyclone_Inlet_Gas_Temp"
# Sampling period of the dataset (in this case, 5 mins)
sampling_period = int(pd.Timedelta('5min').total_seconds()) # in seconds
# Lag for autoregression (in this case, we regress on the past 12 hours):
lag = int(pd.Timedelta('12h').total_seconds()) # in seconds
scaler = {'identity': FunctionTransformer(lambda x: x), 'standard': StandardScaler()}
df_transformed = pd.DataFrame(scaler['identity'].fit_transform(df))
df_transformed.columns = df.columns.copy()
df_transformed.index = df.index.copy()
# Number of rows for rolling corresponding to the specified lag, plus one.
# Both quantities are whole seconds, so integer division gives the row count directly;
# this avoids building a pd.Timedelta from a string with the deprecated 'S' unit alias.
nroll = lag // sampling_period + 1
# Numpy array for autoregression: each row is a window of `nroll` consecutive values
X = np.lib.stride_tricks.sliding_window_view(df_transformed[feature].to_numpy(), nroll)
print(X)

In [None]:
## Function for component-plotting a vector (that is, a time series sample)
def plotveccomp(x, xlab, ylab, title, axis = None):
  """Plot the components of a vector against their position, with mean and std guides.

  Draws `x` as a black line over the component indices 0..len(x)-1, a dashed blue
  line at the mean of `x`, and dashed red lines at mean - std and mean + std.

  Parameters
  ----------
  x : sequence of numbers
      The vector to plot; must support len(), np.mean() and np.std().
  xlab, ylab, title : str
      Text for the x-axis label, y-axis label and axes title.
  axis : matplotlib Axes, optional
      Axes to draw on. When omitted, the current axes (plt.gca()) are used.
  """
  # The default axis=None used to be dereferenced directly and raised AttributeError
  ax = plt.gca() if axis is None else axis
  n_components = len(x)
  component_index = range(0, n_components)
  # Mean and standard deviation are computed once and reused for the three guide lines
  centre = np.mean(x)
  spread = np.std(x)
  ax.plot(component_index, x, color = 'black', marker = '')
  ax.plot(component_index, [centre]*n_components, linewidth = 1, linestyle = 'dashed', color ='blue')
  ax.plot(component_index, [centre - spread]*n_components, linewidth = 1, linestyle = 'dashed', color ='red')
  ax.plot(component_index, [centre + spread]*n_components, linewidth = 1, linestyle = 'dashed', color ='red')
  # NOTE(review): the x values plotted here are integer component indices, not dates;
  # confirm that a date-based HourLocator is the intended minor locator.
  ax.xaxis.set_minor_locator(dates.HourLocator(interval=4))
  #ax.xaxis.set_minor_formatter(dates.DateFormatter('%H:%M'))  # hours and minutes
  ax.set_xlabel(xlab)
  ax.set_ylabel(ylab)
  ax.set_title(title)