In [None]:
# Mounting
# Attach Google Drive to the Colab runtime at /content/drive; the dataset path
# used later in the notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Import Pip Library for Anomaly Detection
!pip install outlier_utils

In [None]:
# Import Other Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import data
# NOTE: absolute path inside the mounted Google Drive, so the Drive mount cell
# must have been run before this cell.
file_path = '/content/drive/MyDrive/INFO-614 Data Mining/HW5/server.csv'
df = pd.read_csv(file_path)

# Display Head of the Imported Data
df.head()

In [None]:
# Graphical Approach
# Distribution plot of the latency column with a bin width of 0.025, used to
# look for values that sit apart from the bulk of the data.
import seaborn as sns
sns.displot(df, x="latency", binwidth=0.025)

# Grubbs' Test

In [None]:
# Pre-processing for Grubbs' test: min-max scale latency and extract it as a vector.
from sklearn.preprocessing import MinMaxScaler
data = df[['latency']]

# Scaling
scaler = MinMaxScaler()
df_scale = pd.DataFrame(scaler.fit_transform(data), columns = data.columns)
df_scale['latency'] = df_scale['latency'].astype('float')

# Take the single column as a flat (n,) vector rather than an (n, 1) matrix, so that
# the max / argmax / division inside grubbs_stat yield scalars, not 1-element arrays.
data = df_scale['latency'].to_numpy()
data

In [None]:
# Function for Grubbs' Test
import scipy.stats as stats

def grubbs_stat(y):
    """Compute the two-sided Grubbs' test statistic for a sample.

    G = max_i |y_i - mean(y)| / s, where s is the *sample* standard deviation
    (ddof=1), as the definition of Grubbs' test requires.

    Parameters
    ----------
    y : array-like
        Observations. The values are flattened first, so an (n, 1) column
        array is treated the same as an (n,) vector.

    Returns
    -------
    Gcal : float
        The Grubbs statistic.
    max_ind : int
        Position (in the flattened input) of the observation farthest from
        the mean, i.e. the outlier candidate.

    Raises
    ------
    ValueError
        If fewer than 3 observations are supplied; the matching critical
        value uses a t distribution with n - 2 degrees of freedom.
    """
    values = np.asarray(y, dtype=float).ravel()
    if values.size < 3:
        raise ValueError("Grubbs' test requires at least 3 observations")

    # ddof=1: sample standard deviation, not the population one np.std defaults to.
    std_dev = np.std(values, ddof=1)
    abs_val_minus_avg = np.abs(values - np.mean(values))
    max_ind = int(np.argmax(abs_val_minus_avg))
    # Index with max_ind instead of built-in max() so the result is always a scalar.
    Gcal = float(abs_val_minus_avg[max_ind] / std_dev)
    print("Grubbs Statistics Value : {}".format(Gcal))
    return Gcal, max_ind

def calculate_critical_value(size, alpha):
    """Return the Grubbs critical value for `size` observations at level `alpha`.

    Takes the 1 - alpha / (2 * size) quantile of Student's t distribution with
    size - 2 degrees of freedom and evaluates

        (size - 1) * |t| / (sqrt(size) * sqrt(size - 2 + t**2))

    The value is printed and then returned.
    """
    t_crit = stats.t.ppf(1 - alpha / (2 * size), size - 2)
    t_squared = np.square(t_crit)
    # sqrt(t**2) is |t|; kept in this form so the arithmetic is unchanged.
    critical_value = ((size - 1) * np.sqrt(t_squared)) / (
        np.sqrt(size) * np.sqrt(size - 2 + t_squared)
    )
    print("Grubbs Critical Value: {}".format(critical_value))
    return critical_value

In [None]:
# Result: complete Grubbs' test by comparing the statistic with the critical value.
alpha = 0.05  # significance level (conventional choice; adjust if the assignment specifies another)
Gcal, max_ind = grubbs_stat(data)
# grubbs_stat may hand back a 1-element array when data is an (n, 1) column; unwrap it.
Gcal = float(np.ravel(Gcal)[0])
Gcrit = calculate_critical_value(len(data), alpha)
if Gcal > Gcrit:
    print("Reject H0: the observation at index {} is an outlier".format(max_ind))
else:
    print("Fail to reject H0: no outlier detected")
Gcal, max_ind

# KNN Anomaly Detection Approach 

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Two features used by the KNN-based detector
data = df[['latency','throughput']]

# Scaling: map both columns through MinMaxScaler so distances weigh them comparably
scaler = MinMaxScaler()
df_scale = pd.DataFrame(scaler.fit_transform(data), columns = data.columns)
# Scatter of the scaled points, then the same values as a plain array for the model
plt.scatter(x = df_scale["latency"], y = df_scale["throughput"])
X = df_scale.to_numpy()

In [None]:
from sklearn.neighbors import NearestNeighbors
# Build and fit a 3-nearest-neighbour index over the scaled points
# (fit returns the estimator itself, so construction and fitting can be chained).
nbrs = NearestNeighbors(n_neighbors = 3).fit(X)
# Distances to, and row indices of, the k nearest neighbours of every observation.
# NOTE(review): X is the same data the model was fitted on, so each row's closest
# neighbour is presumably the row itself at distance 0 - confirm. The cutoff picked
# in the next cell was chosen from these values, so they are left unchanged here.
distances, indexes = nbrs.kneighbors(X)
# Mean k-distance of each observation, plotted against row position
plt.plot(np.mean(distances, axis = 1))

In [None]:
# Cutoff of 0.09 was read off the mean k-distance plot; keep the positions of
# the observations whose mean k-distance lies above it.
outlier_index = np.nonzero(np.mean(distances, axis = 1) > 0.09)
outlier_index

In [None]:
# filter outlier values: positional row lookup in the scaled frame using the
# indices flagged by the cutoff
outlier_values = df_scale.iloc[outlier_index]
outlier_values

In [None]:
# plot data: every scaled observation in blue with marker size 65
plt.scatter(df_scale["latency"], df_scale["throughput"], color = "b", s = 65)
# plot outlier values: the flagged observations drawn afterwards in red
plt.scatter(outlier_values["latency"], outlier_values["throughput"], color = "r")

# DBSCAN for Anomaly Detection

In [None]:
# Scaling and Visualization
# Min-max scale latency and throughput for DBSCAN and show the scaled points.
# MinMaxScaler is not imported in this cell; it must already be in scope from an
# earlier cell.
data = df[['latency','throughput']]
scaler = MinMaxScaler()
scaler.fit(data)
df_scale = pd.DataFrame(scaler.transform(data), columns = data.columns)
plt.scatter(x = df_scale["latency"], y = df_scale["throughput"])
X = df_scale.to_numpy()

In [None]:
# Model Result
from sklearn.cluster import DBSCAN

# Configure DBSCAN (eps = 0.08, min_samples = 3), then fit it on the scaled frame
model = DBSCAN(eps = 0.08, min_samples = 3)
model.fit(df_scale)

# Colour each observation by the cluster label DBSCAN assigned to it
colors = model.labels_
plt.scatter(x = df_scale["latency"], y = df_scale["throughput"], c = colors)