# Anomaly Detection

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the headerless connection log and label its columns in one step.
df = pd.read_csv(
    r"E:\MLIoT\ML\dataset\Anomaly Detection\eecs498\conn250K.csv",
    header=None,
).set_axis(['recordID', 'duration', 'src_bytes', 'dst_bytes'], axis=1)
df.shape

(256670, 4)

    - recordID: The unique identifier for each connection record.
    - duration: The length of the connection in seconds (rounded). For example, a connection lasting 0.17 s or 0.3 s is recorded as “0” in this field.
    - src_bytes: The number of data bytes transferred from the source to the destination (i.e., the number of outgoing bytes from the host).
    - dst_bytes: The number of data bytes transferred from the destination to the source (i.e., the number of bytes received by the host).

In [3]:
# Preview the first five rows to confirm the column names line up with the data.
df.head()

Unnamed: 0,recordID,duration,src_bytes,dst_bytes
0,1,0,236,1228
1,2,0,239,486
2,3,0,234,1364
3,4,0,239,1295
4,5,0,181,5450


In [4]:
# Show each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256670 entries, 0 to 256669
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   recordID   256670 non-null  int64
 1   duration   256670 non-null  int64
 2   src_bytes  256670 non-null  int64
 3   dst_bytes  256670 non-null  int64
dtypes: int64(4)
memory usage: 7.8 MB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()

Unnamed: 0,recordID,duration,src_bytes,dst_bytes
count,256670.0,256670.0,256670.0,256670.0
mean,128335.5,0.640936,474.247898,4473.533
std,74094.391134,15.62717,3391.106084,20809.03
min,1.0,0.0,0.0,0.0
25%,64168.25,0.0,217.0,660.0
50%,128335.5,0.0,249.0,1661.0
75%,192502.75,0.0,306.0,4341.0
max,256670.0,3289.0,54540.0,7068759.0


## Data Cleaning

In [6]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [7]:
# Missing-value count per column (isna is the canonical name for isnull).
df.isna().sum()

recordID     0
duration     0
src_bytes    0
dst_bytes    0
dtype: int64

In [8]:
# Keep a full copy (including recordID) in df2, then remove the identifier
# from df so it is not used as a model feature.
# The guard makes this cell safe to re-run: once recordID has been dropped,
# df2 is left untouched instead of being overwritten by an ID-less copy, and
# drop() no longer raises KeyError.
if 'recordID' in df.columns:
    df2 = df.copy()
    df = df.drop(columns=['recordID'])

## Apply Anomaly Detection

In [9]:
from sklearn.ensemble import IsolationForest

# Isolation forest: 100 trees, each built on an 80% sample of the rows, with
# 5% of the records expected to be anomalous. The seed is fixed so results
# are reproducible.
model = IsolationForest(
    n_estimators=100,
    max_samples=0.8,
    contamination=0.05,
    random_state=5,
)

In [10]:
# Fit on the three connection features only. Selecting them explicitly keeps
# the fit identical if this cell is re-run after a prediction column has been
# added to df (otherwise that column would be learned as a feature).
model.fit(df[['duration', 'src_bytes', 'dst_bytes']])

IsolationForest(contamination=0.05, max_samples=0.8, random_state=5)

In [11]:
# Score only the feature columns. This cell adds a column to df, so passing
# the whole frame to predict() would fail with a feature-count mismatch the
# second time the cell is run.
# IsolationForest.predict returns 1 for inliers and -1 for outliers.
df['is_anamoly?_'] = model.predict(df[['duration', 'src_bytes', 'dst_bytes']])
df['is_anamoly?_'].value_counts()

 1    243836
-1     12834
Name: is_anamoly?_, dtype: int64

In [16]:
# Re-encode the model output as a 0/1 flag on df2, the copy that still holds
# recordID: -1 (outlier) becomes 1, every other value becomes 0.
df2['is_anomaly?_'] = np.where(df['is_anamoly?_'].eq(-1), 1, 0)
df2['is_anomaly?_'].value_counts()

0    243836
1     12834
Name: is_anomaly?_, dtype: int64

In [17]:
# Peek at df2 to confirm the 0/1 anomaly flag column is present.
df2.head()

Unnamed: 0,recordID,duration,src_bytes,dst_bytes,is_anamoly?_,is_anomaly?_
0,1,0,236,1228,0,0
1,2,0,239,486,0,0
2,3,0,234,1364,0,0
3,4,0,239,1295,0,0
4,5,0,181,5450,0,0


In [22]:
# Reduce df2 to the two submission columns and write them to disk.
# Renaming first and selecting by the final names keeps the cell re-runnable:
# rename() ignores labels that are already gone, whereas selecting
# 'recordID' after a previous run would raise KeyError. Building a new frame
# also avoids an in-place rename on a derived slice.
df2 = df2.rename(columns={'recordID': 'ID', 'is_anomaly?_': 'is_anomaly?'})[
    ['ID', 'is_anomaly?']
]
df2.to_csv("results.csv", index=False)

In [23]:
# Final check of the two-column frame that was written to results.csv.
df2.head()

Unnamed: 0,ID,is_anomaly?
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
