# Anomaly Detection Notebook: Chapter 1
## Date Started:   8 August 2022
## Latest Update: 13 June 2024

## 0. Define Metadata / Main Variables

In [1]:
# Define the name of the target class column here instead of manually typing it out everywhere
target_class_name = 6

# Fill in the names of what you want to call the 0 and 1 class
labels = ['inliers', 'outliers']

# Add directory as a string
rawdataDirectory = "../01-Data/Raw/"
dataDirectory = "../01-Data/"
dataFile = "thyroid.mat"

# Any exported artifacts will have this date
export_date = '202406'

### Key Variables
#### thyroidX - Individual test results
#### thyroidY - Final diagnostic result - Presence or absence of cancer 

## 1. Download data to directory - Done

## 2. Import relevant packages

In [2]:
import sklearn 
import scipy
from scipy import io
import numpy as np

import matplotlib 
%matplotlib inline

import pandas as pd
import dask

from scipy.io.arff import loadarff
import scipy.io as sio

from collections import Counter
from sklearn.preprocessing import MinMaxScaler

from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

from sklearn.metrics import accuracy_score, classification_report,confusion_matrix 
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, average_precision_score


## 3. Load Dataset

In [3]:
dataFile = rawdataDirectory + dataFile
print (dataFile)

../01-Data/Raw/thyroid.mat


In [4]:
# Load to dictionary
thyroidD = scipy.io.loadmat(dataFile)

## 4. Extract X and Y and Concatenate

### 4.1 Explore Dataset as Dictionary

In [5]:
# No of key-value pairs
len(thyroidD)

5

In [6]:
# "Iterating over a dictionary produces its' keys"
# The numeric data are stored in the values "X" and "y"
# Note: The value for X is a list of lists for diagnostic information
#       The value for y is a list of lists (length 1) giving the final diagnosis
[i for i in thyroidD]

['__header__', '__version__', '__globals__', 'X', 'y']

In [7]:
# Loop through both keys and values, by using the items() function:
# for i, j in thyroidD.items():
#  print(i, j)
thyroidD

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-05 13:11:25 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[7.74193548e-01, 1.13207547e-03, 1.37571157e-01, 2.75700935e-01,
         2.95774648e-01, 2.36065574e-01],
        [2.47311828e-01, 4.71698113e-04, 2.79886148e-01, 3.29439252e-01,
         5.35211268e-01, 1.73770492e-01],
        [4.94623656e-01, 3.58490566e-03, 2.22960152e-01, 2.33644860e-01,
         5.25821596e-01, 1.24590164e-01],
        ...,
        [9.35483871e-01, 2.45283019e-02, 1.60341556e-01, 2.82710280e-01,
         3.75586854e-01, 2.00000000e-01],
        [6.77419355e-01, 1.47169811e-03, 1.90702087e-01, 2.42990654e-01,
         3.23943662e-01, 1.95081967e-01],
        [4.83870968e-01, 3.56603774e-03, 1.90702087e-01, 2.12616822e-01,
         3.38028169e-01, 1.63934426e-01]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]])}

In [8]:
thyroidHdr = thyroidD["__header__"]
thyroidVrsn = thyroidD["__version__"]
thyroidGlobals = thyroidD["__globals__"]

print("thyroidHdr = ", thyroidHdr,"\n",
      "thyroidVrsn = ", thyroidVrsn,"\n",
      "thyroidGlobals = ", thyroidGlobals,"\n")

thyroidHdr =  b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-05 13:11:25 UTC' 
 thyroidVrsn =  1.0 
 thyroidGlobals =  [] 



In [9]:
items = list(thyroidD.items())
for key, value in items:
    print(f"Key: {key}, Value: {value}")

Key: __header__, Value: b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-05 13:11:25 UTC'
Key: __version__, Value: 1.0
Key: __globals__, Value: []
Key: X, Value: [[7.74193548e-01 1.13207547e-03 1.37571157e-01 2.75700935e-01
  2.95774648e-01 2.36065574e-01]
 [2.47311828e-01 4.71698113e-04 2.79886148e-01 3.29439252e-01
  5.35211268e-01 1.73770492e-01]
 [4.94623656e-01 3.58490566e-03 2.22960152e-01 2.33644860e-01
  5.25821596e-01 1.24590164e-01]
 ...
 [9.35483871e-01 2.45283019e-02 1.60341556e-01 2.82710280e-01
  3.75586854e-01 2.00000000e-01]
 [6.77419355e-01 1.47169811e-03 1.90702087e-01 2.42990654e-01
  3.23943662e-01 1.95081967e-01]
 [4.83870968e-01 3.56603774e-03 1.90702087e-01 2.12616822e-01
  3.38028169e-01 1.63934426e-01]]
Key: y, Value: [[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


### 4.2 Extract X and y

In [10]:
thyroid_X = pd.DataFrame(thyroidD["X"])
print(type(thyroid_X))
thyroid_X

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,0,1,2,3,4,5
0,0.774194,0.001132,0.137571,0.275701,0.295775,0.236066
1,0.247312,0.000472,0.279886,0.329439,0.535211,0.173770
2,0.494624,0.003585,0.222960,0.233645,0.525822,0.124590
3,0.677419,0.001698,0.156546,0.175234,0.333333,0.136066
4,0.236559,0.000472,0.241935,0.320093,0.333333,0.247541
...,...,...,...,...,...,...
3767,0.817204,0.000113,0.190702,0.287383,0.413146,0.188525
3768,0.430108,0.002453,0.232448,0.287383,0.446009,0.175410
3769,0.935484,0.024528,0.160342,0.282710,0.375587,0.200000
3770,0.677419,0.001472,0.190702,0.242991,0.323944,0.195082


In [11]:
pd.DataFrame.info(thyroid_X)
pd.DataFrame.describe(thyroid_X)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       3772 non-null   float64
 1   1       3772 non-null   float64
 2   2       3772 non-null   float64
 3   3       3772 non-null   float64
 4   4       3772 non-null   float64
 5   5       3772 non-null   float64
dtypes: float64(6)
memory usage: 176.9 KB


Unnamed: 0,0,1,2,3,4,5
count,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0
mean,0.543121,0.008983,0.186826,0.248332,0.376941,0.177301
std,0.20379,0.043978,0.070405,0.080579,0.087382,0.054907
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.376344,0.001132,0.156546,0.203271,0.328638,0.14918
50%,0.569892,0.003019,0.190702,0.241822,0.375587,0.17377
75%,0.709677,0.004528,0.213472,0.28271,0.413146,0.196721
max,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
thyroid_y = pd.DataFrame(thyroidD["y"])
thyroid_y

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
3767,0.0
3768,0.0
3769,0.0
3770,0.0


In [13]:
pd.DataFrame.info(thyroid_y)
pd.DataFrame.describe(thyroid_y)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       3772 non-null   float64
dtypes: float64(1)
memory usage: 29.6 KB


Unnamed: 0,0
count,3772.0
mean,0.024655
std,0.155093
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


### 4.3 Concatenate the DataFrames and Inspect

In [14]:
# Concatenate with the diagnosis at the end of the dataframe
thyroidDF = pd.concat([thyroid_X, thyroid_y], axis = 1)
pd.DataFrame.info(thyroidDF)
thyroidDF

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       3772 non-null   float64
 1   1       3772 non-null   float64
 2   2       3772 non-null   float64
 3   3       3772 non-null   float64
 4   4       3772 non-null   float64
 5   5       3772 non-null   float64
 6   0       3772 non-null   float64
dtypes: float64(7)
memory usage: 206.4 KB


Unnamed: 0,0,1,2,3,4,5,0.1
0,0.774194,0.001132,0.137571,0.275701,0.295775,0.236066,0.0
1,0.247312,0.000472,0.279886,0.329439,0.535211,0.173770,0.0
2,0.494624,0.003585,0.222960,0.233645,0.525822,0.124590,0.0
3,0.677419,0.001698,0.156546,0.175234,0.333333,0.136066,0.0
4,0.236559,0.000472,0.241935,0.320093,0.333333,0.247541,0.0
...,...,...,...,...,...,...,...
3767,0.817204,0.000113,0.190702,0.287383,0.413146,0.188525,0.0
3768,0.430108,0.002453,0.232448,0.287383,0.446009,0.175410,0.0
3769,0.935484,0.024528,0.160342,0.282710,0.375587,0.200000,0.0
3770,0.677419,0.001472,0.190702,0.242991,0.323944,0.195082,0.0


#### Quick overview: No columns appear to need cleaning. Now inspect rows

In [15]:
# Random sample of 5 records
thyroidDF.sample(5)

Unnamed: 0,0,1,2,3,4,5,0.1
1584,0.795699,0.018868,0.147059,0.205607,0.328638,0.162295,0.0
1832,0.731183,0.002264,0.213472,0.252336,0.43662,0.157377,0.0
2801,0.827957,0.002075,0.166034,0.224299,0.446009,0.136066,0.0
1927,0.494624,0.001283,0.166034,0.238318,0.328638,0.188525,0.0
1412,0.72043,0.001792,0.185009,0.25,0.450704,0.15082,0.0


Lets check the head & tail to make sure there is nothing going on at the last row or the header

In [16]:
thyroidDF.head(3)

Unnamed: 0,0,1,2,3,4,5,0.1
0,0.774194,0.001132,0.137571,0.275701,0.295775,0.236066,0.0
1,0.247312,0.000472,0.279886,0.329439,0.535211,0.17377,0.0
2,0.494624,0.003585,0.22296,0.233645,0.525822,0.12459,0.0


In [17]:
thyroidDF.tail(3)

Unnamed: 0,0,1,2,3,4,5,0.1
3769,0.935484,0.024528,0.160342,0.28271,0.375587,0.2,0.0
3770,0.677419,0.001472,0.190702,0.242991,0.323944,0.195082,0.0
3771,0.483871,0.003566,0.190702,0.212617,0.338028,0.163934,0.0


### Data Insights

In [18]:
thyroidDF.describe()
# Looks like all the numbers are between 0 and 1

Unnamed: 0,0,1,2,3,4,5,0.1
count,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0
mean,0.543121,0.008983,0.186826,0.248332,0.376941,0.177301,0.024655
std,0.20379,0.043978,0.070405,0.080579,0.087382,0.054907,0.155093
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.376344,0.001132,0.156546,0.203271,0.328638,0.14918,0.0
50%,0.569892,0.003019,0.190702,0.241822,0.375587,0.17377,0.0
75%,0.709677,0.004528,0.213472,0.28271,0.413146,0.196721,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Unique Value Checking

In [21]:
for col in thyroidDF.columns:
    print(col, len(thyroidDF[col].unique()))

AttributeError: 'DataFrame' object has no attribute 'unique'

#### All of the columns have a relatively small number of duplicate values. Except the last one. As the final diagnosis this should always be 0 or 1 

### 4.4 Export DataFrame as csv

In [20]:
dataDirectory = "../01-Data/"
csvFile = "thyroid.csv"
csvdataFile = dataDirectory + csvFile
thyroidDF.to_csv(csvdataFile)