# Anomaly Detection Notebook
### Date Started:   11 June 2024
### Latest Update:  10 July 2024

## Chapter 2: One-Class SVM

## 1. Split Dataframe into Train and Test 

In [48]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM
from sklearn.metrics import precision_score, recall_score, f1_score,  average_precision_score, confusion_matrix

import scipy
import matplotlib
import matplotlib.pyplot as plt
import seaborn

In [49]:
# Inserted at the top of every notebook to point to the target column
target_class_name = "6"
# And the names to call the 0 and 1 class
labels = ['inliers', 'outliers']

### Import Cleaned File

In [50]:
# Import
data_directory = "../01-Data/Processed/"
file_name = "thyroid.csv"      # Cleaned data exported from Week 1 notebook 
dfThyroid = pd.read_csv(data_directory  + file_name)

### Check, Split Train / Test and Extract Features and Targets

In [51]:
# Note: Previously this had an extra column at the front because the index column was not dropped on export to CSV  
pd.DataFrame.info(dfThyroid)
dfThyroid

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       3772 non-null   float64
 1   1       3772 non-null   float64
 2   2       3772 non-null   float64
 3   3       3772 non-null   float64
 4   4       3772 non-null   float64
 5   5       3772 non-null   float64
 6   6       3772 non-null   float64
dtypes: float64(7)
memory usage: 206.4 KB


Unnamed: 0,0,1,2,3,4,5,6
0,0.774194,0.001132,0.137571,0.275701,0.295775,0.236066,0.0
1,0.247312,0.000472,0.279886,0.329439,0.535211,0.173770,0.0
2,0.494624,0.003585,0.222960,0.233645,0.525822,0.124590,0.0
3,0.677419,0.001698,0.156546,0.175234,0.333333,0.136066,0.0
4,0.236559,0.000472,0.241935,0.320093,0.333333,0.247541,0.0
...,...,...,...,...,...,...,...
3767,0.817204,0.000113,0.190702,0.287383,0.413146,0.188525,0.0
3768,0.430108,0.002453,0.232448,0.287383,0.446009,0.175410,0.0
3769,0.935484,0.024528,0.160342,0.282710,0.375587,0.200000,0.0
3770,0.677419,0.001472,0.190702,0.242991,0.323944,0.195082,0.0


In [52]:
# Convert 0 and 1 in the target column as float to integer
# Below line when column name assigned an integer 0 and not a char 
# dfThyroid.iloc[:, target_class_name] = dfThyroid.iloc[:, target_class_name].astype(int)  # pandas column is an integer 
# This line specifies a pandas column with char value
dfThyroid[target_class_name] = dfThyroid[target_class_name].astype(int)

In [53]:
# concatenate X_train with y_train. Then filter out the samples that we know are outliers
df_train = pd.concat([X_train, y_train], axis=1)
inlier_X_train = df_train[df_train[target_class_name]==0].drop(target_class_name, axis=1)

# We will use inlier_X_train as the input dataset
inlier_X_train.shape
# That makes sense to have 3017 - 74= 2943 samplespd.DataFrame.info(dfThyroid)
pd.DataFrame.describe(dfThyroid)

NameError: name 'X_train' is not defined

In [None]:
# Extracts features (X) and targets (y) 
X_dfThyroid = dfThyroid.drop(target_class_name, axis=1)
y_dfThyroid = dfThyroid[target_class_name]
# Prints feature details
print(X_dfThyroid.shape)
X_dfThyroid

In [None]:
# Prints target details
#  Note: This is now a series since it is a single column extracted from a dataframe 
print(y_dfThyroid.shape)
print(y_dfThyroid.info())
print(y_dfThyroid.describe())

### Randomly resamples train and test datasets while stratifying (keeping %) targets (y) 

In [None]:
# Randomly resamples train and test datasets while stratifying (keeping %) targets (y) 
X_train_dfThyroid, X_test_dfThyroid, y_train_dfThyroid, y_test_dfThyroid = train_test_split(X_dfThyroid, y_dfThyroid, test_size=0.2, random_state=42, stratify = y_dfThyroid)

In [None]:
# Check 
print("Training dataset features: \n")
pd.DataFrame.info(X_train_dfThyroid)
pd.DataFrame.describe(X_train_dfThyroid)

In [None]:
print("\nTesting dataset features")
pd.DataFrame.info(X_test_dfThyroid)
pd.DataFrame.describe(X_test_dfThyroid)

In [None]:
# Check 
print("\nTraining dataset targets")
print(y_train_dfThyroid.info())
print(y_train_dfThyroid.describe())

In [None]:
# Check 
print("\nTesting dataset targets")
print(y_test_dfThyroid.info())
print(y_test_dfThyroid.describe())# concatenate X_train with y_train. Then filter out the samples that we know are outliers
df_train = pd.concat([X_train, y_train], axis=1)
inlier_X_train = df_train[df_train[target_class_name]==0].drop(target_class_name, axis=1)

# We will use inlier_X_train as the input dataset
inlier_X_train.shape
# That makes sense to have 3017 - 74= 2943 samples

## 2. Quantify the number of outliers

#### On the full dataset

In [None]:
# Count the number of outliers
# outlier_count = dfThyroid[dfThyroid.iloc[:, target_class_name] == 1].shape[0]
outlier_count = dfThyroid[dfThyroid[target_class_name] == 1].shape[0]

print(f"Number of outliers in entire dataset: {outlier_count}")

#### For train and test datasets (split into series y_train and y_test)

In [None]:
# Count the number of outliers in Train and Test
# train_outlier_count = train_dfThyroid[train_dfThyroid.iloc[:, target_class_name] == 1].shape[0]
# train_outlier_count = y_train_dfThyroid[y_train_dfThyroid[target_class_name] == 1].shape[0]

# y_train_dfThyroid = y_train_dfThyroid[target_class_name].squeeze()   # This ensures that y_dfThyroid is a series 
train_outlier_count = (y_train_dfThyroid == 1).sum()
print(f"Number of outliers in training dataset: {train_outlier_count}")

# Works for dataframe and not for series
# test_outlier_count = test_dfThyroid[test_dfThyroid.iloc[:, target_class_name] == 1].shape[0]
test_outlier_count = (y_test_dfThyroid == 1).sum()
print(f"Number of outliers in test dataset: {test_outlier_count}")

## 3. Separate out samples corresponding to the inliers

#### Calculate Number of Outliers in the Entire Data Set 
##### Note: Targets are now floats rather than integers?? Check back whether this affects the calculations (checking equality of floats is problematic in Python)

In [None]:
# Separate training inliers and outliers records 
# total_inliers = dfThyroid[dfThyroid.iloc[:, target_class_name] == 0]
# total_outliers = dfThyroid[dfThyroid.iloc[:, target_class_name] == 1]

total_inliers = dfThyroid[dfThyroid[target_class_name] == 0]
total_outliers = dfThyroid[dfThyroid[target_class_name] == 1]

# Display the shape of inliers and outliers DataFrames
print(f"Number of training inliers: {total_inliers.shape[0]}")
print(f"Number of training outliers: {total_outliers.shape[0]}")

#### Calculate Propoprtion of Outliers 
##### Note: Targets are now floats rather than integers?? Check back whether this affects the calculations (checking equality of floats is problematic in Python)

In [None]:
outliers_fraction = total_outliers.shape[0] / (dfThyroid.shape[0]) 
inliers_fraction =  total_inliers.shape[0] / (dfThyroid.shape[0])

#### Calculate Percentage of Outliers and thus Baseline to Beat - Class Distributions 
##### Note: Targets are now floats rather than integers?? Check back whether this affects the calculations (checking equality of floats is problematic in Python)

In [None]:
# Display the shape of inliers and outliers DataFrames
# Note: Adding % to the format string converts a proportion to a percentage
print(f"\nInlier percentage is: {inliers_fraction:.3%}")
print(f"Outlier percentage is: {outliers_fraction:.3%}")

# NOT assigning the rounded value to the outliers_fraction - leaving as calculated
# outliers_fraction= 0.025 # Almost the calculated value

#We will use a multiple of this as the nu or contamination factor to tell the model how many to expect

In [None]:
plt.figure(figsize=(6,4))
pd.value_counts(dfThyroid[target_class_name]).plot.bar()
plt.title('Histogram of class distributions')
plt.xlabel(labels[1])
plt.xticks(rotation=0)
plt.ylabel('Frequency')
dfThyroid[target_class_name].value_counts()

## 4. Instantiate three one-class SVM 

### Prepare Training Data

In [None]:
# concatenate X_train with y_train to be able to identify the training samples that are outliers 
#   Then in the same step filter the outliers and drop the targets
train_dfThyroid = pd.concat([X_train_dfThyroid, y_train_dfThyroid], axis=1)
X_inliers = train_dfThyroid[train_dfThyroid[target_class_name]==0].drop(target_class_name, axis=1)

# We will use inlier_X_train as the input dataset
X_inliers.shape
# That makes sense to have 3017 - 74= 2943 samples

X_inliers.describe()

In [None]:
svm_rbf = OneClassSVM(kernel='rbf',nu=outliers_fraction*4)               # radial basis kernel
svm_poly2 = OneClassSVM(kernel='poly', degree=2, nu=outliers_fraction*4)
svm_poly3 = OneClassSVM(kernel='poly', degree=3, nu=outliers_fraction*4)

# Display the instantiated models
print(svm_rbf)
print(svm_poly2)
print(svm_poly3)

In [None]:
# Set up df with metrics

# Initialize the DataFrame with the required columns
metrics_df = pd.DataFrame(columns=[
    'Model', 
    'Precision', 
    'Recall', 
    'F1 Score', 
    'Average Precision', 
    'TN', 
    'TP', 
    'FN', 
    'FP'
])

# Display the initialized DataFrame
print(metrics_df)

## 5. Fit each of the three models

In [None]:
# We have already dropped the target (6) column when assigning X_inliers (target dataset)
#   As noted earlier we only train the model on the inliers

# Fit the models
svm_rbf.fit(X_inliers)
svm_poly2.fit(X_inliers)
svm_poly3.fit(X_inliers)

print("Models have been fitted successfully.")

## 6. Evaluate on the Test Features and Store in a Separate DataFrame

In [None]:
# Separate the test features and labels
#   Test performance on full (inlier+outlier) dataset
X_test = X_test_dfThyroid
y_test = y_test_dfThyroid

In [None]:
# List of models
models = [
    ('RBF', svm_rbf),
    ('Poly Degree 2', svm_poly2),
    ('Poly Degree 3', svm_poly3)
]

In [None]:

# Evaluate each model
for model_name, model in models:
    # Predict using the fitted model
    y_pred = model.predict(X_test)
    
    # Map predictions to binary values (inliers = 1, outliers = -1)
    y_pred = [1 if x == 1 else 0 for x in y_pred]

    # Calculate classification metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    avg_precision = average_precision_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Append metrics to the DataFrame

    # Create a DataFrame with the new row
    new_row = pd.DataFrame([{
        'Model': model_name,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Average Precision': avg_precision,
        'TN': tn,
        'TP': tp,
        'FN': fn,
        'FP': fp
    }])

    # Append the new row to the metrics DataFrame
    metrics_df = pd.concat([metrics_df, new_row], ignore_index=True)

# Display the metrics DataFrame
print(metrics_df)

# Copy to a separate dataframe
dfMetrics_OneClassSVM = metrics_df
print(dfMetrics_OneClassSVM)
      

## 7. Inspect Metrics and Compare Performance 

In [None]:
# Plot Precision, Recall, and F1 Score
metrics_df.set_index('Model')[['Precision', 'Recall', 'F1 Score', 'Average Precision']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Classification Metrics')
plt.ylabel('Score')
plt.xticks(rotation=0)
plt.legend(loc='best')
plt.show()

# Plot confusion matrix components (TN, TP, FN, FP)
metrics_df.set_index('Model')[['TN', 'TP', 'FN', 'FP']].plot(kind='bar', figsize=(10, 6))
plt.title('Comparison of Confusion Matrix Components')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.legend(loc='best')
plt.show()
