In [27]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.metrics import f1_score, recall_score, precision_score

from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings('ignore')

import scipy.io

Dataset: http://odds.cs.stonybrook.edu/letter-recognition-dataset/

In [2]:
def import_data(filename='data/letter.mat'):
    """Return X and y data from a MATLAB file.
    Given a MATLAB file, reads it once and returns the data
    stored in variables 'X' and 'y' in Pandas DataFrame format.
    
    Args:
        filename(str): Path of the .mat file to read

    Returns:
        [x_col, y_col]: Two-element list where
            x_col(object) is a Pandas DataFrame containing
            the values in variable 'X' and y_col(object) is a
            Pandas DataFrame containing the values in variable 'y'

    Raises:
        KeyError: If the file has no 'X' or no 'y' variable
    """
    # Parse the file a single time and reuse the result for both variables.
    mat_contents = scipy.io.loadmat(filename)
    
    x_col = pd.DataFrame(mat_contents['X'])
    y_col = pd.DataFrame(mat_contents['y'])
    
    return [x_col, y_col]


def normalize_data(data):
    """Standardize the given data with StandardScaler.
    Fits a fresh StandardScaler on the data and returns the
    transformed values. Any exception raised while doing so is
    printed instead of propagated.
    
    Args:
        data(object): Pandas DataFrame containing numerical
            data points to be normalized.

    Returns:
        norm_data(object): Result of StandardScaler.fit_transform,
            or None when an exception was caught
    """
    norm_data = None
    try:
        scaler = StandardScaler()
        norm_data = scaler.fit_transform(data)
    except Exception as err:
        # Best-effort: report the problem and fall through to return None.
        print(err)

    return norm_data


In [32]:
def get_f1_score(y_true, y_pred):
    """Compute the F1-score of an outlier classification.
    
    Args:
        y_true(list): Ground-truth labels
        y_pred(list): Labels predicted by the model
        
    Returns:
        float: F1-score as computed by sklearn's f1_score,
            rounded to two decimal places
    
    """
    raw_score = f1_score(y_true, y_pred)
    return round(raw_score, 2)
    
def transform_predictions_dbscan(labels, y_col):
    """Provide labels in sklearn format.
    Transforms the cluster labels produced by DBSCAN and the
    ground-truth labels into two flat lists so that they can
    be used with sklearn to calculate metrics.
    
    A cluster label of -1 is mapped to 1 (outlier); every other
    cluster label is mapped to 0 (inlier).
    
    Args:
        labels(iterable): Cluster labels predicted by the
            model, one per sample
        y_col(object): Ground-truth labels from the dataset;
            a Pandas DataFrame or any array-like (list,
            NumPy array). It is flattened to one dimension.
    
    Returns:
        [list, list]: Two lists, one with the original
            labels for the dataset (y_true), and one with the 
            predicted 0/1 labels (y_pred)
    """
    y_pred = [1 if label == -1 else 0 for label in labels]
    # np.asarray accepts DataFrames, arrays and plain lists alike, so the
    # ground truth does not have to be a DataFrame.
    y_true = list(np.asarray(y_col).ravel())
    
    return [y_true, y_pred]


def transform_predictions_gmm(labels, y_col, threshold):
    """Provide labels in sklearn format.
    Transforms the per-sample scores produced by the model and
    the ground-truth labels into two flat lists so that they
    can be used with sklearn to calculate metrics.
    
    A sample whose score is strictly below the threshold is
    mapped to 1 (outlier); every other sample is mapped to 0.
    
    Args:
        labels(iterable): Per-sample scores produced by the
            model (despite the name, these are compared
            against the threshold, not used as class labels)
        y_col(object): Ground-truth labels from the dataset;
            a Pandas DataFrame or any array-like (list,
            NumPy array). It is flattened to one dimension.
        threshold(float): Cut-off; scores strictly below it
            are flagged as outliers
    
    Returns:
        [list, list]: Two lists, one with the original
            labels for the dataset (y_true), and one with the 
            predicted 0/1 labels (y_pred)
    """
    y_pred = [1 if score < threshold else 0 for score in labels]
    # np.asarray accepts DataFrames, arrays and plain lists alike, so the
    # ground truth does not have to be a DataFrame.
    y_true = list(np.asarray(y_col).ravel())
    
    return [y_true, y_pred]


def create_dbscan_model(data, eps, min_samples):
    """Fit an sklearn DBSCAN model and return its cluster labels.
    
    Args:
        data(object): Samples to cluster
        eps(float): Value passed to DBSCAN as eps
        min_samples(int): Value passed to DBSCAN as min_samples
    
    Returns:
        object: The labels_ attribute of the fitted model, or
            None when an exception was caught (the exception
            is printed)
    """
    try:
        clusterer = DBSCAN(eps=eps, min_samples=min_samples)
        return clusterer.fit(data).labels_
    except Exception as err:
        # Best-effort: report the problem; the function then returns None.
        print(err)
    return None
        
def create_gmm_model(data, n_components, cv_type, random_state=None):
    """Fit an sklearn Gaussian Mixture model and score the samples.
    
    Args:
        data(object): Samples used both to fit the mixture and
            to compute the returned scores
        n_components(int): Number of mixture components
        cv_type(str): Value passed to GaussianMixture as
            covariance_type (e.g. 'spherical')
        random_state(int or None): Optional seed passed to
            GaussianMixture so repeated runs give the same
            result; None keeps the library default
    
    Returns:
        object: Output of score_samples for every row of data,
            or None when an exception was caught (the
            exception is printed)
    """
    try:
        from sklearn.mixture import GaussianMixture
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type,
                              random_state=random_state)
        gmm.fit(data)
        # Score the data that was passed in, not a notebook-level global,
        # so the function works for any input.
        return gmm.score_samples(data)
    except Exception as e:
        # Best-effort: report the problem; the function then returns None.
        print(e)
    return None
        

def create_knn_model(data, contamination, n_neighbors):
    """Fit a PyOD KNN outlier detector and return its labels.
    
    Args:
        data(object): Samples to fit the detector on
        contamination(float): Value passed to KNN as
            contamination (expected proportion of outliers)
        n_neighbors(int): Value passed to KNN as n_neighbors
    
    Returns:
        object: The labels_ attribute of the fitted detector,
            or None when an exception was caught (the
            exception is printed)
    """
    try:
        from pyod.models.knn import KNN
        knn = KNN(contamination=contamination, n_neighbors=n_neighbors)
        # Fit on the data that was passed in, not a notebook-level global.
        knn.fit(data)
        return knn.labels_
    except Exception as e:
        # Best-effort: report the problem; the function then returns None.
        print(e)
    return None

In [28]:
# Load the features (x_col) and reference labels (y_col) from file
x_col, y_col = import_data(filename='data/letter.mat')
# Standardize all feature columns
x_col = normalize_data(x_col)
# Cluster the samples with DBSCAN; noise points get the label -1
labels = create_dbscan_model(x_col, eps=3, min_samples=4)
# Turn cluster labels and reference labels into flat 0/1 lists
y_true, y_pred = transform_predictions_dbscan(labels, y_col)
score = get_f1_score(y_true, y_pred)
# average=None returns one value per class; index 1 is the class labelled 1
precision = precision_score(y_true, y_pred, average=None)[1]
recall = recall_score(y_true, y_pred, average=None)[1]
print("F1-score %8.5f\nRecall %8.5f\nPrecision %8.5f" %(score, recall, precision))

F1-score  0.33000
Recall  0.91000
Precision  0.20088


In [30]:
# Load the features (x_col) and reference labels (y_col) from file
x_col, y_col = import_data(filename='data/letter.mat')
# Standardize all feature columns
x_col = normalize_data(x_col)
# Fit a Gaussian Mixture model; `labels` holds its per-sample scores
labels = create_gmm_model(x_col, n_components=26, cv_type='spherical')
# Samples scoring below the threshold are flagged as outliers (1)
y_true, y_pred = transform_predictions_gmm(labels, y_col, threshold=-40)
score = get_f1_score(y_true, y_pred)
# average=None returns one value per class; index 1 is the class labelled 1
precision = precision_score(y_true, y_pred, average=None)[1]
recall = recall_score(y_true, y_pred, average=None)[1]
print("F1-score %8.5f\nRecall %8.5f\nPrecision %8.5f" %(score, recall, precision))

F1-score  0.32000
Recall  0.65000
Precision  0.21036


In [51]:
# Import data from file
x_col, y_col = import_data(filename='data/letter.mat')
# Standardize all columns in the data
x_col = normalize_data(x_col)
# Build the reference labels from this cell's own y_col instead of relying
# on a y_true variable left over from a previously executed cell
y_true = list(y_col.to_numpy().ravel())
# Guessing 10% are outliers performs better
y_pred = create_knn_model(x_col, contamination=0.10, n_neighbors=5)
score = get_f1_score(y_true, y_pred)
# average=None returns one value per class; index 1 is the class labelled 1
recall = recall_score(y_true, y_pred, average=None)[1]
precision = precision_score(y_true, y_pred, average=None)[1]
print("F1-score %8.5f\nRecall %8.5f\nPrecision %8.5f" %(score, recall, precision))

F1-score  0.40000
Recall  0.52000
Precision  0.32500
