# Anomaly Detection Using Gaussian Distribution

In [4]:
####################################################################################################
####### The original method is a neat method which seems to work fine for small data sets ##########
#######   but fails for large or real world data sets where there are lotsa variables. Hence  ######
#######   I have modified it.  So what I do is to select only the Numeric features in the data set
#######   and feed the Features to this. However you have to make sure you "normalize" the features.
####################################################################################################

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import genfromtxt
from scipy.stats import multivariate_normal
from sklearn.metrics import f1_score
import warnings 
warnings.filterwarnings("ignore")

In [6]:
########################## Enter the input Files and the Target Variable here ##################
sep = ','   ##### Enter the type of Separator in your CSV file. Could be Comma, Tab, etc.
target = '__________'  #### Enter the name of the Target Variable here
tr_file = '___________________.csv'   ###### Enter the name of the Entire Data Set here. The ###
######## Program will automatically split it into tr_data, cv_data and gt_data files. ##########

In [7]:
plt.style.use('ggplot')
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 11
plt.rcParams['axes.labelweight'] = 'bold'
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9
plt.rcParams['legend.fontsize'] = 11
plt.rcParams['figure.titlesize'] = 13

In [36]:
def read_dataset(filePath,delimiter=','):
    return genfromtxt(filePath, delimiter=delimiter)

def feature_normalize(dataset):
    mu = np.mean(dataset,axis=0)
    sigma = np.std(dataset,axis=0)
    return (dataset - mu)/sigma

def estimateGaussian(dataset):
    mu = np.mean(dataset, axis=0)
    sigma = np.cov(dataset.T)
    return mu, sigma
    
def multivariateGaussian(dataset,mu,sigma):
    p = multivariate_normal(mean=mu, cov=sigma)
    return p.pdf(dataset)

def select_num_columns(dataset):
    cols_list = []
    for col in dataset.columns:
        if dataset[col].dtype=='O' or len(dataset[col].value_counts())<=2:
            print('   Dropping Column: %s' %col)
            continue
        else:
            cols_list.append(col)
    return dataset[cols_list], cols_list

def selectThresholdByCV(probs,gt):
    best_epsilon = 0
    best_f1 = 0
    f = 0
    stepsize = (max(probs) - min(probs)) / 1000;
    epsilons = np.arange(min(probs),max(probs),stepsize)
    for epsilon in np.nditer(epsilons):
        predictions = (probs < epsilon) 
        f = f1_score(gt, predictions,average='binary')
        if f > best_f1:
            best_f1 = f
            best_epsilon = epsilon
    
    return best_f1, best_epsilon

In [40]:
import copy
import pdb
def load_data(trainfile, sep, target, cvfile=None,gtfile=None):
    codes_list = ['utf-8','iso-8859-1','cp1252','latin1']
    for codex in codes_list:
        try:
            tr_data = pd.read_csv(trainfile,sep=sep,encoding=codex,index_col=None)
        except:
            continue
    nrows = int(tr_data.shape[0]/3)
    print('Number of datapoints in training set: %d' % (tr_data.shape[0]))
    print('Number of dimensions/features: %d' % (tr_data.shape[1]))
    nrows2 = nrows*2
    print('  Selecting only those dimensions/features that are numeric...')
    for g, df in tr_data.groupby(np.arange(len(tr_data)) // nrows):  
        if g == 0:
            tr_data = copy.deepcopy(df)
            #pdb.set_trace()
            if tr_data[target].dtype=='O':
                tr_data[target] = tr_data[target].apply(lambda x: 1 if x=='Y' else 0)
            zero_mask = tr_data[target]==0
            tr_data = tr_data[zero_mask]
            preds = [x for x in tr_data.columns if x not in [target]]
            tr_data = tr_data[preds]
            tr_data, cols = select_num_columns(tr_data)
            tr_data = feature_normalize(tr_data)
        elif g == 1:
            cv_data = copy.deepcopy(df)
            if cv_data[target].dtype=='O':
                cv_data[target] = cv_data[target].apply(lambda x: 1 if x=='Y' else 0)
            zero_mask = cv_data[target]==0
            preds = [x for x in cv_data.columns if x not in [target]]
            gt_data = cv_data[target]
            cv_data = cv_data[preds]
            cv_data = cv_data[cols]
            cv_data = feature_normalize(cv_data)
        elif g == 2:
            test_data = copy.deepcopy(df)
    print('\nTraining Data', tr_data.shape)
    print('\nCV Data', cv_data.shape)
    print('\nGlobal Truths Data', gt_data.shape)

    return tr_data, cv_data, gt_data, test_data  

tr_data, cv_data, gt_data, test_data = load_data(tr_file,sep,target)

In [41]:
mu, sigma = estimateGaussian(np.asarray(tr_data))
tr_data = np.asarray(tr_data)
tr_data[:5],tr_data.shape

In [42]:
tr_data.head(3).T

In [31]:
p = multivariateGaussian(np.asarray(tr_data),mu,sigma)
p[:5]

array([  1.27908290e-16,   3.82082861e-15,   7.66063381e-16,
         8.11703772e-16,   1.55512098e-15])

In [32]:
#selecting optimal value of epsilon using cross validation
p_cv = multivariateGaussian(cv_data,mu,sigma)
fscore, ep = selectThresholdByCV(p_cv,gt_data)
print(fscore, 'epsilon=%0.10f' %ep)

(0.0045001125028125699, 'epsilon=0.0000000000')


In [33]:
#selecting outlier datapoints 
outliers = np.asarray(np.where(p < ep))
outliers.shape

(1L, 12658L)

In [34]:
outliers

array([[    0,     1,     2, ..., 34541, 34545, 34552]], dtype=int64)