## <font color = darkblue> This example shows three methods for outlier detection using multile dimensions (features)
    - It uses Nutrition data


In [1]:
# -----------------------------------
# Importing the necessary libraries
# -----------------------------------

import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler

# Libraries related to outlier detection
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope


import seaborn as sns
import warnings
from datetime import datetime
warnings.filterwarnings('ignore') 
sns.set(rc={'figure.figsize':(11,8)})
# pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.options.display.float_format = '{:.2f}'.format



## Importing nutrition data

In [2]:
%cd "/Volumes/LACIE SHARE/Courses/Roux /Machine Learning/Data"
nut = pd.read_csv("Final_Nutrition.csv")


# ---------------------------------------------------------------------------------
# Keeping first 20 features for demonstration purposes
# ---------------------------------------------------------------------------------

nut = nut.iloc[:,range(20)]

print(nut.shape)

nut.head()

[Errno 2] No such file or directory: '/Volumes/LACIE SHARE/Courses/Roux /Machine Learning/Data'
/Users/ZongyuWu/PycharmProjects/CS6140
(8789, 20)


Unnamed: 0,NDB_No,Shrt_Desc,Long_Desc,FdGrp_Desc,Water_g,Energ_Kcal,Protein_g,Lipid_Tot_g,Carbohydrt_g,Fiber_TD_g,Sugar_Tot_g,Calcium_mg,Iron_mg,Magnesium_mg,Phosphorus_mg,Potassium_mg,Sodium_mg,Zinc_mg,Copper_mg,Manganese_mg
0,1001,"BUTTER,WITH SALT","Butter, salted",Dairy and Egg Products,15.87,717,0.85,81.11,0.06,0.0,0.06,24.0,0.02,2.0,24.0,24.0,643.0,0.09,0.0,0.0
1,1002,"BUTTER,WHIPPED,W/ SALT","Butter, whipped, with salt",Dairy and Egg Products,16.72,718,0.49,78.3,2.87,0.0,0.06,23.0,0.05,1.0,24.0,41.0,583.0,0.05,0.01,0.0
2,1003,"BUTTER OIL,ANHYDROUS","Butter oil, anhydrous",Dairy and Egg Products,0.24,876,0.28,99.48,0.0,0.0,0.0,4.0,0.0,0.0,3.0,5.0,2.0,0.01,0.0,0.0
3,1004,"CHEESE,BLUE","Cheese, blue",Dairy and Egg Products,42.41,353,21.4,28.74,2.34,0.0,0.5,528.0,0.31,23.0,387.0,256.0,1146.0,2.66,0.04,0.01
4,1005,"CHEESE,BRICK","Cheese, brick",Dairy and Egg Products,41.11,371,23.24,29.68,2.79,0.0,0.51,674.0,0.43,24.0,451.0,136.0,560.0,2.6,0.02,0.01


## Outlier detection using Local Outlier Factor (LOF) method
- This method uses KNN

In [3]:
# -----------------------------------------------------------------------------
# Step 1
# Select a few important numerical features for outlier detection
# Make sure to avoid using Response variable (if one already exists)
# -----------------------------------------------------------------------------

num_cols = ['Protein_g', 'Lipid_Tot_g','Carbohydrt_g','Fiber_TD_g','Sugar_Tot_g','Calcium_mg','Iron_mg','Magnesium_mg']

# -----------------------------------------------------------------------------
# Step 2
# At this stage, either drop NAs or impute them with a value
# I have shown filling NAs with 0, as it seems approproate in this example            
# -----------------------------------------------------------------------------

X = nut[num_cols].fillna(0).values

# -----------------------------------------------------------------------------
# Step 3a
# fit the Local Outlier Factor model (based on KNN)
# Notice the contamination parameter to identify a certain proportion of outliers
# -----------------------------------------------------------------------------

lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
# predict the labels for each data point (as Outlier or inlier)
y_pred_lof = lof.fit_predict(X)

# -----------------------------------------------------------------------------
# Step 3b
# fit the Isolation Forest outlier detection (based on decision trees)
# -----------------------------------------------------------------------------
iforest = IsolationForest(n_estimators=100,  contamination=0.05)
# predict the labels for each data point (as Outlier or inlier)
y_pred_if = iforest.fit_predict(X)


# -----------------------------------------------------------------------------
# Step 3c
# fit the robust covariance model (based on Mahalanobis distance)
# -----------------------------------------------------------------------------
rob_cov = EllipticEnvelope(contamination=0.05)
rob_cov.fit(X)

# predict the labels for each data point (as Outlier or inlier)
y_pred_rob = rob_cov.predict(X)

# -----------------------------------------------------------------------------
# Adding the newly created columns to the nutrotion table             
# -----------------------------------------------------------------------------
nut["y_pred_lof"] = y_pred_lof
nut["y_pred_if"] = y_pred_if
nut["y_pred_rob"] = y_pred_rob

# -----------------------------------------------------------------------------
# Converting them to a binary -1, 0. 
# Where -1 denotes outlier
# The purpose is to then add these columns and find out which rows were identified as outliers from multiple methods
# -----------------------------------------------------------------------------
nut["y_pred_lof_2"] = np.where(nut["y_pred_lof"]<0, -1, 0)
nut["y_pred_if_2"] = np.where(nut["y_pred_if"]<0, -1, 0)
nut["y_pred_rob_2"] = np.where(nut["y_pred_rob"]<0, -1, 0)




## Summing the outlier status 

In [4]:
nut.iloc[:,-3:]
pd.crosstab(nut["y_pred_if"], nut["y_pred_rob"] )

nut["all_out"] = nut.loc[:,["y_pred_if_2","y_pred_rob_2","y_pred_lof_2"]].sum(axis = 1)
nut["all_out"].value_counts()

# -----------------------------------------------------------------------------
# List of food items identified as outliers based by at least two methods
# -----------------------------------------------------------------------------
nut[nut["all_out"]<-1]

Unnamed: 0,NDB_No,Shrt_Desc,Long_Desc,FdGrp_Desc,Water_g,Energ_Kcal,Protein_g,Lipid_Tot_g,Carbohydrt_g,Fiber_TD_g,...,Zinc_mg,Copper_mg,Manganese_mg,y_pred_lof,y_pred_if,y_pred_rob,y_pred_lof_2,y_pred_if_2,y_pred_rob_2,all_out
32,1033,"CHEESE,PARMESAN,HARD","Cheese, parmesan, hard",Dairy and Egg Products,29.16,392,35.75,25.83,3.22,0.00,...,2.75,0.03,0.02,1,-1,-1,0,-1,-1,-2
82,1090,"MILK,DRY,WHL,W/ ADDED VITAMIN D","Milk, dry, whole, with added vitamin D",Dairy and Egg Products,2.47,496,26.32,26.71,38.42,0.00,...,3.34,0.08,0.04,1,-1,-1,0,-1,-1,-2
83,1091,"MILK,DRY,NONFAT,REG,WO/ ADDED VIT A & VITAMIN D","Milk, dry, nonfat, regular, without added vita...",Dairy and Egg Products,3.16,362,36.16,0.77,51.98,0.00,...,4.08,0.04,0.02,1,-1,-1,0,-1,-1,-2
84,1092,"MILK,DRY,NONFAT,INST,W/ ADDED VIT A & VITAMIN D","Milk, dry, nonfat, instant, with added vitamin...",Dairy and Egg Products,3.96,358,35.10,0.72,52.19,0.00,...,4.41,0.04,0.02,1,-1,-1,0,-1,-1,-2
86,1094,"MILK,BUTTERMILK,DRIED","Milk, buttermilk, dried",Dairy and Egg Products,2.97,387,34.30,5.78,49.00,0.00,...,4.02,0.11,0.02,1,-1,-1,0,-1,-1,-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8707,43340,"CHEESE,PARMESAN,LO NA","Cheese, parmesan, low sodium",Dairy and Egg Products,22.20,451,41.60,29.99,3.70,0.00,...,3.19,0.04,,1,-1,-1,0,-1,-1,-2
8709,43345,"BEVER,FRUIT-FLAV DRK,PDR,W/ HI VIT C W/ OTHER ADD","Beverages, fruit-flavored drink, powder, with ...",Beverages,1.32,227,0.25,0.16,91.00,2.20,...,0.01,0.01,0.01,1,-1,-1,0,-1,-1,-2
8752,43529,"BABYFOOD,RICE&APPLS,DRY","Babyfood, rice and apples, dry",Baby Foods,3.20,396,6.70,2.40,86.89,3.00,...,1.40,0.37,,1,-1,-1,0,-1,-1,-2
8763,43570,"CEREALS RTE,POST,HONEY BUNCHES OF OATS,HONEY RSTD","Cereals ready-to-eat, POST, HONEY BUNCHES OF O...",Breakfast Cereals,5.00,401,7.12,5.46,81.19,4.20,...,5.03,0.59,1.28,1,-1,-1,0,-1,-1,-2
