In [1]:
# Import libraries: numerical/data handling, plotting, and the Isolation Forest
# estimator used for the anomaly detection below.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given), so pandas / scikit-learn warnings will not be displayed.
warnings.filterwarnings("ignore")

In [2]:
# Read the employee salary CSV into a DataFrame.
# Point the path at wherever your copy of the dataset is stored.
df = pd.read_csv(filepath_or_buffer='D:/rawdata/Emp_salary.csv')
# Preview the first five rows (the default row count of head()).
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,ip_address,employee_id,salary,hire_date,job_title,department,location,performance_rating
0,1,Devlin,Kerslake,dkerslake0@yellowpages.com,Male,116.198.21.169,1,33077.32,9/30/2021,VP Sales,Marketing,Chicago,1.8
1,2,Zane,McConnel,zmcconnel1@who.int,Male,214.79.62.12,2,35845.3,10/6/2007,Research Nurse,HR,New York,4.2
2,3,Murielle,Houlson,mhoulson2@creativecommons.org,Female,135.221.237.42,3,20669.14,11/7/2012,Environmental Specialist,IT,Houston,2.7
3,4,Chrystal,Alabaster,calabaster3@netlog.com,Female,104.139.176.244,4,33435.96,4/13/2015,Design Engineer,Sales,New York,4.8
4,5,Lisbeth,Jerrome,ljerrome4@alexa.com,Female,105.205.64.66,5,14150.46,4/29/2006,Software Consultant,Sales,Chicago,3.4


In [3]:
# Keep only the columns used in this analysis:
# first_name, last_name, salary and department.
df = df.loc[:, ['first_name', 'last_name', 'salary', 'department']]

In [4]:
# Report the dataset size as (number of records, number of columns).
(len(df.index), len(df.columns))

(15, 4)

In [5]:
# Show the records: positional slice of (at most) the first 15 rows.
df.iloc[:15]

Unnamed: 0,first_name,last_name,salary,department
0,Devlin,Kerslake,33077.32,Marketing
1,Zane,McConnel,35845.3,HR
2,Murielle,Houlson,20669.14,IT
3,Chrystal,Alabaster,33435.96,Sales
4,Lisbeth,Jerrome,14150.46,Sales
5,Martie,Tomasi,38709.56,Sales
6,Gladys,Eberts,39956.56,HR
7,Gipsy,Babonau,20186.0,Sales
8,Antonin,Ramalho,7627.79,IT
9,Logan,Hencke,47995.87,HR


In [6]:
# Isolate the salary column as its own single-column DataFrame; it is the only
# feature fed to the Isolation Forest.
# An explicit copy is taken because new columns are attached to df_salary
# further down, and that must not be treated as writing into a slice of df
# (which would raise pandas' SettingWithCopyWarning).
df_salary = df[['salary']].copy()
df_salary

Unnamed: 0,salary
0,33077.32
1,35845.3
2,20669.14
3,33435.96
4,14150.46
5,38709.56
6,39956.56
7,20186.0
8,7627.79
9,47995.87


In [7]:
# Build the Isolation Forest and fit it on the salary column.
#   n_estimators=1000   -> number of trees in the ensemble
#   max_samples='auto'  -> let scikit-learn choose the per-tree sample size
#   contamination=0.04  -> expected share of anomalies; sets the decision threshold
#   max_features=1.0    -> every tree uses all features (only 'salary' here)
#   random_state=42     -> fixes the random tree construction so the scores and
#                          anomaly flags are reproducible from run to run
model = IsolationForest(
    n_estimators=1000,
    max_samples='auto',
    contamination=0.04,
    max_features=1.0,
    random_state=42,
)
model.fit(df_salary[['salary']])

IsolationForest(contamination=0.04, n_estimators=1000)

In [8]:
# Score every salary with the fitted model and attach the results.
#   scores  -> decision_function output; negative values are flagged as anomalies
#   anomaly -> predict output; 1 for an inlier, -1 for an anomaly
# The model is scored on the same frame it was fitted on (df_salary), and the
# columns are added through assign(), which returns a new DataFrame instead of
# writing into a column subset of df (avoids SettingWithCopyWarning and keeps
# the cell idempotent when re-run).
salary_features = df_salary[['salary']]
df_salary = df_salary.assign(
    scores=model.decision_function(salary_features),
    anomaly=model.predict(salary_features),
)

In [9]:
# Display the salary frame, which at this point also carries the 'scores'
# and 'anomaly' columns added in the scoring step.
df_salary

Unnamed: 0,salary,scores,anomaly
0,33077.32,0.153125,1
1,35845.3,0.192896,1
2,20669.14,0.117658,1
3,33435.96,0.162795,1
4,14150.46,0.078899,1
5,38709.56,0.175155,1
6,39956.56,0.190382,1
7,20186.0,0.121862,1
8,7627.79,-0.048872,-1
9,47995.87,0.0384,1


In [10]:
# Collect the rows the model flagged as anomalous (anomaly flag equal to -1).
anomaly = df_salary[df_salary['anomaly'].eq(-1)]
# Remember the row labels of those records.
anomaly_index = anomaly.index.tolist()
# Show the flagged rows (capped at 40).
anomaly.head(40)

Unnamed: 0,salary,scores,anomaly
8,7627.79,-0.048872,-1


In [11]:
# Attach the score and anomaly flag to the employee details so we can see whose
# salary (and which department's) deviates from the rest.
# df_salary was derived from df row-for-row, so both frames share the same row
# index; joining on that index keeps exactly one output row per employee.
# Merging on the salary value instead is a many-to-many match: any two employees
# with the same salary would each be paired with both score rows, producing
# duplicated records.
df_merged = df.join(df_salary[['scores', 'anomaly']])
df_merged

Unnamed: 0,first_name,last_name,salary,department,scores,anomaly
0,Devlin,Kerslake,33077.32,Marketing,0.153125,1
1,Zane,McConnel,35845.3,HR,0.192896,1
2,Murielle,Houlson,20669.14,IT,0.117658,1
3,Chrystal,Alabaster,33435.96,Sales,0.162795,1
4,Lisbeth,Jerrome,14150.46,Sales,0.078899,1
5,Martie,Tomasi,38709.56,Sales,0.175155,1
6,Gladys,Eberts,39956.56,HR,0.190382,1
7,Gipsy,Babonau,20186.0,Sales,0.121862,1
8,Antonin,Ramalho,7627.79,IT,-0.048872,-1
9,Logan,Hencke,47995.87,HR,0.0384,1


In [12]:
# Test the model on a hand-picked salary
#  1 ==> The amount is within the expected salary range
# -1 ==> The amount deviates from the expected salary range
# The model was fitted on a DataFrame with a 'salary' column, so the sample is
# passed in the same shape and with the same column name; a bare nested list
# makes scikit-learn warn that the input has no valid feature names.
model.predict(pd.DataFrame({'salary': [500]}))

array([-1])