In [85]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report as report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import confusion_matrix as matrix
from sklearn.metrics import average_precision_score
from sklearn.ensemble import IsolationForest
# Seed value used to make runs repeatable.
RANDOM_STATE = 42
# Seed NumPy's global random number generator with that value.
np.random.seed(seed=RANDOM_STATE)

# Dataset 1: http (contamination: 0.0039)

## load train/test data

In [86]:
# Read the pre-split http train/test CSV files.
http_train = pd.read_csv('data/http_train.csv', sep=',')
http_test = pd.read_csv('data/http_test.csv', sep=',')

# Every column except the last is used as a feature; the last column is
# kept separately as the label.
train_data = http_train.drop(columns=http_train.columns[-1])
train_label = http_train.iloc[:, -1]
test_data = http_test.drop(columns=http_test.columns[-1])
test_label = http_test.iloc[:, -1]

# Show (rows, columns) of the training frame as the cell output.
http_train.shape

(453998, 4)

## build a model (Isolation Forest)

In [89]:
# Contamination passed to the model for the http data: the proportion of
# points the forest should flag as outliers.
HTTP_CONTAMINATION = 0.0039

# NOTE(review): `behaviour` was deprecated in scikit-learn 0.22 and removed in
# 0.24. It is kept here so the cell still runs on the scikit-learn version that
# produced the recorded output; drop the argument when upgrading.
IF_model = IsolationForest(
    max_samples="auto",
    random_state=RANDOM_STATE,  # reuse the notebook-wide seed instead of a literal 42
    contamination=HTTP_CONTAMINATION,
    behaviour='new',
)
# IsolationForest is unsupervised: fit() ignores `y`, so only the features are
# passed and the labels are used for evaluation only.
IF_model.fit(train_data)

IsolationForest(behaviour='new', bootstrap=False, contamination=0.0039,
                max_features=1.0, max_samples='auto', n_estimators=100,
                n_jobs=None, random_state=42, verbose=0, warm_start=False)

## make predictions

In [90]:
# predict() returns -1 for outliers and +1 for inliers. Recode in one
# vectorized step to 1 = predicted anomaly, 0 = predicted normal so the
# predictions use the same 0/1 classes as test_label.
raw_pred = IF_model.predict(test_data)
result_model = np.where(raw_pred == -1, 1, 0)

# Per-class precision/recall/F1 and the confusion matrix
# (rows = true class, columns = predicted class).
report_model = report(test_label, result_model, digits=5)
matrix_model = matrix(test_label, result_model)
print(report_model)
print(matrix_model)

              precision    recall  f1-score   support

           0    0.99625   0.99980   0.99802    113058
           1    0.42500   0.03846   0.07054       442

    accuracy                        0.99605    113500
   macro avg    0.71063   0.51913   0.53428    113500
weighted avg    0.99403   0.99605   0.99441    113500

[[113035     23]
 [   425     17]]


# Dataset 2: Cardio (contamination: 0.096)

## load train/test data

In [78]:
# Read the pre-split cardio train/test CSV files.
cardio_train = pd.read_csv('data/cardio_train.csv', sep=',')
cardio_test = pd.read_csv('data/cardio_test.csv', sep=',')

# Every column except the last is used as a feature; the last column is
# kept separately as the label.
train_data = cardio_train.drop(columns=cardio_train.columns[-1])
train_label = cardio_train.iloc[:, -1]
test_data = cardio_test.drop(columns=cardio_test.columns[-1])
test_label = cardio_test.iloc[:, -1]

# Show (rows, columns) of the training frame as the cell output.
cardio_train.shape

(1464, 22)

## build a model

In [79]:
# Contamination passed to the model for the cardio data: the proportion of
# points the forest should flag as outliers.
CARDIO_CONTAMINATION = 0.096

# NOTE(review): `behaviour` was deprecated in scikit-learn 0.22 and removed in
# 0.24. It is kept here so the cell still runs on the scikit-learn version that
# produced the recorded output; drop the argument when upgrading.
IF_model = IsolationForest(
    max_samples="auto",
    random_state=RANDOM_STATE,  # reuse the notebook-wide seed instead of a literal 42
    contamination=CARDIO_CONTAMINATION,
    behaviour='new',
)
# IsolationForest is unsupervised: fit() ignores `y`, so only the features are
# passed and the labels are used for evaluation only.
IF_model.fit(train_data)

IsolationForest(behaviour='new', bootstrap=False, contamination=0.096,
                max_features=1.0, max_samples='auto', n_estimators=100,
                n_jobs=None, random_state=42, verbose=0, warm_start=False)

## make predictions

In [80]:
# predict() returns -1 for outliers and +1 for inliers. Recode in one
# vectorized step to 1 = predicted anomaly, 0 = predicted normal so the
# predictions use the same 0/1 classes as test_label.
raw_pred = IF_model.predict(test_data)
result_model = np.where(raw_pred == -1, 1, 0)

# Per-class precision/recall/F1 and the confusion matrix
# (rows = true class, columns = predicted class).
report_model = report(test_label, result_model, digits=5)
matrix_model = matrix(test_label, result_model)
print(report_model)
print(matrix_model)

              precision    recall  f1-score   support

           0    0.94769   0.92771   0.93760       332
           1    0.42857   0.51429   0.46753        35

    accuracy                        0.88828       367
   macro avg    0.68813   0.72100   0.70256       367
weighted avg    0.89818   0.88828   0.89277       367

[[308  24]
 [ 17  18]]


# Dataset 3: creditcard (contamination: 0.00173)

## load data

In [81]:
# Read the pre-split credit card train/test CSV files.
credit_train = pd.read_csv('data/credit_train.csv', sep=',')
credit_test = pd.read_csv('data/credit_test.csv', sep=',')

# Every column except the last is used as a feature; the last column is
# kept separately as the label.
train_data = credit_train.drop(columns=credit_train.columns[-1])
train_label = credit_train.iloc[:, -1]
test_data = credit_test.drop(columns=credit_test.columns[-1])
test_label = credit_test.iloc[:, -1]

# Show (rows, columns) of the training frame as the cell output.
credit_train.shape

(227845, 31)

## build a model

In [82]:
# Contamination passed to the model for the credit card data: the proportion
# of points the forest should flag as outliers.
CREDIT_CONTAMINATION = 0.00173

# NOTE(review): `behaviour` was deprecated in scikit-learn 0.22 and removed in
# 0.24. It is kept here so the cell still runs on the scikit-learn version that
# produced the recorded output; drop the argument when upgrading.
IF_model = IsolationForest(
    max_samples="auto",
    random_state=RANDOM_STATE,  # reuse the notebook-wide seed instead of a literal 42
    contamination=CREDIT_CONTAMINATION,
    behaviour='new',
)
# IsolationForest is unsupervised: fit() ignores `y`, so only the features are
# passed and the labels are used for evaluation only.
IF_model.fit(train_data)

IsolationForest(behaviour='new', bootstrap=False, contamination=0.00173,
                max_features=1.0, max_samples='auto', n_estimators=100,
                n_jobs=None, random_state=42, verbose=0, warm_start=False)

## make predictions

In [83]:
# predict() returns -1 for outliers and +1 for inliers. Recode in one
# vectorized step to 1 = predicted anomaly, 0 = predicted normal so the
# predictions use the same 0/1 classes as test_label.
raw_pred = IF_model.predict(test_data)
result_model = np.where(raw_pred == -1, 1, 0)

# Per-class precision/recall/F1 and the confusion matrix
# (rows = true class, columns = predicted class).
report_model = report(test_label, result_model, digits=5)
matrix_model = matrix(test_label, result_model)
print(report_model)
print(matrix_model)

              precision    recall  f1-score   support

           0    0.99886   0.99875   0.99880     56864
           1    0.31731   0.33673   0.32673        98

    accuracy                        0.99761     56962
   macro avg    0.65808   0.66774   0.66277     56962
weighted avg    0.99768   0.99761   0.99765     56962

[[56793    71]
 [   65    33]]
