In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

## importing dataset

In [3]:
dataset = pd.read_csv("sample_log_dataset.csv")
dataset

Unnamed: 0,Timestamp,Source,SourceClass,Destination,DestinationClass,User,Device,EventType,Description,Severity,MLRiskScore
0,2024-07-20T12:45:31,10.0.0.100,malicious,10.0.0.101,safe,anonymous,DeviceXYZ,credit-limit-change,Credit limit change requested for account 247....,low,0.50
1,2024-07-20T12:45:31,217.151.51.228:54125,malicious,10.0.0.101,safe,admin,ServerABC,money-laundering-suspicion,Potential money laundering activity detected f...,low,0.63
2,2024-07-20T12:45:31,10.0.0.100,malicious,10.10.10.20,safe,admin,ServerABC,auth-lockout,User 'admin' locked out after multiple failed ...,medium,0.57
3,2024-07-20T12:45:31,10.0.0.100,malicious,192.168.2.6,safe,user123,Workstation123,payee-added,New payee added to account 145.32.221.89,medium,0.47
4,2024-07-20T12:45:31,172.16.0.10,malicious,192.38.89.174,malicious,guest,ServerABC,auth-lockout,User 'guest' locked out after multiple failed ...,medium,0.83
...,...,...,...,...,...,...,...,...,...,...,...
495,2024-07-20T12:45:31,10.10.10.10,malicious,164.237.23.35,malicious,guest,ServerABC,payee-added,New payee added to account 75.148.42.172,high,0.73
496,2024-07-20T12:45:31,192.168.10.5,malicious,10.0.0.3,safe,admin,DeviceXYZ,transaction-large,Large transaction detected: $195774 for accoun...,medium,0.53
497,2024-07-20T12:45:31,192.168.2.5,malicious,10.1.1.2,safe,admin,ServerABC,transaction-unusual-location,Transaction from unusual location for account ...,low,0.60
498,2024-07-20T12:45:31,10.10.10.10,malicious,192.168.0.2,safe,guest,DeviceXYZ,identity-theft-alert,Potential identity theft detected for account ...,medium,0.63


## dropping redundant columns

In [4]:
dataset.drop(columns=['Source', 'Destination', 'Timestamp', 'Description'], inplace=True)
# dataset

## label encoding Source and Destination IPs (safe and malicious)

In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
columns = ['SourceClass', 'DestinationClass']

dataset[columns] = dataset[columns].apply(le.fit_transform)
dataset

Unnamed: 0,SourceClass,DestinationClass,User,Device,EventType,Severity,MLRiskScore
0,0,1,anonymous,DeviceXYZ,credit-limit-change,low,0.50
1,0,1,admin,ServerABC,money-laundering-suspicion,low,0.63
2,0,1,admin,ServerABC,auth-lockout,medium,0.57
3,0,1,user123,Workstation123,payee-added,medium,0.47
4,0,0,guest,ServerABC,auth-lockout,medium,0.83
...,...,...,...,...,...,...,...
495,0,0,guest,ServerABC,payee-added,high,0.73
496,0,1,admin,DeviceXYZ,transaction-large,medium,0.53
497,0,1,admin,ServerABC,transaction-unusual-location,low,0.60
498,0,1,guest,DeviceXYZ,identity-theft-alert,medium,0.63


In [6]:
## Severity Column
event_severity_threat = {"informational": 0.1, "warning": 0.7, "error": 0.4, "critical": 0.9}
dataset['Severity'] = dataset['Severity'].replace(event_severity_threat)

## EventType Column
event_type_threat = {
    "auth-failed": 0.8,
    "auth-success": 0.1,
    "auth-lockout": 0.9,
    "network-connected": 0.2,
    "network-disconnected": 0.2,
    "firewall-change": 0.7,
    "dns-queries": 0.3,
    "malware-detection": 0.9,
    "system-shutdown": 0.8,
    "system-restart": 0.7,
    "system-failure": 0.9,
    "application-errors": 0.6,
    "application-usage": 0.2,
    "api-called": 0.4,
    "file-access": 0.5,
    "permission-changes": 0.3,
    "software-update": 0.6
}
dataset['EventType'] = dataset['EventType'].replace(event_type_threat)

## Device Column
devices = {"Workstation123": 1, "DeviceXYZ": 2, "ServerABC": 3}
dataset['Device'] = dataset['Device'].replace(devices)


## User Column
users = {"user123": 1, "guest": 2, "admin": 3, "anonymous":4}
dataset['User'] = dataset['User'].replace(users)

dataset['Access'] = np.random.randint(2, size=500)

dataset

  dataset['Device'] = dataset['Device'].replace(devices)
  dataset['User'] = dataset['User'].replace(users)


Unnamed: 0,SourceClass,DestinationClass,User,Device,EventType,Severity,MLRiskScore,Access
0,0,1,4,2,credit-limit-change,low,0.50,0
1,0,1,3,3,money-laundering-suspicion,low,0.63,1
2,0,1,3,3,0.9,medium,0.57,0
3,0,1,1,1,payee-added,medium,0.47,0
4,0,0,2,3,0.9,medium,0.83,1
...,...,...,...,...,...,...,...,...
495,0,0,2,3,payee-added,high,0.73,0
496,0,1,3,2,transaction-large,medium,0.53,1
497,0,1,3,3,transaction-unusual-location,low,0.60,1
498,0,1,2,2,identity-theft-alert,medium,0.63,0


## splitting into X and Y

In [7]:
X = dataset.iloc[:, :-2]
y = dataset.iloc[:,-2]
y2 = dataset.iloc[:,-1]

y2

0      0
1      1
2      0
3      0
4      1
      ..
495    0
496    1
497    1
498    0
499    1
Name: Access, Length: 500, dtype: int32

## importing catboost regressor

In [8]:
from catboost import CatBoostRegressor
cat_model = CatBoostRegressor(verbose = False)

## importing lightgbm regressor

In [9]:
import lightgbm as lgb 

lgb_model = lgb.LGBMRegressor()

## importing xgboost regressor

In [10]:
from xgboost import XGBRegressor

xgb_model = XGBRegressor()

## stacking these models together

In [11]:
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import StackingRegressor

base_regressors = [
    ('lightgbm', lgb_model),
    ('catboost', cat_model),
    ('xgboost', xgb_model)
]

meta_regressor = MLPRegressor(hidden_layer_sizes=(400, 200, 100, 50), activation='relu', solver='adam', random_state=42)


model = StackingRegressor(estimators=base_regressors, final_estimator=meta_regressor)
model

## implementing kfold

In [12]:
## For Risk Predicion
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)

predictions = np.zeros(len(X))

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    fold_preds = model.predict(X_test)
    fold_r2 = r2_score(y_test, fold_preds)
    print(f"R2 Score for this fold: {fold_r2}")

    predictions[test_index] += fold_preds

final_r2 = r2_score(y, predictions)
print(f"\nOverall R2 Score: {final_r2}")

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: EventType: object, Severity: object

In [None]:
X

Unnamed: 0,SourceClass,DestinationClass,User,Device,EventType,Severity
0,1,1,1,1,0.2,0.1
1,1,1,4,3,0.8,0.9
2,0,1,3,2,0.9,0.1
3,0,1,1,1,0.9,0.4
4,0,1,2,2,0.5,0.4
...,...,...,...,...,...,...
495,0,0,4,2,0.1,0.1
496,1,1,4,2,0.8,0.4
497,1,1,4,1,0.8,0.9
498,1,1,2,2,0.6,0.7


In [None]:
y

0      0.13
1      0.33
2      0.63
3      0.63
4      0.50
       ... 
495    0.63
496    0.33
497    0.33
498    0.27
499    0.17
Name: MLRiskScore, Length: 500, dtype: float64

In [None]:
y2

0      0
1      0
2      1
3      1
4      1
      ..
495    1
496    1
497    1
498    1
499    1
Name: Access, Length: 500, dtype: int64

In [None]:
dataset
new_dataset = dataset.iloc[:,:-1]
new_dataset

Unnamed: 0,SourceClass,DestinationClass,User,Device,EventType,Severity,MLRiskScore
0,1,1,1,1,0.2,0.1,0.13
1,1,1,4,3,0.8,0.9,0.33
2,0,1,3,2,0.9,0.1,0.63
3,0,1,1,1,0.9,0.4,0.63
4,0,1,2,2,0.5,0.4,0.50
...,...,...,...,...,...,...,...
495,0,0,4,2,0.1,0.1,0.63
496,1,1,4,2,0.8,0.4,0.33
497,1,1,4,1,0.8,0.9,0.33
498,1,1,2,2,0.6,0.7,0.27


In [None]:
## Predicting Access Revoked or Not
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from sklearn.metrics import r2_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)
catboost_model = CatBoostClassifier(iterations=1000, depth=5, learning_rate=0.1, loss_function='Logloss', random_seed=42, verbose = False)

from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)

predictions = np.zeros(len(X))

for train_index, test_index in kf.split(X):
    X_train, X_test = new_dataset.iloc[train_index], new_dataset.iloc[test_index]
    y_train, y_test = y2.iloc[train_index], y2.iloc[test_index]

    catboost_model.fit(X_train, y_train)
    fold_preds = catboost_model.predict(X_test)
    fold_r2 = r2_score(y_test, fold_preds)
#     print(f"R2 Score for this fold: {fold_r2}")

    predictions[test_index] += fold_preds

final_r2 = r2_score(y, predictions)
# print(f"\nOverall R2 Score: {final_r2}")