## Import Libraries

`step 0`: Import semua library yang dibutuhkan

In [1]:
# Import semua library yang dibutuhkan
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler, LabelEncoder
import torch
from torch_geometric.data import HeteroData
import pickle
import json
import glob

## Merge LDAP folder

`step 1`: Gabungkan dulu folder LDAP menjadi users

In [2]:
# # Load semua file .csv dalam folder LDAP
# ldap_files = glob.glob('../data/r6.2/LDAP/*.csv')
# print(f"Found {len(ldap_files)} LDAP files:")

# # Membaca dan menggabungkan file .csv
# all_data = []
# for file in ldap_files:
#     print(f'Loading: {file}')
#     df = pd.read_csv(file)
#     print(f"  - {len(df)} rows, {df['user_id'].nunique()} unique users")
#     all_data.append(df)

# # Gabung semua data
# combined_df = pd.concat(all_data, ignore_index=True)
# print(f"\nBefore cleanup:")
# print(f"Total rows: {len(combined_df)}")
# print(f"Unique users: {combined_df['user_id'].nunique()}")

# # Remove duplicates berdasarkan user_id
# users_df = combined_df.drop_duplicates(subset=['user_id'], keep='first')
# print(f"\nAfter removing duplicates:")
# print(f"Total rows: {len(users_df)}")
# print(f"Unique users: {users_df['user_id'].nunique()}")

# # Save hasil
# users_df.to_csv('../data/r6.2/users.csv', index=False)
# print(f"\nSaved to: users.csv")
# print(f"Shape: {users_df.shape}")
# users_df.head()

## Load Data dan Filter Data

`step 2`: Load dataset dari file CSV </br>
- Pada tahapan ini, ada skenario unik dimana file http itu tidak dapat di load, alih-alih menggunakan pandas peneliti akan menggunakan dask untuk membaca file dengan data besar.

In [3]:
_raw_path = '../data/r6.2/'
_label_path = '../data/labels/'

# Load raw dataset
logon_df = pd.read_csv(f'{_raw_path}logon.csv')
users_df = pd.read_csv(f'{_raw_path}filtered_users.csv')
file_df = pd.read_csv(f'{_raw_path}file.csv')
device_df = pd.read_csv(f'{_raw_path}device.csv')
http_df = pd.read_csv(f'{_raw_path}filtered_http.csv')

# Load labels dataset
labels_df = pd.read_csv(f'{_label_path}user_labels.csv')

- Menggabungkan `users_df` dengan `labels_df` untuk mendapatkan label

In [4]:
# print(users_df.shape)
# print(labels_df.shape)

In [5]:
# users_df = users_df.merge(labels_df[['user_id','label']], on='user_id', how='left')
# users_df.head()

- Ambil user menjadi 1000 ID, dengan 998 normal, dan 2 anomali

In [6]:
insider_users = users_df[users_df['label'] == 1]
# normal_users = users_df[users_df['label'] == 0].sample(n=998, random_state=42)
normal_users = users_df[users_df['label'] == 0]
filtered_users = pd.concat([insider_users, normal_users]).reset_index(drop=True)
filtered_users.head()

Unnamed: 0,employee_name,user_id,email,role,projects,business_unit,functional_unit,department,team,supervisor,label
0,Murphy Benjamin Gentry,MBG3183,Murphy.Benjamin.Gentry@dtaa.com,ElectricalEngineer,,1,3 - ResearchAndEngineering_Government_Domestic,4 - Engineering,24 - SystemsEngineering,Malcolm Elton Battle,1
1,Chandler Martin Page,CMP2946,Chandler.Martin.Page@dtaa.com,Salesman,,1,9 - SalesAndMarketing_Government,2 - Sales,4 - RegionalSales,Tanisha Chiquita Mullins,1
2,Kirestin Kylan Carter,KKC2119,Kirestin.Kylan.Carter@dtaa.com,SystemsEngineer,,1,4 - ResearchAndEngineering_Government_Foreign,4 - Engineering,15 - SystemsEngineering,Leo Travis Rojas,0
3,Barry Xavier Knowles,BXK2091,Barry.Xavier.Knowles@dtaa.com,ElectricalEngineer,,1,4 - ResearchAndEngineering_Government_Foreign,4 - Engineering,13 - ElectricalEngineering,Barrett Ulysses Shepherd,0
4,Tatum Anastasia Fletcher,TAF0640,Tatum.Anastasia.Fletcher@dtaa.com,WebDeveloper,Project 199,1,3 - ResearchAndEngineering_Government_Domestic,3 - SoftwareManagement,4 - WebSoftware,Winifred Kelsie Garza,0


- Ubah nama fitur `user` menjadi `user_id` agar konsisten

In [7]:
# Logon dataframe
logon_df = logon_df.drop(columns={'id'}).rename(columns={'user': 'user_id'})
logon_df.head()

Unnamed: 0,date,user_id,pc,activity
0,01/02/2010 02:19:18,DNS1758,PC-0414,Logon
1,01/02/2010 02:31:12,DNS1758,PC-0414,Logoff
2,01/02/2010 02:34:02,DNS1758,PC-5313,Logon
3,01/02/2010 02:53:30,DNS1758,PC-5313,Logoff
4,01/02/2010 04:07:31,DNS1758,PC-0012,Logon


In [8]:
# File dataframe
file_df = file_df.drop(columns={'id'}).rename(columns={'user': 'user_id'})
file_df.head()

Unnamed: 0,date,user_id,pc,filename,activity,to_removable_media,from_removable_media,content
0,01/02/2010 07:19:41,SDH2394,PC-5849,R:\60WBQE7S.doc,File Open,False,True,"D0-CF-11-E0-A1-B1-1A-E1 Ernesztin's brother, L..."
1,01/02/2010 07:21:30,SDH2394,PC-5849,R:\0VGILDW8.pdf,File Write,True,False,25-50-44-46-2D ---- Bengali As do many other T...
2,01/02/2010 07:22:11,SDH2394,PC-5849,R:\60WBQE7S.doc,File Copy,False,True,"D0-CF-11-E0-A1-B1-1A-E1 Ernesztin's brother, L..."
3,01/02/2010 07:24:06,SDH2394,PC-5849,R:\22B5gX4\H8Y96RRE.doc,File Write,True,False,D0-CF-11-E0-A1-B1-1A-E1 After the death of his...
4,01/02/2010 07:24:45,SDH2394,PC-5849,R:\SDH2394\7XRCV2N5.pdf,File Copy,True,False,25-50-44-46-2D Although he restored some of th...


In [9]:
# Device dataframe
device_df = device_df.drop(columns={'id'}).rename(columns={'user': 'user_id'})
device_df.head()

Unnamed: 0,date,user_id,pc,file_tree,activity
0,01/02/2010 07:17:18,SDH2394,PC-5849,R:\;R:\22B5gX4;R:\SDH2394,Connect
1,01/02/2010 07:22:42,JKS2444,PC-6961,R:\;R:\JKS2444,Connect
2,01/02/2010 07:31:42,CBA1023,PC-1570,R:\;R:\42gY283;R:\48rr4y2;R:\59ntt61;R:\76xCQG...,Connect
3,01/02/2010 07:33:28,GNT0221,PC-6427,R:\;R:\GNT0221,Connect
4,01/02/2010 07:33:55,JKS2444,PC-6961,,Disconnect


In [10]:
# HTTP dataframe
# http_df = http_df.drop(columns={'id}).rename(columns={'user': 'user_id'})
http_df.head()

Unnamed: 0,date,user_id,url,activity
0,01/02/2010 06:38:15,LSN1672,foodnetwork.com,Visit
1,01/02/2010 06:39:44,LSN1672,wordpress.com,Visit
2,01/02/2010 06:40:11,LSN1672,examiner.com,Visit
3,01/02/2010 06:47:43,LSN1672,apple.com,Download
4,01/02/2010 06:48:22,LSN1672,latimes.com,Visit


- Filter semua dataset berdasarkan user yang dipilih

In [11]:
user_ids = set(filtered_users['user_id'])
logon_df = logon_df[logon_df['user_id'].isin(user_ids)]
file_df = file_df[file_df['user_id'].isin(user_ids)]
device_df = device_df[device_df['user_id'].isin(user_ids)]
http_df = http_df[http_df['user_id'].isin(user_ids)]

In [12]:
print(f"Filtered to {len(filtered_users)} users")
print(f"Insider users: {len(insider_users)}, Normal users: {len(normal_users)}")
print(f"Label distribution: {np.bincount(filtered_users['label'])}")

Filtered to 1000 users
Insider users: 2, Normal users: 998
Label distribution: [998   2]


## Nodes & Edges Structure

### Ekstrak User-features

`step 3`: Mengekstrak fitur user yang dibutuhkan. </br>
Struktur node user </br>
Sumber Data: users.csv + agregasi dari semua log files </br>
Fitur Node:

- role_encoded: Encoding jabatan berdasarkan users.csv
- department_encoded: Encoding departemen berdasarkan users.csv
- total_logon_events: Agregasi dari logon.csv
- total_file_events: Agregasi dari file.csv
- total_device_events: Agregasi dari device.csv
- total_http_events: Agregasi dari http.csv

In [13]:
# Hitung statistik per user
user_logon_stats = logon_df['user_id'].value_counts().to_frame('total_logon_events').reset_index()
user_file_stats = file_df['user_id'].value_counts().to_frame('total_file_events').reset_index()
user_device_stats = device_df['user_id'].value_counts().to_frame('total_device_events').reset_index()
user_http_stats = http_df['user_id'].value_counts().to_frame('total_http_events').reset_index()

# Gabungkan semua statistik User
user_features_df = filtered_users.merge(user_logon_stats, on='user_id', how='left')
user_features_df = user_features_df.merge(user_file_stats, on='user_id', how='left')
user_features_df = user_features_df.merge(user_device_stats, on='user_id', how='left')
user_features_df = user_features_df.merge(user_http_stats, on='user_id', how='left').fillna(0)

user_features = {
    'user_id': user_features_df['user_id'].values,
    'role': user_features_df['role'].values,
    'department': user_features_df['department'].values,
    'labels': user_features_df['label'].values,
    'total_logon_events': user_features_df['total_logon_events'].values,
    'total_file_events': user_features_df['total_file_events'].values,
    'total_device_events': user_features_df['total_device_events'].values,
    'total_http_events': user_features_df['total_http_events'].values
}

In [14]:
# Role encoding
role_encoder = LabelEncoder()
user_features['role_encoded'] = role_encoder.fit_transform(user_features['role'].astype(str))

# Department encoding  
dept_encoder = LabelEncoder()
user_features['department_encoded'] = dept_encoder.fit_transform(user_features['department'].astype(str))

In [15]:
print(f"User features extracted for {len(user_features['user_id'])} users")
print(f"Features: {list(user_features.keys())}")

User features extracted for 1000 users
Features: ['user_id', 'role', 'department', 'labels', 'total_logon_events', 'total_file_events', 'total_device_events', 'total_http_events', 'role_encoded', 'department_encoded']


### Ekstrak PC-features

`step 4`: Ekstrak fitur untuk node PC berdasarkan agregasi aktivitas dari logon, file, dan device logs. </br>
Sumber Data: Agregasi dari logon.csv, file.csv, device.csv </br>
Fitur Node:
- unique_users_count: Jumlah user unik yang menggunakan PC ini
- avg_daily_logons: Rata-rata logon per hari ke PC ini
- total_file_operations: Total operasi file di PC ini
- total_device_connections: Total koneksi device di PC ini

In [16]:
# Ekstrak fitur untuk node PC
pc_logon_stats = logon_df.groupby('pc').agg({'user_id': 'nunique', 'date': 'count'}).reset_index()
pc_logon_stats.columns = ['pc', 'unique_users_count', 'total_logons']

pc_file_stats = file_df.groupby('pc').agg({'user_id': 'count'}).reset_index()
pc_file_stats.columns = ['pc', 'total_file_operations']

pc_device_stats = device_df.groupby('pc').agg({'user_id': 'count'}).reset_index()
pc_device_stats.columns = ['pc', 'total_device_connections']

In [17]:
# Gabungkan semua statistik PC
pc_features_df = pc_logon_stats.merge(pc_file_stats, on='pc', how='left')
pc_features_df = pc_features_df.merge(pc_device_stats, on='pc', how='left').fillna(0)
pc_features_df['avg_daily_logons'] = pc_features_df['total_logons'] / 365

pc_features = {
    'pc': pc_features_df['pc'].values,
    'unique_users_count': pc_features_df['unique_users_count'].values,
    'avg_daily_logons': pc_features_df['avg_daily_logons'].values,
    'total_file_operations': pc_features_df['total_file_operations'].values,
    'total_device_connections': pc_features_df['total_device_connections'].values
}

In [18]:
print(f"PC features extracted for {len(pc_features['pc'])} PCs")
print(f"Features: {list(pc_features.keys())}")

PC features extracted for 4298 PCs
Features: ['pc', 'unique_users_count', 'avg_daily_logons', 'total_file_operations', 'total_device_connections']


### Ekstrak URL-features

`step 5`: Kategorisasi domain URL berdasarkan jenis website dan ekstrak fitur untuk node URL berdasarkan pola akses. </br>
Sumber Data: http.csv </br>
Fitur Node: </br>
- domain_category: Kategori berdasarkan domain (work_related, cloud_storage, job_search, social_media, other)
- total_visits: Total kunjungan ke URL ini
- unique_visitors: Jumlah user unik yang mengakses

In [19]:
# Kategorisasi domain URL berdasarkan jenis
def categorize_url_domain(url):
    if pd.isna(url) or url is None:
        return 'unknown'
    url_lower = str(url).lower()
    if any(x in url_lower for x in ['dropbox', 'inbox', 'redbox', 'soundcloud']):
        return 'cloud_storage'
    elif any(x in url_lower for x in ['careerbuilder', 'foodnetwork', 'howstuffworks', 'indeed', 'job-hunt.org', 'jobhuntersbible', 'linkedin', 'monster', 'networkedblogs', 'networksolutions']):
        return 'job_search'
    elif any(x in url_lower for x in ['facebook', 'livingsocial', 'reddit', 'twitter']):
        return 'social_media'
    elif any(x in url_lower for x in ['.gov', '.edu', 'officedepot']):
        return 'work_related'
    else:
        return 'other'

In [20]:
# Ekstrak fitur untuk node URL
url_stats = http_df.groupby('url').agg({'user_id': ['count', 'nunique']}).reset_index()
url_stats.columns = ['url', 'total_visits', 'unique_visitors']
url_stats['domain_category'] = url_stats['url'].apply(categorize_url_domain)

url_features = {
    'url': url_stats['url'].values,
    'domain_category': url_stats['domain_category'].values,
    'total_visits': url_stats['total_visits'].values,
    'unique_visitors': url_stats['unique_visitors'].values
}

In [21]:
print(f"URL features extracted for {len(url_features['url'])} URLs")
print(f"URL categories: {np.unique(url_features['domain_category'])}")
print(f"Features: {list(url_features.keys())}")

URL features extracted for 476 URLs
URL categories: ['cloud_storage' 'job_search' 'other' 'social_media' 'work_related']
Features: ['url', 'domain_category', 'total_visits', 'unique_visitors']


### Ekstrak User-PC Edge Features

`step 6`: Ekstrak fitur untuk edge User→PC (user mengakses pc) berdasarkan pola logon, file operations, dan device usage. </br>
Sumber Data: logon.csv, file.csv, device.csv </br>
Fitur Edge:
- logon_count: Jumlah event Logon
- logoff_count: Jumlah event Logoff
- file_open_count: File operations dengan activity="Open"
- file_write_count: File operations dengan activity="Write"
- file_copy_count: File operations dengan activity="Copy"
- file_delete_count: File operations dengan activity="Delete"
- device_connect_count: Device events dengan activity="Connect"
- device_disconnect_count: Device events dengan activity="Disconnect"
- after_hours_logon: Login di luar jam kerja (sebelum 07:00 atau setelah 17:00)
- weekend_logon: Login di akhir pekan (Sabtu-Minggu)

In [22]:
# Ekstrak fitur untuk edge User→PC
logon_df['datetime'] = pd.to_datetime(logon_df['date'])
logon_df['hour'] = logon_df['datetime'].dt.hour
logon_df['dayofweek'] = logon_df['datetime'].dt.dayofweek

In [23]:
# Fitur logon (login/logout, jam kerja, weekend)
logon_features = logon_df.groupby(['user_id', 'pc']).agg({
    'activity': lambda x: (x == 'Logon').sum(),
    'hour': lambda x: sum((x < 7) | (x > 17)),
    'dayofweek': lambda x: sum(x >= 5)
}).reset_index()
logon_features.columns = ['user_id', 'pc', 'logon_count', 'after_hours_logon', 'weekend_logon']

# Hitung logoff count
logoff_count = logon_df[logon_df['activity'] == 'Logoff'].groupby(['user_id', 'pc']).size().reset_index()
logoff_count.columns = ['user_id', 'pc', 'logoff_count']

In [24]:
# Fitur file operations (open, write, copy, delete)
file_activities = ['File Open', 'File Write', 'File Copy', 'File Delete']
result = []
for (user_id, pc), group in file_df.groupby(['user_id', 'pc']):
    counts = [sum(group['activity'] == act) for act in file_activities]
    result.append([user_id, pc] + counts)

file_features = pd.DataFrame(result, columns=['user_id', 'pc', 'file_open_count', 'file_write_count', 'file_copy_count', 'file_delete_count'])

In [25]:
# Fitur device (connect/disconnect USB)
device_features = device_df.groupby(['user_id', 'pc'])['activity'].apply(
    lambda x: pd.Series([sum(x == 'Connect'), sum(x == 'Disconnect')])
).reset_index()
device_features.columns = ['user_id', 'pc', 'device_connect_count', 'device_disconnect_count']

In [26]:
# Gabungkan semua fitur edge User-PC
user_pc_edges = logon_features.merge(logoff_count, on=['user_id', 'pc'], how='left')
user_pc_edges = user_pc_edges.merge(file_features, on=['user_id', 'pc'], how='left')
user_pc_edges = user_pc_edges.merge(device_features, on=['user_id', 'pc'], how='left')

In [27]:
print(f"User-PC edges: {len(user_pc_edges)}")
print(f"Features: {list(user_pc_edges.keys())}")

User-PC edges: 44991
Features: ['user_id', 'pc', 'logon_count', 'after_hours_logon', 'weekend_logon', 'logoff_count', 'file_open_count', 'file_write_count', 'file_copy_count', 'file_delete_count', 'device_connect_count', 'device_disconnect_count']


### Ekstrak User-URL Edge Features

`step 7`: Ekstrak fitur untuk edge User→URL (user mengunjungi situs) berdasarkan pola browsing dan akses ke kategori website tertentu. </br>
Sumber Data: http.csv </br>
Fitur Edge:
- visit_frequency: Jumlah kunjungan ke URL
- cloud_service_visits: Kunjungan ke layanan cloud (filter berdasarkan domain)
- job_site_visits: Kunjungan ke situs lowongan kerja (filter berdasarkan domain)
- unique_visit_days: Jumlah hari berbeda mengakses URL
- after_hours_browsing: Web browsing di luar jam kerja

In [28]:
# Ekstrak fitur untuk edge User→URL
http_df['datetime'] = pd.to_datetime(http_df['date'])
http_df['hour'] = http_df['datetime'].dt.hour
http_df['date_only'] = http_df['datetime'].dt.date

In [None]:
# Agregasi aktivitas browsing per user-url
user_url_edges = http_df.groupby(['user_id', 'url']).agg({
    'date_only': ['count', 'nunique'],  # count total visits, nunique untuk unique days
    'hour': lambda x: sum((x < 7) | (x > 17))
}).reset_index()

# Flatten multi-level columns
user_url_edges.columns = ['user_id', 'url', 'visit_frequency', 'unique_visit_days', 'after_hours_browsing']

In [None]:
# Buat boolean indicator untuk cloud storage dan job search sites
user_url_edges['is_cloud_storage'] = user_url_edges['url'].apply(
    lambda url: 1 if categorize_url_domain(url) == 'cloud_storage' else 0)
user_url_edges['is_job_search'] = user_url_edges['url'].apply(
    lambda url: 1 if categorize_url_domain(url) == 'job_search' else 0)

# Hitung total kunjungan per user ke cloud storage dan job search sites
cloud_visits_per_user = http_df[http_df['url'].apply(categorize_url_domain) == 'cloud_storage'].groupby('user_id').size()
job_visits_per_user = http_df[http_df['url'].apply(categorize_url_domain) == 'job_search'].groupby('user_id').size()

user_url_edges['cloud_service_visits'] = user_url_edges['user_id'].map(cloud_visits_per_user).fillna(0) * user_url_edges['is_cloud_storage']
user_url_edges['job_site_visits'] = user_url_edges['user_id'].map(job_visits_per_user).fillna(0) * user_url_edges['is_job_search']

# Drop kolom helper
user_url_edges = user_url_edges.drop(['is_cloud_storage', 'is_job_search'], axis=1)

In [None]:
print(f"User-URL edges: {len(user_url_edges)}")
print(f"Features: {list(user_url_edges.keys())}")

User-URL edges: 171361
Features: ['user_id', 'url', 'visit_frequency', 'unique_visit_days', 'after_hours_browsing', 'cloud_service_visits', 'job_site_visits']


## Train-Validation Split

`step 8`: Bagi data menjadi training dan validation set dengan stratified sampling untuk menjaga proporsi kelas insider.

In [None]:
# Bagi data menjadi train dan validation set
labels = user_features['labels']
user_indices = np.arange(len(labels))

train_idx, val_idx = train_test_split(user_indices, test_size=0.2, stratify=labels, random_state=42)

train_mask = np.zeros(len(labels), dtype=bool)
val_mask = np.zeros(len(labels), dtype=bool)
train_mask[train_idx] = True
val_mask[val_idx] = True

In [None]:
print(f"Split sizes - Train: {train_mask.sum()}, Val: {val_mask.sum()}")
print(f"Train labels: {labels[train_mask].sum()}/{train_mask.sum()}")
print(f"Val labels: {labels[val_mask].sum()}/{val_mask.sum()}")

Split sizes - Train: 800, Val: 200
Train labels: 2/800
Val labels: 0/200


## Oversampling Technique

`step 9`: Random Oversampling dilakukan untuk mengatasi class imbalance antara insider dan normal users.

In [None]:
# Oversampling untuk mengatasi class imbalance
train_indices = np.where(train_mask)[0]
train_labels = user_features['labels'][train_indices]

ros = RandomOverSampler(random_state=42)
train_indices_resampled, train_labels_resampled = ros.fit_resample(
    train_indices.reshape(-1, 1), train_labels)
train_indices_resampled = train_indices_resampled.flatten()

print(f"Before oversampling: {np.bincount(train_labels)}")
print(f"After oversampling: {np.bincount(train_labels_resampled)}")

Before oversampling: [798   2]
After oversampling: [798 798]


In [None]:
# Update train mask setelah oversampling
balanced_train_mask = np.zeros(len(user_features['labels']), dtype=bool)
unique_indices = np.unique(train_indices_resampled)
balanced_train_mask[unique_indices] = True

oversampling_info = {
    'resampled_indices': train_indices_resampled,
    'resampled_labels': train_labels_resampled,
    'original_train_mask': train_mask.copy()
}

## Normalize Features

`step 10`: Encode categorical features dan normalize numerical features menggunakan StandardScaler yang di-fit pada training data.

In [None]:
# Bersihkan data terlebih dahulu - convert semua ke string dan handle missing values
user_features['role'] = np.where(pd.isna(user_features['role']), 'Unknown', user_features['role'].astype(str))
user_features['department'] = np.where(pd.isna(user_features['department']), 'Unknown', user_features['department'].astype(str))
url_features['domain_category'] = np.where(pd.isna(url_features['domain_category']), 'Unknown', url_features['domain_category'].astype(str))

# Encode kategorikal features - fit pada seluruh data, bukan hanya training
role_encoder = LabelEncoder()
dept_encoder = LabelEncoder()
url_cat_encoder = LabelEncoder()

role_encoder.fit(user_features['role'])
dept_encoder.fit(user_features['department'])
url_cat_encoder.fit(url_features['domain_category'])

user_features['role_encoded'] = role_encoder.transform(user_features['role'])
user_features['department_encoded'] = dept_encoder.transform(user_features['department'])
url_features['domain_category_encoded'] = url_cat_encoder.transform(url_features['domain_category'])

In [None]:
# Normalisasi fitur numerik User
numerical_user_cols = ['total_logon_events',
                      'total_file_events', 'total_device_events', 'total_http_events']

user_scaler = StandardScaler()
train_user_data = np.column_stack([user_features[col][train_indices] for col in numerical_user_cols])
user_scaler.fit(train_user_data)

all_user_data = np.column_stack([user_features[col] for col in numerical_user_cols])
user_features_scaled = user_scaler.transform(all_user_data)
user_features_final = np.column_stack([user_features_scaled, user_features['role_encoded'], user_features['department_encoded']])

In [None]:
# Normalisasi fitur PC
pc_scaler = StandardScaler()
pc_data = np.column_stack([pc_features[col] for col in ['unique_users_count', 'avg_daily_logons', 'total_file_operations', 'total_device_connections']])
pc_features_scaled = pc_scaler.fit_transform(pc_data)

In [None]:
# Normalisasi fitur URL
url_scaler = StandardScaler()
url_data = np.column_stack([url_features['total_visits'], url_features['unique_visitors']])
url_features_scaled = url_scaler.fit_transform(url_data)
url_features_final = np.column_stack([url_features_scaled, url_features['domain_category_encoded']])

In [None]:
# Normalisasi fitur edges
user_pc_cols = ['logon_count', 'logoff_count', 'file_open_count', 'file_write_count',
                'file_copy_count', 'file_delete_count', 'device_connect_count',
                'device_disconnect_count', 'after_hours_logon', 'weekend_logon']
user_pc_scaler = StandardScaler()
user_pc_data = np.column_stack([user_pc_edges[col] for col in user_pc_cols])
user_pc_edges_scaled = user_pc_scaler.fit_transform(user_pc_data)

user_url_cols = ['visit_frequency', 'cloud_service_visits', 'job_site_visits', 'unique_visit_days', 'after_hours_browsing']
user_url_scaler = StandardScaler()
user_url_data = np.column_stack([user_url_edges[col] for col in user_url_cols])
user_url_edges_scaled = user_url_scaler.fit_transform(user_url_data)

print("Feature normalization completed")

Feature normalization completed


## Build Edge Indices

`step 11`: Buat mapping dari ID ke index dan konversi edge list menjadi tensor format untuk PyTorch Geometric.

In [None]:
# Buat mapping dari ID ke index untuk graph
user_id_to_idx = {uid: idx for idx, uid in enumerate(user_features['user_id'])}
pc_to_idx = {pc: idx for idx, pc in enumerate(pc_features['pc'])}
url_to_idx = {url: idx for idx, url in enumerate(url_features['url'])}

In [None]:
# Konversi edge list ke tensor format
user_pc_edge_index = []
for _, row in user_pc_edges.iterrows():
    user_idx = user_id_to_idx[row['user_id']]
    pc_idx = pc_to_idx[row['pc']]
    user_pc_edge_index.append([user_idx, pc_idx])
user_pc_edge_index = np.array(user_pc_edge_index)

user_url_edge_index = []
for _, row in user_url_edges.iterrows():
    user_idx = user_id_to_idx[row['user_id']]
    url_idx = url_to_idx[row['url']]
    user_url_edge_index.append([user_idx, url_idx])
user_url_edge_index = np.array(user_url_edge_index)

In [None]:
print("Edge indices built")
print(f"User-PC edges: {len(user_pc_edge_index)}")
print(f"User-URL edges: {len(user_url_edge_index)}")

Edge indices built
User-PC edges: 44991
User-URL edges: 171361


## Build Heterogeneous Graph

`step 12`: Konstruksi heterogeneous graph menggunakan PyTorch Geometric dengan semua node, edge, dan attributes.

In [None]:
# Buat heterogeneous graph menggunakan PyTorch Geometric
data = HeteroData()

# Set node features
data['user'].x = torch.tensor(user_features_final, dtype=torch.float)
data['pc'].x = torch.tensor(pc_features_scaled, dtype=torch.float)
data['url'].x = torch.tensor(url_features_final, dtype=torch.float)

# Set labels dan masks
data['user'].y = torch.tensor(user_features['labels'], dtype=torch.long)
data['user'].train_mask = torch.tensor(balanced_train_mask, dtype=torch.bool)
data['user'].val_mask = torch.tensor(val_mask, dtype=torch.bool)

In [None]:
# Set edges dan edge attributes
data['user', 'uses', 'pc'].edge_index = torch.tensor(user_pc_edge_index.T, dtype=torch.long)
data['user', 'uses', 'pc'].edge_attr = torch.tensor(user_pc_edges_scaled, dtype=torch.float)

data['user', 'visits', 'url'].edge_index = torch.tensor(user_url_edge_index.T, dtype=torch.long)
data['user', 'visits', 'url'].edge_attr = torch.tensor(user_url_edges_scaled, dtype=torch.float)

data.oversampling_info = oversampling_info

In [None]:
print(f"Graph Statistics:")
print(f"- Users: {data['user'].x.shape[0]} ({data['user'].x.shape[1]} features)")
print(f"- PCs: {data['pc'].x.shape[0]} ({data['pc'].x.shape[1]} features)")
print(f"- URLs: {data['url'].x.shape[0]} ({data['url'].x.shape[1]} features)")
print(f"- User-PC edges: {data['user', 'uses', 'pc'].edge_index.shape[1]}")
print(f"- User-URL edges: {data['user', 'visits', 'url'].edge_index.shape[1]}")

Graph Statistics:
- Users: 1000 (6 features)
- PCs: 4298 (4 features)
- URLs: 477 (3 features)
- User-PC edges: 44991
- User-URL edges: 171361


## Save Data

`step 13`: Simpan processed graph, preprocessing objects, dan metadata untuk digunakan dalam training phase.

In [None]:
# Simpan graph dan preprocessing objects
torch.save(data, '../data/data_graph.pt')

scalers = {
    'user_scaler': user_scaler,
    'pc_scaler': pc_scaler,
    'url_scaler': url_scaler,
    'user_pc_scaler': user_pc_scaler,
    'user_url_scaler': user_url_scaler
}

encoders = {
    'role_encoder': role_encoder,
    'dept_encoder': dept_encoder,
    'url_cat_encoder': url_cat_encoder
}

preprocessing_objects = {**scalers, **encoders}
with open('../data/data_objects.pkl', 'wb') as f:
    pickle.dump(preprocessing_objects, f)

metadata = {
    'scenario': 'insider_threat_detection',
    'num_users': data['user'].x.shape[0],
    'num_pcs': data['pc'].x.shape[0],
    'num_urls': data['url'].x.shape[0],
    'train_size': data['user'].train_mask.sum().item(),
    'val_size': data['user'].val_mask.sum().item(),
    'insider_count': data['user'].y.sum().item()
}

with open('../data/logs/metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print("Data telah disimpan!")

Data telah disimpan!
