<h3>This file processes the merged.pkl file into dependent and independent variables, y.pkl and X.pkl respectively</h3>

It also writes a separate dataset for each insider scenario (X_S1.pkl … X_S4.pkl and y_S1.pkl … y_S4.pkl).<br>
merged.pkl is obtained from the original DANTE work.

In [24]:
import ast
import numpy as np
import pandas as pd

System Logs File

In [None]:
# NOTE: unpickling can execute arbitrary code - only load a merged.pkl you trust.
# The 'content' column is not used in this notebook, so it is dropped right away;
# reset_index() turns the stored index into regular columns.
df = (
    pd.read_pickle(r'merged.pkl')
    .drop(columns=['content'])
    .reset_index()
)

In [27]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,user,day,action_id
0,AAB1302,2010-01-02,"[[116, 116, 117, 116, 116, 116, 116, 116, 116,..."
1,AAB1302,2010-01-03,[nan]
2,AAB1302,2010-01-04,"[[116, 117, 116, 116, 116, 116, 116, 116, 117,..."
3,AAB1302,2010-01-05,"[[117, 116, 117, 116, 106, 106, 106, 106, 114,..."
4,AAB1302,2010-01-06,"[[116, 116, 117, 116, 117, 116, 116, 117, 117,..."


In [28]:
# (rows, columns) of the loaded frame, before any filtering.
df.shape

(1034000, 3)

In [29]:
# making action_id 1-D (Gemini)

def robust_convert(x):
    """Parse the stringified action list stored in ``x[0]`` into an array.

    Parameters
    ----------
    x : sequence
        Cell value whose first element is expected to be a string literal
        such as ``'[116, 117, ...]'``.

    Returns
    -------
    numpy.ndarray
        The parsed values as an array with at least one dimension, or
        ``np.array([np.nan])`` when ``x[0]`` cannot be read or parsed.
    """
    try:
        # This is the "happy path" for strings like '[116, 117, ...]'
        parsed = ast.literal_eval(x[0])
        # A scalar literal such as '7' would otherwise become a 0-d array,
        # which has no len() and would break any length-based filtering.
        return np.atleast_1d(np.array(parsed))
    except Exception:
        # Deliberately broad: any failure (missing value, non-string entry,
        # malformed literal, ...) is mapped to the same NaN placeholder.
        return np.array([np.nan])

# Replace every action_id cell with the array returned by robust_convert
# (cells that cannot be parsed become a one-element NaN array).
df['action_id'] = df['action_id'].apply(robust_convert)

In [30]:
# Preview the frame after action_id has been converted to arrays.
df.head()

Unnamed: 0,user,day,action_id
0,AAB1302,2010-01-02,"[116, 116, 117, 116, 116, 116, 116, 116, 116, ..."
1,AAB1302,2010-01-03,[nan]
2,AAB1302,2010-01-04,"[116, 117, 116, 116, 116, 116, 116, 116, 117, ..."
3,AAB1302,2010-01-05,"[117, 116, 117, 116, 106, 106, 106, 106, 114, ..."
4,AAB1302,2010-01-06,"[116, 116, 117, 116, 117, 116, 116, 117, 117, ..."


Answers File

In [None]:
# Ground-truth table of insider incidents
# (columns: dataset, scenario, details, user, start, end).
ans = pd.read_csv(r'insiders.csv')

In [32]:
# Preview the first rows of the insiders table.
ans.head()

Unnamed: 0,dataset,scenario,details,user,start,end
0,2.0,1,r2.csv,ONS0995,3/6/2010 1:41:56,3/20/2010 8:10:12
1,3.1,1,r3.1-1.csv,CSF0929,07/01/2010 01:24:58,07/16/2010 06:52:00
2,3.1,2,r3.1-2.csv,CCH0959,08/02/2010 10:34:31,09/30/2010 15:04:03
3,3.2,1,r3.2-1.csv,RCW0822,09/29/2010 21:10:27,10/15/2010 06:34:52
4,3.2,2,r3.2-2.csv,JCE0258,07/12/2010 08:16:02,09/03/2010 16:16:29


In [33]:
# Keep only the rows for dataset 5.2, then discard the columns that are not
# needed afterwards ('dataset' is constant after the filter; 'details' is unused).
ans = ans.loc[ans['dataset'].eq(5.2)].drop(columns=['dataset', 'details'])

In [35]:
# Convert the date-like columns to datetime64 so they can be compared.
# NOTE(review): the format is inferred; the insiders preview shows both
# unpadded ('3/6/2010') and padded ('07/01/2010') dates - confirm that the
# dataset 5.2 rows parse as month/day/year.
df['day'] = pd.to_datetime(df['day'])
for bound in ('end', 'start'):
    ans[bound] = pd.to_datetime(ans[bound])

Making the 'Malicious' Column

In [36]:
# Attach each insider's scenario and activity window to that user's rows;
# users that do not appear in `ans` get NaN / NaT in the new columns.
# NOTE(review): assumes every user occurs at most once in `ans`; otherwise the
# left merge duplicates that user's rows - confirm.
df = df.merge(ans, on='user', how='left')

# `day` has no time-of-day (it is midnight), whereas `start` is a full
# timestamp. Comparing against the start *date* keeps the first day of an
# incident inside the window; with the raw timestamp that day was labelled
# benign whenever the incident began after 00:00:00.
# Comparisons against NaT are False, so rows of non-insider users stay False.
window_start = df['start'].dt.normalize()
df['malicious'] = (df['day'] >= window_start) & (df['day'] <= df['end'])
df.drop(['start', 'end'], axis=1, inplace=True)

df.head()

Unnamed: 0,user,day,action_id,scenario,malicious
0,AAB1302,2010-01-02,"[116, 116, 117, 116, 116, 116, 116, 116, 116, ...",,False
1,AAB1302,2010-01-03,[nan],,False
2,AAB1302,2010-01-04,"[116, 117, 116, 116, 116, 116, 116, 116, 117, ...",,False
3,AAB1302,2010-01-05,"[117, 116, 117, 116, 106, 106, 106, 106, 114, ...",,False
4,AAB1302,2010-01-06,"[116, 116, 117, 116, 117, 116, 116, 117, 117, ...",,False


Limiting The Range of Action Lengths

In [37]:
# making the action_id arrays the same length

max_length = 250  # longer sequences are truncated to this many actions
min_length = 10   # shorter sequences are dropped entirely


def _fit_to_length(actions, length):
    """Return `actions` cut to `length` items, right-padded with zeros if shorter."""
    clipped = actions[:length]
    return np.pad(clipped, (0, length - len(clipped)),
                  'constant', constant_values=0)


# Drop the days with too few actions (this also removes the one-element NaN
# placeholders). .copy() gives an independent frame, so the column assignment
# below is not a write into a slice of the old one (no SettingWithCopyWarning).
df = df[df['action_id'].apply(len) >= min_length].copy()

# Truncate / zero-pad every remaining sequence to exactly max_length items.
df['action_id'] = df['action_id'].apply(lambda seq: _fit_to_length(seq, max_length))

In [38]:
# Preview the frame after the length filter and the truncate/pad step.
df.head()

Unnamed: 0,user,day,action_id,scenario,malicious
0,AAB1302,2010-01-02,"[116, 116, 117, 116, 116, 116, 116, 116, 116, ...",,False
2,AAB1302,2010-01-04,"[116, 117, 116, 116, 116, 116, 116, 116, 117, ...",,False
3,AAB1302,2010-01-05,"[117, 116, 117, 116, 106, 106, 106, 106, 114, ...",,False
4,AAB1302,2010-01-06,"[116, 116, 117, 116, 117, 116, 116, 117, 117, ...",,False
5,AAB1302,2010-01-07,"[116, 116, 117, 116, 116, 116, 116, 117, 107, ...",,False


In [39]:
df.shape # (rows, columns) remaining after the action-length limits were applied

(586809, 5)

In [40]:
# Persist the independent variable (action sequences -> X.pkl) and the
# dependent variable (malicious flag -> y.pkl) as pickled Series.
for column, target in (('action_id', 'X.pkl'), ('malicious', 'y.pkl')):
    df[column].to_pickle(target)

Scenario-specific Datasets

In [57]:
def make_scenario(scenario):
    """Return the rows of the global ``df`` whose 'scenario' value equals ``scenario``."""
    return df.loc[df['scenario'] == scenario]

In [68]:
# Write one feature file and one label file per insider scenario (1 through 4).
for i in (1, 2, 3, 4):
    subset = make_scenario(i)
    x_path, y_path = f'X_S{i}.pkl', f'y_S{i}.pkl'

    subset['action_id'].to_pickle(x_path)
    subset['malicious'].to_pickle(y_path)

    print(f"Extracted to {x_path} and {y_path}\n")

Extracted to X_S1.pkl and y_S1.pkl

Extracted to X_S2.pkl and y_S2.pkl

Extracted to X_S3.pkl and y_S3.pkl

Extracted to X_S4.pkl and y_S4.pkl

