In [23]:
import os
import pandas as pd

# Directory where CSV files are stored
csv_directory = '/Users/aimee/Documents/College/Courses/S24/17-735/17735-project/ExtractedData'

# Lists to hold data from each file
logon_data = []
logoff_data = []

# Iterate over each CSV file in the directory
for filename in os.listdir(csv_directory):
    if filename.endswith('.csv'):
        # Full path to the CSV file
        csv_path = os.path.join(csv_directory, filename)

        # Read the CSV file into a DataFrame
        df = pd.read_csv(csv_path, names=['User', 'Timestamp', 'PC', 'ActivityType', 'Action'])

        # Convert Timestamp to datetime
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%m/%d/%Y %H:%M:%S')

        # Extract time and hour from Timestamp
        df['Time'] = df['Timestamp'].dt.time
        df['Hour'] = df['Timestamp'].dt.hour

        # Append DataFrame to the list by action
        logon_data.append(df[df['Action'] == 'Logon'])
        logoff_data.append(df[df['Action'] == 'Logoff'])

# Concatenate all logon and logoff data into two separate DataFrames
logon_df = pd.concat(logon_data)
logoff_df = pd.concat(logoff_data)

# Group by 'User' and aggregate times for logon
df_user_logon_stats = logon_df.groupby('User')['Time'].agg(['min', 'max']).reset_index()
df_logon_mode = logon_df.groupby('User')['Time'].agg(lambda x: x.value_counts().index[0]).reset_index()
df_logon_mean = logon_df.groupby('User')['Hour'].mean().reset_index()

# Convert mean hour to int and then to time
df_logon_mean['Hour'] = df_logon_mean['Hour'].astype(int)
df_logon_mean['Hour'] = pd.to_datetime(df_logon_mean['Hour'], format='%H').dt.time

# Add mode and mean to the logon stats DataFrame
df_user_logon_stats['mode'] = df_logon_mode['Time']
df_user_logon_stats['mean'] = df_logon_mean['Hour']

# Group by 'User' and aggregate times for logoff
df_user_logoff_stats = logoff_df.groupby('User')['Time'].agg(['min', 'max']).reset_index()
df_logoff_mode = logoff_df.groupby('User')['Time'].agg(lambda x: x.value_counts().index[0]).reset_index()
df_logoff_mean = logoff_df.groupby('User')['Hour'].mean().reset_index()

# Convert mean hour to int and then to time
df_logoff_mean['Hour'] = df_logoff_mean['Hour'].astype(int)
df_logoff_mean['Hour'] = pd.to_datetime(df_logoff_mean['Hour'], format='%H').dt.time

# Add mode and mean to the logoff stats DataFrame
df_user_logoff_stats['mode'] = df_logoff_mode['Time']
df_user_logoff_stats['mean'] = df_logoff_mean['Hour']


In [24]:
df_user_logon_stats

Unnamed: 0,User,min,max,mode,mean
0,DTAA/AAA0371,07:35:01,13:46:57,07:54:21,09:00:00
1,DTAA/AAC0344,07:50:03,08:09:53,08:06:07,07:00:00
2,DTAA/AAC0599,07:50:03,08:09:58,07:56:28,07:00:00
3,DTAA/AAH0734,08:20:05,08:39:58,08:26:55,08:00:00
4,DTAA/AAK0658,08:20:00,08:39:54,08:25:59,08:00:00
...,...,...,...,...,...
995,DTAA/ZGH0528,08:35:06,14:54:12,08:36:42,10:00:00
996,DTAA/ZKE0662,07:20:03,13:36:39,07:32:54,09:00:00
997,DTAA/ZKH0388,06:35:00,16:35:13,06:39:22,08:00:00
998,DTAA/ZKN0548,08:20:05,08:39:46,08:25:35,08:00:00


In [25]:
df_user_logoff_stats

Unnamed: 0,User,min,max,mode,mean
0,DTAA/AAA0371,08:33:18,14:54:58,14:48:22,13:00:00
1,DTAA/AAC0344,15:55:00,16:09:58,16:09:07,15:00:00
2,DTAA/AAC0599,15:55:01,16:09:53,16:04:27,15:00:00
3,DTAA/AAH0734,17:25:00,17:39:59,17:34:26,17:00:00
4,DTAA/AAK0658,16:25:05,16:39:59,16:38:32,16:00:00
...,...,...,...,...,...
995,DTAA/ZGH0528,17:40:01,17:54:58,17:50:07,17:00:00
996,DTAA/ZKE0662,19:25:02,19:39:59,19:38:17,19:00:00
997,DTAA/ZKH0388,07:34:06,17:54:59,17:50:09,16:00:00
998,DTAA/ZKN0548,18:25:01,18:39:54,18:32:07,18:00:00


In [28]:
import numpy as np
def dtt2timestamp(dtt):
  time_in_sec = (dtt.hour*60 + dtt.minute) * 60 + dtt.second
  return time_in_sec

df_user_logon_stats_sec = df_user_logon_stats

time_columns = ['min', 'max', 'mode', 'mean']
for col in time_columns:
    df_user_logon_stats_sec[col] = pd.to_datetime(df_user_logon_stats[col], format='%H:%M:%S').dt.time


min_ts = [dtt2timestamp(dtt) for dtt in df_user_logon_stats_sec['min']]
max_ts = [dtt2timestamp(dtt) for dtt in df_user_logon_stats_sec['max']]
mode_ts = [dtt2timestamp(dtt) for dtt in df_user_logon_stats_sec['mode']]
mean_ts = [dtt2timestamp(dtt) for dtt in df_user_logon_stats_sec['mean']]

df_user_logon_stats_sec['min_ts'] = min_ts
df_user_logon_stats_sec['max_ts'] = max_ts
df_user_logon_stats_sec['mode_ts'] = mode_ts
df_user_logon_stats_sec['mean_ts'] = mean_ts


df_user_logon_stats_sec.drop(['min','max','mode','mean'], axis=1)

df_user_logoff_stats_sec = df_user_logoff_stats

for col in time_columns:
    df_user_logoff_stats_sec[col] = pd.to_datetime(df_user_logoff_stats[col], format='%H:%M:%S').dt.time


min_ts = [dtt2timestamp(dtt) for dtt in df_user_logoff_stats_sec['min']] 
max_ts = [dtt2timestamp(dtt) for dtt in df_user_logoff_stats_sec['max']]
mode_ts = [dtt2timestamp(dtt) for dtt in df_user_logoff_stats_sec['mode']]
mean_ts = [dtt2timestamp(dtt) for dtt in df_user_logoff_stats_sec['mean']]


df_user_logoff_stats_sec['min_ts'] = min_ts
df_user_logoff_stats_sec['max_ts'] = max_ts
df_user_logoff_stats_sec['mode_ts'] = mode_ts
df_user_logoff_stats_sec['mean_ts'] = mean_ts


df_user_logoff_stats_sec.drop(['min', 'max','mode','mean'], axis=1)


df_log_on_off_stats = pd.DataFrame()

df_log_on_off_stats['User'] = df_user_logon_stats_sec['User']
df_log_on_off_stats['on_min_ts'] = df_user_logon_stats_sec['min_ts']
df_log_on_off_stats['on_max_ts'] = df_user_logon_stats_sec['max_ts']
df_log_on_off_stats['on_mode_ts'] = df_user_logon_stats_sec['mode_ts']
df_log_on_off_stats['on_mean_ts'] = df_user_logon_stats_sec['mean_ts']
df_log_on_off_stats['off_min_ts'] = df_user_logon_stats_sec['min_ts']
df_log_on_off_stats['off_max_ts'] = df_user_logon_stats_sec['max_ts']
df_log_on_off_stats['off_mode_ts'] = df_user_logon_stats_sec['mode_ts']
df_log_on_off_stats['off_mean_ts'] = df_user_logon_stats_sec['mean_ts']


#df_log_on_off_stats.dtypes

log_stats = df_log_on_off_stats.drop(['User'], axis=1)
log_stats_array = np.array(log_stats)  # Changed from np.matrix to np.array
print(log_stats)

     on_min_ts  on_max_ts  on_mode_ts  on_mean_ts  off_min_ts  off_max_ts  \
0        27301      49617       28461       32400       27301       49617   
1        28203      29393       29167       25200       28203       29393   
2        28203      29398       28588       25200       28203       29398   
3        30005      31198       30415       28800       30005       31198   
4        30000      31194       30359       28800       30000       31194   
..         ...        ...         ...         ...         ...         ...   
995      30906      53652       31002       36000       30906       53652   
996      26403      48999       27174       32400       26403       48999   
997      23700      59713       23962       28800       23700       59713   
998      30005      31186       30335       28800       30005       31186   
999      32702      33894       33569       32400       32702       33894   

     off_mode_ts  off_mean_ts  
0          28461        32400  
1          

In [44]:
from sklearn.model_selection import train_test_split
# train, test = train_test_split(log_stats_array, test_size=0.2)
train = log_stats_array
test = log_stats_array

In [63]:
import numpy as np
import math

class IsolationForestCustom:
    def __init__(self, n_estimators=100, max_samples='auto', max_depth=1, contamination=0.1, random_state=None, epsilon=0.1):
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_depth = max_depth
        self.contamination = contamination
        self.random_state = random_state
        self.epsilon = epsilon
        self.test_data_size = 0
        self.trees = []

    def fit(self, X):
        num_samples, num_features = X.shape
        self.test_data_size = num_samples
        self.max_samples = self.max_samples if self.max_samples != 'auto' else min(256, num_samples)

        for _ in range(self.n_estimators):
            idx = np.random.choice(num_samples, size=self.max_samples, replace=False)
            tree = self._fit_tree(X[idx], max_depth=self.max_depth)
            self.trees.append(tree)

    def _fit_tree(self, X, max_depth):
        if max_depth <= 0 or len(X) <= 1:
            return {'isLeaf': True,
                    'size': len(X)
            }
        num_samples, num_features = X.shape
        feature_idx = np.random.randint(0, num_features)
        split_val = np.random.uniform(X[:, feature_idx].min(), X[:, feature_idx].max())
        
        left_idxs = X[:, feature_idx] < split_val
        right_idxs = ~left_idxs
        
        left_subtree = self._fit_tree(X[left_idxs], max_depth - 1)
        right_subtree = self._fit_tree(X[right_idxs], max_depth - 1)

        return {
            'isLeaf': False,
            'feature_idx': feature_idx,
            'split_val': split_val,
            'left': left_subtree,
            'right': right_subtree
        }

    def c(self, sample_size):
        if (sample_size < 2):
            return 0
        elif (sample_size == 2):
            return 1
        return 2.0 * (math.log(sample_size - 1) + 0.5772156649) - 2.0 * (sample_size - 1) / self.test_data_size

    def predict(self, X):
        num_samples = len(X)
        anomaly_scores = np.zeros(num_samples)
        
        for i, sample in enumerate(X):
            for j, tree in enumerate(self.trees):
                anomaly_scores[i] += self._path_length(sample, tree)
            anomaly_scores[i] = 2 ** (-anomaly_scores[i] / self.n_estimators / self.c(self.max_samples))

        return anomaly_scores

    def _path_length(self, x, tree):
        current_node = tree  # Start from the root node
        path_length = 0
        
        while True:
            if 'size' in current_node:
                return path_length + self.c(current_node['size'])
            feature_idx = current_node['feature_idx']
            split_val = current_node['split_val']
            
            if x[feature_idx] < split_val:  # Traverse left
                current_node = current_node['left']
            else:  # Traverse right
                current_node = current_node['right']
                    
                    
            path_length += 1
            

# Example usage:
# Assuming X_train is your training data
iforest_custom = IsolationForestCustom(n_estimators=100, contamination=0.1, epsilon=0.1)
iforest_custom.fit(train)

# Assuming X_test is your test data
# Use the trained model to predict anomaly scores for the test data
anomaly_scores = iforest_custom.predict(test)
print(anomaly_scores)

[0.48213125 0.48018294 0.48018294 0.47987699 0.47987699 0.47932073
 0.48032452 0.4824452  0.49124276 0.48018294 0.48192336 0.48146767
 0.48018294 0.50932902 0.49660178 0.4834257  0.48032452 0.48613401
 0.47932073 0.48062785 0.48459517 0.47987699 0.47932073 0.48304656
 0.51418715 0.48144278 0.48387444 0.48306135 0.513792   0.47932073
 0.48283404 0.48018294 0.48304656 0.50985164 0.48018294 0.48018294
 0.49309068 0.52231232 0.48018294 0.48283404 0.51525011 0.49800353
 0.48395701 0.48174803 0.49164034 0.53160957 0.48032452 0.48458983
 0.50412154 0.49660178 0.47987699 0.49473384 0.48203737 0.48018294
 0.48459517 0.52141689 0.52249388 0.48032452 0.4847842  0.47987699
 0.48283404 0.48018294 0.48032452 0.48526307 0.53085345 0.51633617
 0.48753605 0.47987699 0.48018294 0.48270174 0.48032452 0.48018294
 0.48213125 0.49309068 0.48203737 0.47987699 0.48018294 0.48018294
 0.4824452  0.48062785 0.48385178 0.5054324  0.48018294 0.47987699
 0.48191556 0.48213125 0.48018294 0.47987699 0.48018294 0.4830

In [67]:
anomaly_count = 0
norm_count = 0
for score in anomaly_scores:
    if score > 0.5:
        anomaly_count += 1
    else:
        norm_count += 1

In [68]:
anomaly_count

141

In [69]:
norm_count

859

In [71]:
log_ascore = anomaly_scores

df_user_log_result = pd.DataFrame()
df_user_log_result['User'] = df_user_logoff_stats_sec['User']
df_user_log_result['anomaly_score'] = log_ascore
# df_user_log_result.to_csv('user_log_result.csv')
print(df_user_log_result)


outliers = df_user_log_result.loc[df_user_log_result['anomaly_score'] > 0.5]
print(outliers)

             User  anomaly_score
0    DTAA/AAA0371       0.482131
1    DTAA/AAC0344       0.480183
2    DTAA/AAC0599       0.480183
3    DTAA/AAH0734       0.479877
4    DTAA/AAK0658       0.479877
..            ...            ...
995  DTAA/ZGH0528       0.491176
996  DTAA/ZKE0662       0.483500
997  DTAA/ZKH0388       0.487292
998  DTAA/ZKN0548       0.479877
999  DTAA/ZRR0705       0.493407

[1000 rows x 2 columns]
             User  anomaly_score
13   DTAA/ABS0726       0.509329
24   DTAA/AFF0760       0.514187
28   DTAA/AFO0022       0.513792
33   DTAA/AGW0182       0.509852
37   DTAA/AIB0797       0.522312
..            ...            ...
970  DTAA/WTA0867       0.512573
972  DTAA/XCB0445       0.512573
975  DTAA/XIN0791       0.508041
987  DTAA/YQW0689       0.510343
990  DTAA/ZAB0889       0.506698

[141 rows x 2 columns]
