In [2]:
import pandas as pd
from pandas import Timedelta

# Read the four pre-cleaned activity tables from the working directory.
# The logon table's 'user_id' column (if present) is normalised to 'user', and the
# 'pc' column of the logon, file and device tables gets a source-specific name.
# email_df is loaded as-is and keeps its own 'pc' column.
logon_df = pd.read_csv("logon_cleaned.csv").rename(
    columns={"user_id": "user", "pc": "logon_pc"}
)
email_df = pd.read_csv("email_cleaned.csv")
file_df = pd.read_csv("file_cleaned.csv").rename(columns={"pc": "file_pc"})  # file events
device_df = pd.read_csv("device_cleaned.csv").rename(columns={"pc": "device_pc"})

In [3]:

# Parse the timestamp columns into datetimes so they can be compared and subtracted.
# Each frame is converted from its OWN column: assigning another frame's parsed Series
# aligns on the row index and silently attaches that other frame's timestamps.
email_df["date"] = pd.to_datetime(email_df["date"])
file_df["date"] = pd.to_datetime(file_df["date"])
# Fixed: this used to parse file_df["date"], replacing device event times with file event times.
device_df["date"] = pd.to_datetime(device_df["date"])

# Logon rows carry a session start and a session end rather than a single event time.
logon_df["start_time"] = pd.to_datetime(logon_df["start_time"])
logon_df["end_time"] = pd.to_datetime(logon_df["end_time"])



In [4]:
# List the column index of each frame, one per line: email, logon, file, device.
print(email_df.columns, logon_df.columns, file_df.columns, device_df.columns, sep="\n")

Index(['date', 'user', 'pc', 'int_to_int_mails', 'int_to_out_mails',
       'out_to_int_mails', 'out_to_out_mails', 'internal_recipients',
       'external_recipients', 'distinct_bcc', 'mails_with_attachments',
       'after_hour_mails'],
      dtype='object')
Index(['user', 'logon_pc', 'employee_name', 'role', 'start_time', 'end_time',
       'logon_hour', 'day_of_week', 'logon_on_own_pc', 'logon_on_other_pc',
       'logon_on_own_pc_normal', 'logon_on_own_pc_off_hour',
       'logon_on_other_pc_normal', 'logon_on_other_pc_off_hour'],
      dtype='object')
Index(['date', 'user', 'file_pc', 'documents_copy_own_pc',
       'documents_copy_other_pc', 'program_files_copy_own_pc',
       'program_files_copy_other_pc', 'documents_copy_own_pc_off_hour',
       'documents_copy_other_pc_off_hour',
       'program_files_copy_own_pc_off_hour',
       'program_files_copy_other_pc_off_hour'],
      dtype='object')
Index(['date', 'user', 'device_pc', 'day_of_week',
       'device_connects_on_own_pc

In [5]:
# Print the (rows, columns) shape of each frame, one per line: email, logon, file, device.
print(*(frame.shape for frame in (email_df, logon_df, file_df, device_df)), sep="\n")

(2612453, 12)
(366340, 14)
(439114, 11)
(405019, 8)


In [6]:
# Preview the first five email rows.
email_df.head(n=5)

Unnamed: 0,date,user,pc,int_to_int_mails,int_to_out_mails,out_to_int_mails,out_to_out_mails,internal_recipients,external_recipients,distinct_bcc,mails_with_attachments,after_hour_mails
0,2010-01-04 08:19:15,AAE0190,PC-8915,1,0,0,0,1,0,0,0,0
1,2010-01-04 08:19:50,AAE0190,PC-8915,1,0,0,0,3,0,0,0,0
2,2010-01-04 08:20:14,AAE0190,PC-8915,1,0,0,0,2,0,0,0,0
3,2010-01-04 08:29:44,AAE0190,PC-8915,1,0,0,0,1,0,0,0,0
4,2010-01-04 08:29:47,AAE0190,PC-8915,1,0,0,0,3,0,0,1,0


In [None]:
# Preview the first five logon rows.
logon_df.head(n=5)

In [None]:
# Preview the first five file-event rows.
file_df.head(n=5)

In [None]:
# Preview the first five device-event rows.
device_df.head(n=5)

In [None]:
# Number of distinct (non-null) users appearing in the logon table.
logon_df["user"].nunique(dropna=True)

In [None]:
# Number of distinct (non-null) users appearing in the email table.
email_df["user"].nunique(dropna=True)

In [None]:
# Print the (rows, columns) shape of the logon frame, then of the email frame.
print(logon_df.shape, email_df.shape, sep="\n")

In [8]:
# Rename the logon frame's 'own_pc' column to 'pc', modifying the frame in place.
# NOTE(review): the column listing printed earlier for logon_df contains no 'own_pc'
# column, in which case this call changes nothing — confirm it is still needed.
logon_df.rename(columns=dict(own_pc='pc'), inplace=True)

In [None]:
# Display the current column labels of the logon frame.
logon_df.columns

In [None]:
# Display the (rows, columns) shape of merged_df.
# NOTE(review): no cell visible in this notebook assigns merged_df — confirm it is
# created before this cell runs, otherwise this raises NameError on a fresh kernel.
merged_df.shape

In [None]:
# Display the column labels of merged_df.
# NOTE(review): no cell visible in this notebook assigns merged_df — confirm it is
# created before this cell runs, otherwise this raises NameError on a fresh kernel.
merged_df.columns

In [None]:
# Keep only the events that happened inside one of the same user's logon sessions.
# The event frames have no start_time/end_time columns of their own, so querying them
# directly fails; the session window is attached from logon_df first.
def _events_within_sessions(events, sessions):
    """Keep the rows of `events` whose `date` lies inside a logon session of the same user.

    Each event is paired with that user's most recent session starting at or before
    the event time (an as-of join), then kept only if it also precedes the session end.

    Parameters
    ----------
    events : pandas.DataFrame
        Event table with a `user` column and a datetime `date` column.
    sessions : pandas.DataFrame
        Session table with `user`, `start_time` and `end_time` columns.

    Returns
    -------
    pandas.DataFrame
        Matching event rows ordered by `date`, with the matched session's
        `start_time` and `end_time` attached as extra columns.

    Notes
    -----
    Only the latest-starting session is considered per event.
    NOTE(review): if one user can have overlapping sessions (e.g. on several PCs),
    an event inside an older, still-open session would be dropped — confirm.
    """
    # merge_asof needs both keys sorted and free of nulls.
    windows = (
        sessions[["user", "start_time", "end_time"]]
        .dropna(subset=["start_time"])
        .sort_values("start_time")
    )
    timeline = (
        events
        # Discard a window attached by a previous run so re-running the cell is safe.
        .drop(columns=["start_time", "end_time"], errors="ignore")
        .dropna(subset=["date"])
        .sort_values("date")
    )
    matched = pd.merge_asof(
        timeline,
        windows,
        left_on="date",
        right_on="start_time",
        by="user",
        direction="backward",
    )
    # Events with no matching session carry NaT bounds and fail this comparison.
    return matched.query("start_time <= date <= end_time")


email_df = _events_within_sessions(email_df, logon_df)
file_df = _events_within_sessions(file_df, logon_df)
device_df = _events_within_sessions(device_df, logon_df)


In [None]:


# Per-(user, date) totals for each activity source. Every listed counter column is
# summed, and reset_index() turns the group keys back into ordinary columns.
# NOTE(review): the email preview shown earlier has full timestamps in 'date', so
# these groups may be per-timestamp rather than per-day — confirm that is intended.

# Email counters
email_agg = (
    email_df
    .groupby(by=["user", "date"])
    .agg({
        counter: "sum"
        for counter in (
            "int_to_int_mails",
            "int_to_out_mails",
            "out_to_int_mails",
            "out_to_out_mails",
            "internal_recipients",
            "external_recipients",
            "distinct_bcc",
            "mails_with_attachments",
            "after_hour_mails",
        )
    })
    .reset_index()
)

# File-copy counters
file_agg = (
    file_df
    .groupby(by=["user", "date"])
    .agg({
        counter: "sum"
        for counter in (
            "documents_copy_own_pc",
            "documents_copy_other_pc",
            "program_files_copy_own_pc",
            "program_files_copy_other_pc",
            "documents_copy_own_pc_off_hour",
            "documents_copy_other_pc_off_hour",
            "program_files_copy_own_pc_off_hour",
            "program_files_copy_other_pc_off_hour",
        )
    })
    .reset_index()
)

# Device-connection counters
device_agg = (
    device_df
    .groupby(by=["user", "date"])
    .agg({
        counter: "sum"
        for counter in (
            "device_connects_on_own_pc_normal_hour",
            "device_connects_on_other_pc_normal_hour",
            "device_connects_on_own_pc_off_hour",
            "device_connects_on_other_pc_off_hour",
        )
    })
    .reset_index()
)



In [None]:
# Left-join the email, file and device aggregates onto feature_vector by (user, date);
# rows with no matching aggregate are kept and get NaN in the joined columns.
# NOTE(review): feature_vector is read here before any cell visible in this notebook
# assigns it — confirm where the base table is created.
feature_vector = (
    feature_vector
    .merge(email_agg, how="left", on=["user", "date"])
    .merge(file_agg, how="left", on=["user", "date"])
    .merge(device_agg, how="left", on=["user", "date"])
)

# Zero-filling of the NaNs (sessions without email, file, or device activity) is disabled:
#feature_vector.fillna(0, inplace=True)

# Session length in seconds: end_time minus start_time.
feature_vector["session_duration"] = (
    feature_vector["end_time"]
    .sub(feature_vector["start_time"])
    .dt.total_seconds()
)


In [None]:
# Print each frame's column index behind its label, one frame per line.
print(
    *(
        f"{label} {frame.columns!s}"
        for label, frame in (
            ("Logon Columns:", logon_df),
            ("Email Columns:", email_df),
            ("File Columns:", file_df),
            ("Device Columns:", device_df),
        )
    ),
    sep="\n",
)


In [None]:

# Drop unnecessary columns
#feature_vector.drop(columns=["start_time", "end_time"], inplace=True)

# Save the processed feature vector
#feature_vector.to_csv("feature_vector.csv", index=False)

# The drop and to_csv calls above are commented out, so this cell writes nothing.
# The message reports that instead of claiming the file was saved.
print("Feature vector generated; saving to 'feature_vector.csv' is disabled (to_csv is commented out).")


ModuleNotFoundError: No module named 'tensorflow'