In [None]:
# The input dataset is the original Cert Insider Threat Dataset
# We are using the latest r6.2 dataset

# This script drops the content column, which saves a lot of memory when calculating attributes
# We will use http content later to develop a different model
# Also, we added a 'day' column which allows us to group the data by day as we are calculating most of the attributes for each single day
# We are also adding is_working_hour column

# For this dataset calculation, I used HPC Cluster (Magnolia) from University of Southern Mississippi
# On the HPC cluster, I used the Slurm Workload Manager; the script for it is also discussed elsewhere in the repo

import pandas as pd
import os
from multiprocessing import Pool

In [None]:
# Location of the raw HTTP log from the CERT r6.2 dataset
file_path = '../../r6.2/http.csv'
# Destination of the reduced dataset produced by this notebook
output_file = 'reduced_http_dataset.csv'
# Folder that holds the per-chunk intermediate CSV files
temp_folder_combined = 'temp_combined'
# Number of rows read from the input CSV per chunk
chunk_size = 500_000

In [None]:
# Create the temporary folder up front; exist_ok=True means an already
# existing folder is accepted instead of raising an error
os.makedirs(name=temp_folder_combined, exist_ok=True)

In [None]:
# Bounds of the working day as datetime.time values (09:00 and 17:00)
working_hours_start, working_hours_end = (
    pd.to_datetime(clock).time() for clock in ('09:00', '17:00')
)

In [None]:
def is_working_hours(timestamp):
    """Tell whether a timestamp falls inside working hours on a weekday.

    Args:
        timestamp: A datetime-like object providing ``time()`` and
            ``weekday()`` (for example a pandas Timestamp).

    Returns:
        True when the day is Monday through Friday and the time of day lies
        between ``working_hours_start`` and ``working_hours_end`` (both bounds
        inclusive); False otherwise.
    """
    clock = timestamp.time()
    # weekday() numbers Monday as 0 and Sunday as 6, so 0-4 are the weekdays
    if timestamp.weekday() not in range(0, 5):
        return False
    return working_hours_start <= clock <= working_hours_end

In [1]:
# Worker that reduces one chunk of the HTTP log. Despite its name it does not
# combine any recipient columns; the name is kept so existing callers still work.
def combine_recipients(args):
    """Reduce one chunk of the dataset and write it to a temporary CSV file.

    The 'content' column is dropped (if present), 'date' is parsed to
    datetimes, and two columns are added: 'is_working_hour' (result of
    ``is_working_hours`` for each row) and 'day' (the calendar date of 'date').

    Args:
        args: A ``(chunk, index)`` tuple packed into a single argument so the
            function can be passed to ``Pool.map``. ``chunk`` is a DataFrame
            with a 'date' column; ``index`` is used in the temporary file name.

    Returns:
        None. The reduced chunk is written to
        ``{temp_folder_combined}/temp_combined_{index}.csv``.
    """
    chunk, index = args

    # drop(..., errors='ignore') always returns a new DataFrame, so the frame
    # passed in by the caller is never modified, with or without 'content'.
    reduced = chunk.drop(columns=['content'], errors='ignore')

    # Parse the timestamps once and reuse them for both derived columns
    timestamps = pd.to_datetime(reduced['date'], format='mixed')
    reduced['date'] = timestamps
    reduced['is_working_hour'] = timestamps.apply(is_working_hours)
    reduced['day'] = timestamps.dt.date  # Calendar date without the time part
    reduced.to_csv(f'{temp_folder_combined}/temp_combined_{index}.csv', index=False)

In [None]:
# Process the CSV chunk by chunk in parallel, writing one temporary file per chunk.
# The chunks are handed to the pool lazily through a generator: building a list
# of all chunks first would hold the entire input file in memory at once and
# defeat the purpose of reading it with `chunksize`.
with pd.read_csv(file_path, chunksize=chunk_size) as chunk_reader, Pool() as pool:
    tasks = ((chunk, i) for i, chunk in enumerate(chunk_reader))
    # Every result is None; draining the iterator inside the `with` block waits
    # for all chunks to be processed and re-raises any exception from a worker.
    num_chunks = sum(1 for _ in pool.imap(combine_recipients, tasks))

# Merge the temporary files in chunk order, which keeps the rows in the same
# order as the input file (no additional sorting is performed)
all_temp_files = [f'{temp_folder_combined}/temp_combined_{i}.csv' for i in range(num_chunks)]
combined_data = pd.concat([pd.read_csv(f) for f in all_temp_files], ignore_index=True)
combined_data.to_csv(output_file, index=False)

In [None]:
# Delete the per-chunk intermediate CSV files listed in all_temp_files
for temp_path in all_temp_files:
    os.remove(temp_path)