In [1]:
# The input dataset should already have the 'day' and 'is_working_hour' columns defined, which we did in an earlier step.
# We are also using the chunks that we generated using the email_chunk_generator file

# The output dataset has four columns, 2 of which are user and day.
# This script calculates two attributes from the email cert dataset. It generates the following attributes for each user for every single day:
# numURLAccessedDay
# numURLAccessedNight

# For this dataset calculation, I used HPC Cluster (Magnolia) from University of Southern Mississippi
# On the HPC cluster, I used the Slurm Workload Manager; the script for it is also discussed elsewhere in the repo

import pandas as pd
from multiprocessing import Pool
import os

In [2]:
# Folder that receives one intermediate result file per processed chunk
temp_folder_results = 'temp_results'
# Folder holding the pre-generated chunk files that are read as input
temp_folder_chunks = 'temp_chunks'
# Name of the final, combined CSV file
output_file = 'with_url_counts.csv'

In [3]:
# Create the results folder up front; an already existing folder is not an error
os.makedirs(name=temp_folder_results, exist_ok=True)

In [8]:
def calculate_activity_counts(chunk_filename):
    """Count the distinct URLs per user and day in one chunk, split by working hours.

    Reads the chunk CSV from ``temp_folder_chunks``, counts the unique ``url``
    values per ('user', 'day') separately for rows where ``is_working_hour`` is
    true (``numURLAccessedDay``) and false (``numURLAccessedNight``), and writes
    the merged counts as a CSV into ``temp_folder_results``.

    Parameters
    ----------
    chunk_filename : str
        File name (not a full path) of a chunk inside ``temp_folder_chunks``.

    Returns
    -------
    str
        Path of the result CSV written for this chunk.
    """
    group_keys = ['user', 'day']
    count_columns = ['numURLAccessedDay', 'numURLAccessedNight']

    chunk = pd.read_csv(f'{temp_folder_chunks}/{chunk_filename}')
    chunk['is_working_hour'] = chunk['is_working_hour'].astype(bool)
    working_mask = chunk['is_working_hour']

    def unique_urls(rows, column_name):
        # Number of distinct URLs per (user, day) for the given subset of rows
        per_user_day = rows.groupby(group_keys)['url'].nunique()
        return per_user_day.reset_index(name=column_name)

    day_counts = unique_urls(chunk[working_mask], count_columns[0])
    night_counts = unique_urls(chunk[~working_mask], count_columns[1])

    # The outer merge keeps (user, day) pairs that appear in only one period;
    # the count missing for the other period shows up as NaN and becomes 0.
    counts = pd.merge(day_counts, night_counts, on=group_keys, how='outer')
    counts = counts.fillna(0)
    counts[count_columns] = counts[count_columns].astype(int)

    # One result file per chunk, stored in the temp_folder_results folder
    temp_filename = f'{temp_folder_results}/temp_result_{chunk_filename}'
    counts.to_csv(temp_filename, index=False)

    return temp_filename


In [9]:
# Get the entries of the chunks folder and keep regular files only (not directories).
# os.listdir returns entries in arbitrary order, so the list is sorted to make the
# row order of the combined output reproducible from run to run.
file_names = os.listdir(temp_folder_chunks)
chunk_filenames = sorted(
    file for file in file_names
    if os.path.isfile(os.path.join(temp_folder_chunks, file))
)

# For a quick test run, restrict processing to a couple of chunks:
# chunk_filenames = ['temp_chunk_2010-01-09.csv','temp_chunk_2010-01-12.csv']

# Fail early with a clear message; otherwise pd.concat below raises a less helpful
# "No objects to concatenate" ValueError only after the pool has been started.
if not chunk_filenames:
    raise ValueError(f"No chunk files found in '{temp_folder_chunks}'.")

# Calculate the day/night unique-URL counts for every chunk in parallel.
# pool.map returns results in the same order as chunk_filenames.
with Pool() as pool:
    result_filenames = pool.map(calculate_activity_counts, chunk_filenames)

# The results are split into one file per chunk, so combine them into one frame.
# ignore_index gives the combined frame a single continuous index instead of
# repeating each file's own 0..n-1 index.
combined_result = pd.concat(
    [pd.read_csv(filename) for filename in result_filenames],
    ignore_index=True,
)

# Save the final result to a CSV file
combined_result.to_csv(output_file, index=False)

Unnamed: 0,numURLAccessedDay,user,day,numURLAccessedNight
0,0,AAS3428,2010-01-09,48
1,0,ABK3081,2010-01-09,32
2,0,ABM3641,2010-01-09,10
3,0,ABP2917,2010-01-09,16
4,0,ACH1910,2010-01-09,32
...,...,...,...,...
3995,14,ZVS1637,2010-01-12,4
3996,6,ZWS3625,2010-01-12,2
3997,7,ZXM3086,2010-01-12,8
3998,16,ZZO2997,2010-01-12,12


In [None]:
# Delete the per-chunk result files now that they have been combined
for filename in result_filenames:
    if not os.path.exists(filename):
        # Nothing to delete for this entry; report it and move on
        print(f"File {filename} not found.")
        continue
    os.remove(filename)