In [5]:
# The input dataset should already have the 'day' and 'is_working_hour' columns defined, which we did in an earlier step.
# We are also using the chunks that we generated using the email_chunk_generator file

# The output dataset has six columns, 2 of which are user and day.
# This script calculates four attributes from the email cert dataset. It generates the following attributes for each user for every single day:
# numSendDay
# numSendNight
# numReceivedDay
# numReceivedNight

# For this dataset calculation, I used HPC Cluster (Magnolia) from University of Southern Mississippi
# On the HPC cluster, I used the Slurm Workload Manager, the script for which is also discussed somewhere in the repo

import pandas as pd
from multiprocessing import Pool
import os

In [6]:
# Folder that holds the intermediate chunked files read by this notebook
temp_folder_chunks = 'temp_chunks'

# Temporary folder that receives the intermediate per-chunk result files
temp_folder_results = 'temp_results'

# Name of the final, combined CSV file
output_file = 'with_sent_recieved_counts.csv'

In [7]:
# Create the results folder up front; this is a no-op when it is already there
os.makedirs(name=temp_folder_results, exist_ok=True)

In [None]:
# Helper: per-(user, day) counts of working-hour and off-hour rows
def _day_night_counts(rows, day_col, night_col):
    """Count working-hour and off-hour rows for every (user, day) pair.

    Parameters
    ----------
    rows : pandas.DataFrame
        Rows with 'user', 'day' and 'is_working_hour' columns.
    day_col, night_col : str
        Names of the output columns holding the working-hour and
        off-hour counts respectively.

    Returns
    -------
    pandas.DataFrame
        Columns 'user', 'day', `day_col`, `night_col`; one row per pair.
    """
    # Normalise the flag to a real boolean before negating it. Applying `~`
    # to an integer 0/1 column is a bitwise NOT (-1/-2), which would yield
    # negative night counts. astype(int) still raises on NaN/invalid values.
    working = rows['is_working_hour'].astype(int).astype(bool)

    flagged = rows[['user', 'day']].assign(**{
        day_col: working.astype(int),
        night_col: (~working).astype(int),
    })
    return flagged.groupby(['user', 'day'])[[day_col, night_col]].sum().reset_index()


# Function to calculate the number of emails sent and received during day and night for each user in a day
def calculate_emails_counts(chunk_filename, chunks_folder=None, results_folder=None):
    """Compute per-user, per-day email counts for one chunk and save them.

    Rows whose 'activity' is 'Send' are counted as sent emails and rows whose
    'activity' is 'View' are counted as received emails. Each is split by the
    'is_working_hour' flag into a day count and a night count.

    Parameters
    ----------
    chunk_filename : str
        File name (not path) of the chunk CSV inside the chunks folder.
    chunks_folder : str, optional
        Folder containing the chunk. Defaults to the notebook-level
        `temp_folder_chunks`.
    results_folder : str, optional
        Folder the result CSV is written to. Defaults to the notebook-level
        `temp_folder_results`.

    Returns
    -------
    str
        Path of the written result file,
        '<results_folder>/temp_result_<chunk_filename>'. The file has the
        columns user, day, numSendDay, numSendNight, numReceivedDay,
        numReceivedNight.
    """
    # Fall back to the notebook-level folders when none are passed explicitly
    if chunks_folder is None:
        chunks_folder = temp_folder_chunks
    if results_folder is None:
        results_folder = temp_folder_results

    count_columns = ['numSendDay', 'numSendNight', 'numReceivedDay', 'numReceivedNight']

    df = pd.read_csv(f'{chunks_folder}/{chunk_filename}')

    # Sent emails are the 'Send' rows, received emails are the 'View' rows
    sent_counts = _day_night_counts(
        df[df['activity'] == 'Send'], 'numSendDay', 'numSendNight'
    )
    received_counts = _day_night_counts(
        df[df['activity'] == 'View'], 'numReceivedDay', 'numReceivedNight'
    )

    # Outer merge keeps (user, day) pairs that only sent or only received;
    # the missing side shows up as NaN, which means "no such emails" -> 0.
    merged_results = sent_counts.merge(received_counts, on=['user', 'day'], how='outer')
    merged_results[count_columns] = merged_results[count_columns].fillna(0).astype(int)

    # Each chunk gets its own result file inside the results folder
    os.makedirs(results_folder, exist_ok=True)
    temp_filename = f'{results_folder}/temp_result_{chunk_filename}'
    merged_results.to_csv(temp_filename, index=False)

    return temp_filename  # Return the filename of the saved result

In [None]:
# First we have to get the list of chunks that we have in the chunks folder
file_names = os.listdir(temp_folder_chunks)
# Keep only files (not directories). os.listdir() returns entries in
# arbitrary order, so sort them to keep the output row order reproducible.
chunk_filenames = sorted(
    file for file in file_names
    if os.path.isfile(os.path.join(temp_folder_chunks, file))
)

# Fail early with a clear message instead of pd.concat's "No objects to concatenate"
if not chunk_filenames:
    raise ValueError(f"No chunk files found in '{temp_folder_chunks}'.")

# Calculate the number of emails sent and received during day and night for each chunk
with Pool() as pool:
    result_filenames = pool.map(calculate_emails_counts, chunk_filenames)

# Each chunk produced its own result file, so combine them into one frame.
# The counts are additive, so (user, day) pairs that appear in more than one
# chunk are summed into a single row; sort=False keeps first-appearance order,
# leaving the output unchanged when no pair is split across chunks.
combined_result = (
    pd.concat([pd.read_csv(filename) for filename in result_filenames], ignore_index=True)
    .groupby(['user', 'day'], as_index=False, sort=False)
    .sum()
)

# Save the final result to a CSV file
combined_result.to_csv(output_file, index=False)

In [None]:
# Clean up: delete every intermediate result file produced for the chunks
for filename in result_filenames:
    if not os.path.exists(filename):
        # Already gone; report it and move on to the next file
        print(f"File {filename} not found.")
        continue
    os.remove(filename)