In [58]:
# The input dataset must already contain the 'day' and 'is_working_hour' columns, which were added in an earlier step.
# We are using the chunks that we generated using the email_chunk_generator file

# The output dataset has eight columns: two of them are user and day, and the other six are calculated by this script.
# This script calculates six attributes from the email cert dataset. It generates the following attributes for each user for every single day:
# numAttachmentDay
# numAttachmentNight
# numEmailSentwithAttachDay
# numEmailSentwithAttachNight
# numEmailRecievedwithAttachDay
# numEmailRecievedwithAttachNight

# For this dataset calculation, I used the HPC cluster (Magnolia) at the University of Southern Mississippi.
# On the HPC cluster, I used the Slurm Workload Manager; the script for it is also discussed elsewhere in the repo.

import pandas as pd
from multiprocessing import Pool
import os

In [59]:
# Folder that stores the intermediate chunked input files
temp_folder_chunks = 'temp_chunks'
# Temporary folder that receives the intermediate per-chunk result files
temp_folder_results = 'temp_results'
# CSV file the final result is saved to
output_file = 'with_attachment_counts.csv'

In [60]:
# Create the intermediate results folder up front; with exist_ok=True an
# already existing folder is not an error.
os.makedirs(name=temp_folder_results, exist_ok=True)

In [97]:
def _sum_attachments(rows, column_name):
    """Sum ``attachment_count`` per (user, day) over ``rows``.

    Returns a DataFrame with the columns ``user``, ``day`` and ``column_name``.
    """
    return (
        rows.groupby(['user', 'day'])['attachment_count']
        .sum()
        .reset_index()
        .rename(columns={'attachment_count': column_name})
    )


def calculate_attachment_counts(chunk_filename, chunks_dir=None, results_dir=None):
    """Compute the per-user, per-day attachment totals for one chunk file.

    The chunk is read as CSV and must provide the columns ``user``, ``day``,
    ``is_working_hour``, ``activity`` and ``attachment_count``. For every
    (user, day) pair the ``attachment_count`` values are summed into six
    columns:

    * ``numEmailSentwithAttachDay`` / ``numEmailSentwithAttachNight``:
      rows whose activity is ``'Send'``, inside / outside working hours.
    * ``numEmailRecievedwithAttachDay`` / ``numEmailRecievedwithAttachNight``:
      rows whose activity is ``'View'``, inside / outside working hours.
    * ``numAttachmentDay`` / ``numAttachmentNight``:
      all rows, inside / outside working hours.

    NOTE(review): the ``numEmail*withAttach*`` columns hold the summed
    ``attachment_count``, not the number of emails that carry an attachment.
    Confirm this is the intended definition.

    Parameters
    ----------
    chunk_filename : str
        File name of the chunk, relative to ``chunks_dir``.
    chunks_dir : str, optional
        Folder holding the chunk files. Defaults to the module-level
        ``temp_folder_chunks``.
    results_dir : str, optional
        Folder the result CSV is written to; created if it does not exist.
        Defaults to the module-level ``temp_folder_results``.

    Returns
    -------
    str
        Path of the written result file,
        ``'<results_dir>/temp_result_<chunk_filename>'``.
    """
    if chunks_dir is None:
        chunks_dir = temp_folder_chunks
    if results_dir is None:
        results_dir = temp_folder_results

    df = pd.read_csv(f'{chunks_dir}/{chunk_filename}')

    working = df['is_working_hour']
    sent = df['activity'] == 'Send'
    received = df['activity'] == 'View'

    # Output column -> rows that contribute to it. The list order fixes the
    # column order of the result file.
    row_selections = [
        ('numEmailSentwithAttachDay', sent & working),
        ('numEmailSentwithAttachNight', sent & ~working),
        ('numEmailRecievedwithAttachDay', received & working),
        ('numEmailRecievedwithAttachNight', received & ~working),
        ('numAttachmentDay', working),
        ('numAttachmentNight', ~working),
    ]

    # Outer merges keep every (user, day) pair that occurs in any selection.
    merged_results = None
    for column_name, selection in row_selections:
        totals = _sum_attachments(df[selection], column_name)
        if merged_results is None:
            merged_results = totals
        else:
            merged_results = merged_results.merge(totals, on=['user', 'day'], how='outer')

    # A pair missing from a selection shows up as NaN after the outer merge;
    # it means "no attachments", so store it as the integer 0.
    count_columns = [column_name for column_name, _ in row_selections]
    merged_results[count_columns] = merged_results[count_columns].fillna(0).astype(int)

    # Make sure the target folder exists even when this function is used on
    # its own; exist_ok=True tolerates a folder that is already there.
    os.makedirs(results_dir, exist_ok=True)
    temp_filename = f'{results_dir}/temp_result_{chunk_filename}'

    # Save the results into a CSV file
    merged_results.to_csv(temp_filename, index=False)

    return temp_filename


In [99]:
# First we have to get the list of chunks that we have in the chunks folder.
# os.listdir returns entries in arbitrary order, so sort them to make the row
# order of the combined output reproducible between runs.
file_names = sorted(os.listdir(temp_folder_chunks))
# Filter only files (not directories)
chunk_filenames = [file for file in file_names if os.path.isfile(os.path.join(temp_folder_chunks, file))]

# The guard keeps worker processes from re-running this driver code when the
# file is executed as a script under the 'spawn' or 'forkserver' start methods.
# In a notebook __name__ is '__main__', so the cell still runs as before.
if __name__ == '__main__':
    # Fail early with a clear message instead of pd.concat's
    # "No objects to concatenate" error.
    if not chunk_filenames:
        raise FileNotFoundError(f"No chunk files found in '{temp_folder_chunks}'")

    # Calculate the attachment counts during day and night for each chunk
    with Pool() as pool:
        result_filenames = pool.map(calculate_attachment_counts, chunk_filenames)

    # The results are spread over one file per chunk, so we have to combine them
    combined_result = pd.concat(
        [pd.read_csv(filename) for filename in result_filenames],
        ignore_index=True,
    )

    # Save the final result to a CSV file
    combined_result.to_csv(output_file, index=False)

In [None]:
# # Optional cleanup: uncomment the lines below to remove the generated temp files
# for filename in result_filenames:
#     if os.path.exists(filename):  # Check if the file exists before removing
#         os.remove(filename)
#     else:
#         print(f"File {filename} not found.")