In [1]:
# The input dataset should already have the 'day' and 'is_working_hour' columns defined (done in an earlier step). It should also have the addresses from the to, cc and bcc columns already combined into a single 'recipients' column, separated by semicolons (;)
# We are using the chunks that we generated using the email_chunk_generator file

# The output dataset has six columns, 2 of which are user and day and 4 are calculated using this script
# This script calculates four attributes from the CERT email dataset. It generates the following attributes for each user for every single day
# numdistinctRecipientsDay
# numdistinctRecipientsNight
# numinternalRecipientsDay
# numinternalRecipientsNight

# For this dataset calculation, I used HPC Cluster (Magnolia) from University of Southern Mississippi
# On the HPC cluster, I used the Slurm Workload Manager; the job script for it is also discussed elsewhere in the repo

import pandas as pd
from multiprocessing import Pool
import os
import re

In [76]:
# Notebook-wide configuration; these names are read by the cells below.
temp_folder_results = 'temp_results'  # Temporary folder to store intermediate (per-chunk) result files
temp_folder_chunks= 'temp_chunks'  # Folder that stores intermediate chunked files (input of this notebook)
output_file = 'with_distinct_internal_counts.csv'  # Final combined CSV written after all chunks are processed
internal_domain = 'dtaa'  # Text looked for in a recipient address to treat it as internal

In [77]:
# Make sure the results folder exists before any chunk is processed.
# exist_ok=True makes re-running this cell harmless. The chunks folder is NOT created here.
os.makedirs(temp_folder_results, exist_ok=True)

In [80]:
def calculate_distinct_internal_counts(chunk_filename, chunks_dir=None, results_dir=None, domain=None):
    """Compute per-user, per-day recipient statistics for one chunk file.

    The chunk CSV must provide the columns ``user``, ``day``,
    ``is_working_hour`` (expected to be boolean) and ``recipients``
    (addresses separated by semicolons). For every (user, day) pair the
    following values are produced, split into working hours ("Day") and
    non-working hours ("Night"):

    * ``numdistinctRecipientsDay`` / ``numdistinctRecipientsNight``:
      number of distinct recipient addresses.
    * ``numinternalRecipientsDay`` / ``numinternalRecipientsNight``:
      number of messages (rows) that have at least one internal recipient.
      NOTE(review): the column name suggests a count of recipients, but the
      established behaviour counts messages; kept unchanged on purpose.

    Args:
        chunk_filename: File name of the chunk, relative to ``chunks_dir``.
        chunks_dir: Folder holding the chunk files. Defaults to the
            notebook-level ``temp_folder_chunks``.
        results_dir: Folder receiving the processed file. Defaults to the
            notebook-level ``temp_folder_results``.
        domain: Internal domain to look for in the part of an address after
            ``@``. Defaults to the notebook-level ``internal_domain``.

    Returns:
        Path of the written CSV (``processed_<chunk_filename>`` inside
        ``results_dir``).
    """
    # Resolve the notebook-level configuration lazily so explicit arguments win.
    chunks_dir = temp_folder_chunks if chunks_dir is None else chunks_dir
    results_dir = temp_folder_results if results_dir is None else results_dir
    domain = (internal_domain if domain is None else domain).lower()

    def split_recipients(cell):
        # A missing value means "no recipients" instead of crashing on NaN.
        if pd.isna(cell):
            return []
        return [address.strip() for address in str(cell).split(';') if address.strip()]

    def is_internal(address):
        # Only inspect the host part, so 'dtaa.someone@example.com' is not
        # mistaken for an internal address by a plain substring match.
        host = address.partition('@')[2].lower()
        return host == domain or host.endswith('.' + domain) or domain in host.split('.')

    def count_distinct(recipient_lists):
        return len(set().union(*recipient_lists))

    def count_internal_messages(recipient_lists):
        return sum(any(is_internal(address) for address in addresses) for addresses in recipient_lists)

    chunk = pd.read_csv(os.path.join(chunks_dir, chunk_filename))
    chunk = chunk.assign(recipients=chunk['recipients'].map(split_recipients))

    columns = [
        'user',
        'day',
        'numdistinctRecipientsDay',
        'numdistinctRecipientsNight',
        'numinternalRecipientsDay',
        'numinternalRecipientsNight',
    ]

    results = []
    for (user, day), group in chunk.groupby(['user', 'day']):
        working = group['is_working_hour']
        office_hours = group.loc[working, 'recipients']
        off_hours = group.loc[~working, 'recipients']

        results.append({
            'user': user,
            'day': day,
            'numdistinctRecipientsDay': count_distinct(office_hours),
            'numdistinctRecipientsNight': count_distinct(off_hours),
            'numinternalRecipientsDay': count_internal_messages(office_hours),
            'numinternalRecipientsNight': count_internal_messages(off_hours),
        })

    # Passing the column list guarantees a header row even for an empty chunk,
    # so the combined read-back step does not fail on an empty file.
    result_df = pd.DataFrame(results, columns=columns)

    # Save processed data to a new file in the results folder
    os.makedirs(results_dir, exist_ok=True)
    temp_filename = os.path.join(results_dir, f"processed_{chunk_filename}")
    result_df.to_csv(temp_filename, index=False)
    return temp_filename

In [81]:
# First we have to get the list of chunks that we have in the chunks folder
file_names = os.listdir(temp_folder_chunks)
# Keep only files (not directories). os.listdir returns entries in arbitrary order,
# so sort them to make the row order of the combined output reproducible.
chunk_filenames = sorted(
    file for file in file_names if os.path.isfile(os.path.join(temp_folder_chunks, file))
)

# Calculate the distinct-recipient and internal-recipient counts for each chunk in parallel
with Pool() as pool:
    result_filenames = pool.map(calculate_distinct_internal_counts, chunk_filenames)

# Each chunk produced its own result file, so we have to combine them.
# ignore_index=True gives the combined frame one continuous index instead of
# repeating every per-file index.
combined_result = pd.concat(
    [pd.read_csv(filename) for filename in result_filenames],
    ignore_index=True,
)

# Show the combined result, then save it to the final CSV file
print(combined_result)

combined_result.to_csv(output_file, index=False)

         user         day  numdistinctRecipientsDay  \
0     AAS3428  2010-01-09                         0   
1     ABK3081  2010-01-09                         0   
2     ABM3641  2010-01-09                         0   
3     ABP2917  2010-01-09                         0   
4     ACH1910  2010-01-09                         0   
...       ...         ...                       ...   
3965  ZVB2656  2010-01-12                         9   
3966  ZVS1637  2010-01-12                         5   
3967  ZWS3625  2010-01-12                         2   
3968  ZXM3086  2010-01-12                        21   
3969  ZZO2997  2010-01-12                        29   

      numdistinctRecipientsNight  numinternalRecipientsDay  \
0                             32                         0   
1                             15                         0   
2                              2                         0   
3                             10                         0   
4                            

In [None]:
# Clean up: delete every intermediate result file produced by the workers
for result_path in result_filenames:
    # Report (rather than fail on) a file that is already gone
    if not os.path.exists(result_path):
        print(f"File {result_path} not found.")
        continue
    os.remove(result_path)