In [1]:
# This file generates a csv that is used for Email Content Analysis

import os
import shutil
from multiprocessing import Pool

import pandas as pd

In [2]:
# Input: the raw e-mail CSV that is read chunk by chunk
file_path = '../../r6.2/email.csv'
# Output: the single merged CSV this notebook produces
output_file = 'reduced_email_content_dataset.csv'
# Scratch directory holding the per-chunk intermediate CSV files
temp_folder_combined = 'temp_combined'
# Value handed to the CSV reader as its chunk size
chunk_size = 500_000

In [3]:
# Create the scratch directory up front; exist_ok=True makes this a no-op
# when the directory is already there, so the cell can be re-run safely.
os.makedirs(name=temp_folder_combined, exist_ok=True)

In [4]:
# Worker: strip the columns that are not kept in the reduced dataset from one
# chunk and write the result to its own temporary CSV file.
def drop_unnecessary(args):
    """Drop the unwanted columns from one chunk and save it as a temp CSV.

    Parameters
    ----------
    args : tuple
        ``(chunk, index)`` where ``chunk`` is a pandas DataFrame and ``index``
        is the chunk's position in the input. Both are packed into a single
        tuple because the worker pool hands each task over as one argument.

    Returns
    -------
    None
        The result is written to
        ``{temp_folder_combined}/temp_combined_{index}.csv`` (no index column).
        The DataFrame passed in is not modified.
    """
    chunk, index = args

    # Columns removed from the output; every other column is kept unchanged.
    unwanted_columns = [
        'to', 'cc', 'bcc', 'from',
        'activity', 'size', 'attachments', 'recipients',
    ]

    # errors='ignore' skips any listed column that is absent from this chunk,
    # so a single drop replaces the eight separate "if present, drop" steps
    # (and copies the frame once instead of once per column).
    reduced = chunk.drop(columns=unwanted_columns, errors='ignore')

    reduced.to_csv(f'{temp_folder_combined}/temp_combined_{index}.csv', index=False)

In [None]:
# Process each chunk in parallel; every worker call writes one temporary file.
# The chunks are fed to the pool from a generator through imap, so they are
# read from disk as workers take them instead of first being collected into a
# list that holds the entire input file in memory.
with Pool() as pool:
    tasks = ((chunk, i) for i, chunk in enumerate(pd.read_csv(file_path, chunksize=chunk_size)))
    # imap is lazy: its results must be consumed inside the with-block, because
    # leaving the block terminates the pool. Counting them gives the number of
    # temp files that were written.
    n_chunks = sum(1 for _ in pool.imap(drop_unnecessary, tasks))

# Merge all temporary files into one output file, in chunk order (so the rows
# keep the order of the input file; nothing is sorted). The files are streamed
# straight into the output rather than being loaded back into one DataFrame,
# so memory use does not grow with the size of the dataset.
all_temp_files = [f'{temp_folder_combined}/temp_combined_{i}.csv' for i in range(n_chunks)]
with open(output_file, 'wb') as merged:
    for position, temp_file in enumerate(all_temp_files):
        with open(temp_file, 'rb') as part:
            if position > 0:
                # Every temp file starts with the header line written by
                # to_csv; keep only the first file's copy.
                part.readline()
            shutil.copyfileobj(part, merged)

In [None]:
# Clean up temporary files. A file that is already gone (for example because
# this cell was run a second time) is skipped instead of aborting the cleanup.
for temp_file in all_temp_files:
    try:
        os.remove(temp_file)
    except FileNotFoundError:
        # Nothing left to delete for this chunk.
        pass