In [11]:
import pandas as pd 
import numpy as np
import re
import os
import math

In [4]:
# Peek at the first ten physical lines of the raw CSV. repr() makes the
# quoting and the embedded newline characters visible in the output.
with open("/Users/bowiechuang/Documents/GitHub/ProjectNexus/backend/emails.csv", "r") as raw_csv:
    for _ in range(10):
        print(repr(raw_csv.readline()))

'"file","message"\n'
'"allen-p/_sent_mail/1.","Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>\n'
'Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)\n'
'From: phillip.allen@enron.com\n'
'To: tim.belden@enron.com\n'
'Subject: \n'
'Mime-Version: 1.0\n'
'Content-Type: text/plain; charset=us-ascii\n'
'Content-Transfer-Encoding: 7bit\n'
'X-From: Phillip K Allen\n'


In [6]:

def _decode_csv_field(raw):
    """Convert the raw tail of a quoted CSV field into its logical value.

    ``raw`` is everything that follows the opening quote of the last field
    of a row, i.e. it still carries the closing quote and the row's line
    terminator. Both are removed, and doubled quotes (the CSV escape for a
    literal quote) are collapsed to a single quote.
    """
    trimmed = raw.rstrip("\r\n")
    if trimmed.endswith('"'):
        # Drop the closing quote together with whatever line terminator
        # followed it; a field without a closing quote is left untouched.
        raw = trimmed[:-1]
    return raw.replace('""', '"')


def iter_email_batches(path, batch_size=5000):
    """Stream a two-column ``"file","message"`` CSV as DataFrame batches.

    The file is scanned line by line. A line that begins with a quoted,
    quote-free field followed by ``","`` is treated as the start of a new
    row; every other line is appended to the message of the current row,
    which is how multi-line messages are reassembled.

    The message field is decoded before it is stored: the closing quote and
    row terminator are stripped and doubled quotes are un-escaped, so the
    yielded text is the field's actual value rather than its on-disk form.

    Parameters
    ----------
    path : str
        Location of the CSV file. The first line is assumed to be a header
        and is skipped.
    batch_size : int, default 5000
        Number of accumulated rows that triggers a yield.

    Yields
    ------
    pandas.DataFrame
        Frames with the columns ``file`` and ``message``. The final frame
        holds whatever rows remain and may be smaller than ``batch_size``.
    """
    row_start = re.compile(r'^"[^"]+","')
    records = []
    current_file = None
    current_msg_lines = []

    # Undecodable bytes are replaced instead of aborting the whole scan.
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        f.readline()  # skip header

        for line in f:
            if row_start.match(line):
                # A new row begins, so the previous one (if any) is complete.
                if current_file is not None:
                    records.append({
                        "file": current_file,
                        "message": _decode_csv_field("".join(current_msg_lines)),
                    })

                first_comma = line.find('","')
                current_file = line[1:first_comma]
                # Skip the three-character '","' separator.
                current_msg_lines = [line[first_comma + 3:]]

                if len(records) >= batch_size:
                    yield pd.DataFrame(records)
                    records = []
            elif current_file is not None:
                # Continuation line of a multi-line message.
                current_msg_lines.append(line)

    # The last row has no following row start to flush it.
    if current_file is not None:
        records.append({
            "file": current_file,
            "message": _decode_csv_field("".join(current_msg_lines)),
        })

    if records:
        yield pd.DataFrame(records)

# Usage: drain the batch generator, then stitch all batches into one frame
# with a fresh 0..n-1 index.
path = "/Users/bowiechuang/Documents/GitHub/ProjectNexus/backend/emails.csv"
dfs = list(iter_email_batches(path, batch_size=5000))

df_email = pd.concat(dfs, ignore_index=True)

In [7]:
# Bare last expression: Jupyter renders a preview of the concatenated frame.
df_email

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...
...,...,...
517396,zufferli-j/sent_items/95.,Message-ID: <26807948.1075842029936.JavaMail.e...
517397,zufferli-j/sent_items/96.,Message-ID: <25835861.1075842029959.JavaMail.e...
517398,zufferli-j/sent_items/97.,Message-ID: <28979867.1075842029988.JavaMail.e...
517399,zufferli-j/sent_items/98.,Message-ID: <22052556.1075842030013.JavaMail.e...


In [9]:
# Spot-check the parser by displaying the message text of the row at position 50.
df_email['message'].iloc[50]

'Message-ID: <27936946.1075855378542.JavaMail.evans@thyme>\nDate: Wed, 2 May 2001 10:27:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: tori.kuykendall@enron.com\nSubject: Re: 2- SURVEY - PHILLIP ALLEN\nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: Tori Kuykendall <Tori Kuykendall/HOU/ECT@ECT>\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\\'Sent Mail\nX-Origin: Allen-P\nX-FileName: pallen (Non-Privileged).pst\n\n\n---------------------- Forwarded by Phillip K Allen/HOU/ECT on 05/02/2001 05:26 AM ---------------------------\n\n\nIna Rangel\n05/01/2001 12:24 PM\nTo:\tPhillip K Allen/HOU/ECT@ECT\ncc:\t \nSubject:\tRe: 2- SURVEY - PHILLIP ALLEN   \n\n\n\n\n   \n-\nFull Name:        Phillip Allen\n\nLogin ID:  \tpallen\n\nExtension:  3-7041\n\nOffice Location:  EB3210C\n\nWhat type of computer do you have?  (Desktop,  Laptop,  Both)  Both\n\nDo you have a PDA?  If yes, what type d

In [12]:
# Persist df_email as a series of parquet files of at most `chunk_size` rows
# each, numbered from 1, inside `out_dir` (created if it does not exist).
out_dir = "/Users/bowiechuang/Documents/GitHub/ProjectNexus/backend/email_chunks"
os.makedirs(out_dir, exist_ok=True)

chunk_size = 100_000
n_rows = len(df_email)
n_chunks = math.ceil(n_rows / chunk_size)

for part_no in range(1, n_chunks + 1):
    start = (part_no - 1) * chunk_size
    # Positional slicing clamps at the end of the frame, so the last part
    # simply comes out shorter.
    df_chunk = df_email.iloc[start:start + chunk_size]

    out_path = os.path.join(out_dir, f"emails_part_{part_no}.parquet")
    df_chunk.to_parquet(out_path, index=False)

    print(f"Saved {out_path} ({len(df_chunk)} rows)")


Saved /Users/bowiechuang/Documents/GitHub/ProjectNexus/backend/email_chunks/emails_part_1.parquet (100000 rows)
Saved /Users/bowiechuang/Documents/GitHub/ProjectNexus/backend/email_chunks/emails_part_2.parquet (100000 rows)
Saved /Users/bowiechuang/Documents/GitHub/ProjectNexus/backend/email_chunks/emails_part_3.parquet (100000 rows)
Saved /Users/bowiechuang/Documents/GitHub/ProjectNexus/backend/email_chunks/emails_part_4.parquet (100000 rows)
Saved /Users/bowiechuang/Documents/GitHub/ProjectNexus/backend/email_chunks/emails_part_5.parquet (100000 rows)
Saved /Users/bowiechuang/Documents/GitHub/ProjectNexus/backend/email_chunks/emails_part_6.parquet (17401 rows)
