In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re

import numpy as np  # linear algebra
import polars as pl  # data processing, CSV file I/O (e.g. pl.read_csv)
from dateutil import parser
from IPython.display import display

from authorship_tool.util.path_util import DatasetPaths

Project root: /workspaces/shap-authorship-analysis-demo
Path: data/john_blake_2023/wordLists/adjectives/participle/adjectivesPastParticiple.csv
Path: data/john_blake_2023/wordLists/adjectives/participle/adjectivesPresentParticiple.csv
Path: data/john_blake_2023/wordLists/adjectives/limit/limit_adjectives.csv
Path: data/john_blake_2023/wordLists/adjectives/extraposition/adjectives_extraposition.csv
Path: data/liyanage_vijini_2022/Dataset/FullyGenerated
Path: data/liyanage_vijini_2022/Dataset/Hybrid_AbstractDataset
Path: data/uoa-thesis-2014-2017
Path: data/enron-corpus/emails.csv
Path: data/enron-corpus/emails_cleaned.csv
Path: out/text_data
Path: out/processed_text
Path: out/dataset
Path: out/lgbm/model
Path: out/shap/figure


In [3]:
datasetPaths = DatasetPaths()

# chunk = pd.read_csv(datasetPaths.enron_dataset, chunksize=5000)
# data = next(chunk)
data = pl.read_csv(datasetPaths.enron_dataset).with_row_index(name="index")

In [4]:
print(data.get_column("message")[2])

Message-ID: <24216240.1075855687451.JavaMail.evans@thyme>
Date: Wed, 18 Oct 2000 03:00:00 -0700 (PDT)
From: phillip.allen@enron.com
To: leah.arsdall@enron.com
Subject: Re: test
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: Leah Van Arsdall
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Dec2000\Notes Folders\'sent mail
X-Origin: Allen-P
X-FileName: pallen.nsf

test successful.  way to go!!!


In [5]:
display(data.head())

index,file,message
u32,str,str
0,"""allen-p/_sent_mail/1.""","""Message-ID: <18782981.10758553…"
1,"""allen-p/_sent_mail/10.""","""Message-ID: <15464986.10758553…"
2,"""allen-p/_sent_mail/100.""","""Message-ID: <24216240.10758556…"
3,"""allen-p/_sent_mail/1000.""","""Message-ID: <13505866.10758636…"
4,"""allen-p/_sent_mail/1001.""","""Message-ID: <30922949.10758636…"


In [6]:
def get_text(series: pl.Series, row_num_slicer: int) -> list[str]:
    """returns a Series with text sliced from a list split from each message. Row_num_slicer
    tells function where to slice split text to find only the body of the message."""
    result = []
    for row, message in enumerate(series):
        if row % 1000 == 0:
            print(f"{row / len(series) * 100:.1f}%...")
        message_words = message.split("\n")
        del message_words[:row_num_slicer]
        result.append("\n".join(message_words))
    return result


def get_row(series: pl.Series, row_num: int) -> list[str]:
    """returns a single row split out from each message. Row_num is the index of the specific
    row that you want the function to return."""
    result = []
    for row, message in enumerate(series):
        if row % 1000 == 0:
            print(f"{row / len(series) * 100:.1f}%...")
        message_words = message.split("\n")
        message_words = message_words[row_num]
        result.append(message_words)
    return result


def get_address(
    series: pl.Series,
) -> tuple[list[str]]:
    """returns a specified email address from each row in a Series"""
    address = re.compile("[\w\.-]+@[\w\.-]+\.\w+")
    result1 = []
    for i in range(len(series)):
        if i % 1000 == 0:
            print(f"{i / len(series) * 100:.1f}%...")
        for message in series:
            correspondents = re.findall(address, message)
            result1.append(correspondents[0])
    return result1


def standard_format(
    df: pl.DataFrame,
    series: pl.Series,
    string: str,
) -> pl.DataFrame:
    """Drops rows containing messages without some specified value in the expected locations.
    Returns original dataframe without these values. Don't forget to reindex after doing this!!!"""
    rows = []
    for row, message in enumerate(series):
        if row % 1000 == 0:
            print(f"{row / len(series) * 100:.1f}%...")
        message_words = message.split("\n")[:20]
        if not any(line.startswith(string) for line in message_words):
            rows.append(row)

    df = df.filter(~pl.col("index").is_in(rows))
    return df

  address = re.compile("[\w\.-]+@[\w\.-]+\.\w+")


In [7]:
x = len(data)
headers = ["From: ", "Subject: "]

print("---\nRemoving emails without headers...\n---")
for i, v in enumerate(headers):
    print(f"---\nChecking `{v}`...\n---")
    data = standard_format(data, data.get_column("message"), v)
    print("Done✅")
data = data.drop("index").with_row_index(name="index")
print(
    "Got rid of {} useless emails! That's {}% of the total number of messages in this dataset.".format(
        x - len(data),
        np.round(((x - len(data)) / x) * 100, decimals=2),
    )
)

---
Removing emails without headers...
---
---
Checking `From: `...
---
0.0%...
0.2%...
0.4%...
0.6%...
0.8%...
1.0%...
1.2%...
1.4%...
1.5%...
1.7%...
1.9%...
2.1%...
2.3%...
2.5%...
2.7%...
2.9%...
3.1%...
3.3%...
3.5%...
3.7%...


3.9%...
4.1%...
4.3%...
4.4%...
4.6%...
4.8%...
5.0%...
5.2%...
5.4%...
5.6%...
5.8%...
6.0%...
6.2%...
6.4%...
6.6%...
6.8%...
7.0%...
7.2%...
7.3%...
7.5%...
7.7%...
7.9%...
8.1%...
8.3%...
8.5%...
8.7%...
8.9%...
9.1%...
9.3%...
9.5%...
9.7%...
9.9%...
10.1%...
10.2%...
10.4%...
10.6%...
10.8%...
11.0%...
11.2%...
11.4%...
11.6%...
11.8%...
12.0%...
12.2%...
12.4%...
12.6%...
12.8%...
12.9%...
13.1%...
13.3%...
13.5%...
13.7%...
13.9%...
14.1%...
14.3%...
14.5%...
14.7%...
14.9%...
15.1%...
15.3%...
15.5%...
15.7%...
15.8%...
16.0%...
16.2%...
16.4%...
16.6%...
16.8%...
17.0%...
17.2%...
17.4%...
17.6%...
17.8%...
18.0%...
18.2%...
18.4%...
18.6%...
18.7%...
18.9%...
19.1%...
19.3%...
19.5%...
19.7%...
19.9%...
20.1%...
20.3%...
20.5%...
20.7%...
20.9%...
21.1%...
21.3%...
21.5%...
21.6%...
21.8%...
22.0%...
22.2%...
22.4%...
22.6%...
22.8%...
23.0%...
23.2%...
23.4%...
23.6%...
23.8%...
24.0%...
24.2%...
24.4%...
24.5%...
24.7%...
24.9%...
25.1%...
25.3%...
25.5%...
25.7%...
25.9%.

In [8]:
print("---\nExtracting text from message...\n---")
texts = pl.Series("text", get_text(data.get_column("message"), 15))
print("Done✅")

---
Extracting text from message...
---
0.0%...
0.2%...
0.4%...
0.6%...
0.8%...
1.0%...
1.2%...
1.4%...
1.6%...
1.8%...
2.0%...
2.2%...
2.4%...
2.6%...
2.8%...
3.0%...
3.2%...
3.4%...
3.6%...
3.8%...
4.0%...
4.2%...
4.4%...
4.6%...
4.8%...
5.0%...
5.2%...
5.4%...
5.6%...
5.8%...
6.0%...
6.2%...
6.4%...
6.6%...
6.8%...
7.0%...
7.2%...
7.4%...
7.6%...
7.8%...
8.0%...
8.2%...
8.4%...
8.6%...
8.8%...
9.0%...
9.2%...
9.4%...
9.6%...
9.8%...
10.0%...
10.2%...
10.4%...
10.6%...
10.8%...
11.0%...
11.2%...
11.4%...
11.6%...
11.8%...
12.0%...
12.2%...
12.4%...
12.6%...
12.8%...
13.0%...
13.2%...
13.4%...
13.6%...
13.8%...
14.0%...
14.2%...
14.4%...
14.6%...
14.8%...
15.0%...
15.2%...
15.4%...
15.6%...
15.8%...
16.0%...
16.2%...
16.4%...
16.6%...
16.8%...
17.0%...
17.2%...
17.4%...
17.6%...
17.8%...
18.0%...
18.2%...
18.4%...
18.6%...
18.8%...
19.0%...
19.2%...
19.4%...
19.6%...
19.8%...
20.0%...
20.2%...
20.4%...
20.6%...
20.8%...
21.0%...
21.2%...
21.4%...
21.6%...
21.8%...
22.0%...
22.2%...
22

In [9]:
print("---\nExtracting date from message...\n---")
date_rows = pl.Series("date", get_row(data.get_column("message"), 1))
print("Done✅")

---
Extracting date from message...
---
0.0%...
0.2%...
0.4%...
0.6%...
0.8%...
1.0%...
1.2%...
1.4%...
1.6%...
1.8%...
2.0%...
2.2%...
2.4%...
2.6%...
2.8%...
3.0%...
3.2%...
3.4%...
3.6%...
3.8%...
4.0%...
4.2%...
4.4%...
4.6%...
4.8%...
5.0%...
5.2%...
5.4%...
5.6%...
5.8%...
6.0%...
6.2%...
6.4%...
6.6%...
6.8%...
7.0%...
7.2%...
7.4%...
7.6%...
7.8%...
8.0%...
8.2%...
8.4%...
8.6%...
8.8%...
9.0%...
9.2%...
9.4%...
9.6%...
9.8%...
10.0%...
10.2%...
10.4%...
10.6%...
10.8%...
11.0%...
11.2%...
11.4%...
11.6%...
11.8%...
12.0%...
12.2%...
12.4%...
12.6%...
12.8%...
13.0%...
13.2%...
13.4%...
13.6%...
13.8%...
14.0%...
14.2%...
14.4%...
14.6%...
14.8%...
15.0%...
15.2%...
15.4%...
15.6%...
15.8%...
16.0%...
16.2%...
16.4%...
16.6%...
16.8%...
17.0%...
17.2%...
17.4%...
17.6%...
17.8%...
18.0%...
18.2%...
18.4%...
18.6%...
18.8%...
19.0%...
19.2%...
19.4%...
19.6%...
19.8%...
20.0%...
20.2%...
20.4%...
20.6%...
20.8%...
21.0%...
21.2%...
21.4%...
21.6%...
21.8%...
22.0%...
22.2%...
22

In [10]:
print("---\nExtracting senders from message...\n---")
sender_rows = pl.Series("sender", get_row(data.get_column("message"), 2))
print("Done✅")

---
Extracting senders from message...
---
0.0%...
0.2%...
0.4%...
0.6%...
0.8%...
1.0%...
1.2%...
1.4%...
1.6%...
1.8%...
2.0%...
2.2%...
2.4%...
2.6%...
2.8%...
3.0%...
3.2%...
3.4%...
3.6%...
3.8%...
4.0%...
4.2%...
4.4%...
4.6%...
4.8%...
5.0%...
5.2%...
5.4%...
5.6%...
5.8%...
6.0%...
6.2%...
6.4%...
6.6%...
6.8%...
7.0%...
7.2%...
7.4%...
7.6%...
7.8%...
8.0%...
8.2%...
8.4%...
8.6%...
8.8%...
9.0%...
9.2%...
9.4%...
9.6%...
9.8%...
10.0%...
10.2%...
10.4%...
10.6%...
10.8%...
11.0%...
11.2%...
11.4%...
11.6%...
11.8%...
12.0%...
12.2%...
12.4%...
12.6%...
12.8%...
13.0%...
13.2%...
13.4%...
13.6%...
13.8%...
14.0%...
14.2%...
14.4%...
14.6%...
14.8%...
15.0%...
15.2%...
15.4%...
15.6%...
15.8%...
16.0%...
16.2%...
16.4%...
16.6%...
16.8%...
17.0%...
17.2%...
17.4%...
17.6%...
17.8%...
18.0%...
18.2%...
18.4%...
18.6%...
18.8%...
19.0%...
19.2%...
19.4%...
19.6%...
19.8%...
20.0%...
20.2%...
20.4%...
20.6%...
20.8%...
21.0%...
21.2%...
21.4%...
21.6%...
21.8%...
22.0%...
22.2%...

In [11]:
print("---\nExtracting subject from message...\n---")
subject_rows = pl.Series("subject", get_row(data.get_column("message"), 4))
print("Done✅")

---
Extracting subject from message...
---
0.0%...
0.2%...
0.4%...
0.6%...
0.8%...
1.0%...
1.2%...
1.4%...
1.6%...
1.8%...
2.0%...
2.2%...
2.4%...
2.6%...
2.8%...
3.0%...
3.2%...
3.4%...
3.6%...
3.8%...
4.0%...
4.2%...
4.4%...
4.6%...
4.8%...
5.0%...
5.2%...
5.4%...
5.6%...
5.8%...
6.0%...
6.2%...
6.4%...
6.6%...
6.8%...
7.0%...
7.2%...
7.4%...
7.6%...
7.8%...
8.0%...
8.2%...
8.4%...
8.6%...
8.8%...
9.0%...
9.2%...
9.4%...
9.6%...
9.8%...
10.0%...
10.2%...
10.4%...
10.6%...
10.8%...
11.0%...
11.2%...
11.4%...
11.6%...
11.8%...
12.0%...
12.2%...
12.4%...
12.6%...
12.8%...
13.0%...
13.2%...
13.4%...
13.6%...
13.8%...
14.0%...
14.2%...
14.4%...
14.6%...
14.8%...
15.0%...
15.2%...
15.4%...
15.6%...
15.8%...
16.0%...
16.2%...
16.4%...
16.6%...
16.8%...
17.0%...
17.2%...
17.4%...
17.6%...
17.8%...
18.0%...
18.2%...
18.4%...
18.6%...
18.8%...
19.0%...
19.2%...
19.4%...
19.6%...
19.8%...
20.0%...
20.2%...
20.4%...
20.6%...
20.8%...
21.0%...
21.2%...
21.4%...
21.6%...
21.8%...
22.0%...
22.2%...

In [12]:
print("---\nConverting date to datetime object... (This takes a while)\n---")

dates_str = date_rows.str.strip_prefix("Date: ")
datetime_objects = [parser.parse(date) for date in dates_str]
dates = pl.Series("date", datetime_objects)

print("Done✅")

---
Converting date to datetime object...
---


Done✅


In [13]:
print("---\nStripping headers from text...\n---")

subjects = subject_rows.str.strip_prefix("Subject: ")
senders = sender_rows.str.strip_prefix("From: ")
print("Done✅")

---
Stripping headers from text...
---
Done✅


In [14]:
print("---\nFinalizing dataset...\n---")

data_cleaned = data.with_columns([dates, senders, subjects, texts]).drop(
    ["file", "message"]
)
print("Done✅")

---
Finalizing dataset...
---
Done✅


In [15]:
display(data_cleaned.head())

index,date,sender,subject,text
u32,"datetime[μs, UTC]",str,str,str
0,2001-05-14 23:39:00 UTC,"""phillip.allen@enron.com""","""""",""" Here is our forecast  """
1,2001-05-04 20:51:00 UTC,"""phillip.allen@enron.com""","""Re:""",""" Traveling to have a business …"
2,2000-10-18 10:00:00 UTC,"""phillip.allen@enron.com""","""Re: test""",""" test successful. way to go!!…"
3,2000-10-23 13:13:00 UTC,"""phillip.allen@enron.com""","""""",""" Randy,  Can you send me a sc…"
4,2000-08-31 12:07:00 UTC,"""phillip.allen@enron.com""","""Re: Hello""",""" Let's shoot for Tuesday at 11…"


In [16]:
data_cleaned.write_csv(datasetPaths.enron_dataset_cleaned)

: 