In [1]:
# prompt: Process the emails.csv csv into a table with pandas and rapids-cudf and the regex pattern
# "Message-ID:\s(?P<message_id>.*)\n
# Date:\s(?P<date_id>.*)\n
# From:\s(?P<from>.*)\n
# To:\s(?P<to>.*)\n
# Subject:\s?(.*)\n
# .*
# X-From:\s?(?P<X_from>.*?)\n
# .*X-FileName:.*?$\n\n
# (?P<body>.*)"gmxs

#!pip install --extra-index-url=https://pypi.nvidia.com cudf-cu11
import cudf
import pandas as pd
import re

%load_ext cudf.pandas
emails_path = r'/content/drive/MyDrive/Colab Notebooks/emails.csv'
gdrive_path = r'/content/drive/MyDrive/Colab Notebooks/'

from os.path import exists

In [2]:
# Load the raw e-mail table, using a pickle on Drive as a cache so the CSV
# only has to be parsed once.
emails_pickle_path = gdrive_path + r'emails_df.pickle'

if exists(emails_pickle_path):
  # NOTE: unpickling executes arbitrary code; only load a cache file that
  # this notebook wrote itself.
  emails_df = pd.read_pickle(emails_pickle_path)
else:
  # quoting=2 is csv.QUOTE_NONNUMERIC (not 3 / QUOTE_NONE): quotes are
  # honoured, which is what keeps the multi-line quoted message bodies intact.
  # on_bad_lines='skip' drops malformed rows instead of raising; it is the
  # replacement for the removed error_bad_lines=False option.
  emails_df = pd.read_csv(emails_path, quoting=2, on_bad_lines='skip')
  emails_df.to_pickle(emails_pickle_path)

# Summary of columns, dtypes and non-null counts.
emails_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517401 entries, 0 to 517400
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   file     517401 non-null  object
 1   message  517401 non-null  object
dtypes: object(2)
memory usage: 7.9+ MB


In [3]:
# Peek at the first three rows to confirm the two expected columns loaded.
emails_df.head(n=3)

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...


In [4]:
# Restrict all later steps to the first 100,000 e-mails (positional slice,
# same rows as head(100_000)).
sample_data_df = emails_df.iloc[:100_000]

In [5]:

# Header/body extraction pattern. It is compiled with three flags:
#   re.VERBOSE   - the physical line breaks inside the triple-quoted string are
#                  ignored; only the explicit \n and \s escapes match whitespace.
#   re.DOTALL    - `.` also matches newlines, so a capture may span several lines.
#   re.MULTILINE - `$` matches at the end of every line, not just end of text.
# Each named group becomes one column of the extracted frame; messages that do
# not match produce a row of NaN.
pattern = r"""Message-ID:\s(?P<message_id>.*)\n
Date:\s(?P<date_id>.*)\n
From:\s(?P<from>.*)\n
To:\s(?P<to>.*)\n
Subject:\s?(?P<subject>.*?)\n.*
Mime-Version:.*
X-From:\s?(?P<X_from>.*?)\n
.*X-FileName:.*?$\n\n
(?P<body>.*)"""

extract_pickle = r'extract_df.pickle'
extract_pickle_path = gdrive_path+extract_pickle
regex_flags = re.VERBOSE|re.MULTILINE|re.DOTALL

# The cache is keyed on file existence only: after changing `pattern` or the
# sample size, delete the pickle by hand or the stale result is reloaded.
if not exists(extract_pickle_path):
  extracted_data = sample_data_df['message'].str.extract(pattern, flags=regex_flags)
  extracted_data.to_pickle(extract_pickle_path)
else:
  extracted_data = pd.read_pickle(extract_pickle_path)


In [6]:
# Non-null counts show how many messages the regex matched; then display the
# first ten extracted rows.
extracted_data.info()
extracted_data.head(n=10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   message_id  94999 non-null  object
 1   date_id     94999 non-null  object
 2   from        94999 non-null  object
 3   to          94999 non-null  object
 4   subject     94999 non-null  object
 5   X_from      94999 non-null  object
 6   body        94999 non-null  object
dtypes: object(7)
memory usage: 5.3+ MB


Unnamed: 0,message_id,date_id,from,to,subject,X_from,body
0,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,Phillip K Allen,Here is our forecast\n\n
1,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,Re:,Phillip K Allen,Traveling to have a business meeting takes the...
2,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,Phillip K Allen,test successful. way to go!!!
3,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,,Phillip K Allen,"Randy,\n\n Can you send me a schedule of the s..."
4,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,Phillip K Allen,Let's shoot for Tuesday at 11:45.
5,<30965995.1075863688265.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 04:17:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,Phillip K Allen,"Greg,\n\n How about either next Tuesday or Thu..."
6,<16254169.1075863688286.JavaMail.evans@thyme>,"Tue, 22 Aug 2000 07:44:00 -0700 (PDT)",phillip.allen@enron.com,"david.l.johnson@enron.com, john.shafer@enron.com",,Phillip K Allen,Please cc the following distribution list with...
7,<17189699.1075863688308.JavaMail.evans@thyme>,"Fri, 14 Jul 2000 06:59:00 -0700 (PDT)",phillip.allen@enron.com,joyce.teixeira@enron.com,Re: PRC review - phone calls,Phillip K Allen,any morning between 10 and 11:30
8,<20641191.1075855687472.JavaMail.evans@thyme>,"Tue, 17 Oct 2000 02:26:00 -0700 (PDT)",phillip.allen@enron.com,mark.scott@enron.com,Re: High Speed Internet Access,Phillip K Allen,1. login: pallen pw: ke9davis\n\n I don't thi...
9,<30795301.1075855687494.JavaMail.evans@thyme>,"Mon, 16 Oct 2000 06:44:00 -0700 (PDT)",phillip.allen@enron.com,zimam@enron.com,FW: fixed forward or other Collar floor gas pr...,Phillip K Allen,---------------------- Forwarded by Phillip K ...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [8]:
# Columns whose text is merged into a single document per e-mail.
extracted_data_subset = ["from","subject","body"]

# Rows the regex failed to match are all-NaN. Casting NaN straight to str
# would yield the literal word "nan" and feed it to TF-IDF as if it were e-mail
# content, so missing values are replaced by '' before joining. Every row is
# kept so that row i of X still corresponds to row i of extracted_data.
extracted_data['combined_text'] = (
    extracted_data[extracted_data_subset]
    .fillna('')
    .astype(str)
    .apply(' '.join, axis=1)
)

# Vectorize the combined text (English stop words removed). X has one row per
# e-mail and one column per vocabulary term.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(extracted_data['combined_text'])



In [9]:
# Cluster the data
num_clusters = 3  # This can be tuned

# n_init is set explicitly: leaving it out triggers scikit-learn's
# FutureWarning about the default changing, and would make the clustering
# depend on the installed scikit-learn version.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(X)

# kmeans.labels_ holds one label per row of X, in row order. The assignment
# below is positional, so the two row counts must agree.
assert X.shape[0] == len(extracted_data), (
    "X and extracted_data have different row counts; re-run the vectorizing cell"
)
extracted_data['cluster'] = kmeans.labels_

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
# Visualize the clusters
# Imported here so this cell is self-contained; scikit-learn is already a
# dependency of the notebook.
from sklearn.decomposition import TruncatedSVD

# X is a sparse TF-IDF matrix. Calling X.toarray() to feed PCA would allocate
# a dense (n_emails x vocabulary) array and can exhaust memory, so the 2-D
# projection is done with TruncatedSVD, which accepts sparse input directly.
# (Unlike PCA it does not centre the data; that is fine for a visual overview.)
svd = TruncatedSVD(n_components=2, random_state=42)
scatter_plot_points = svd.fit_transform(X)

x_axis = scatter_plot_points[:, 0]
y_axis = scatter_plot_points[:, 1]
fig, ax = plt.subplots(figsize=(20,10))

# Colour by cluster label through a colormap, so the plot keeps working when
# num_clusters is changed (a fixed 3-colour list would raise IndexError).
ax.scatter(x_axis, y_axis, c=kmeans.labels_, cmap="tab10")
ax.set(
    title="E-mail clusters (TF-IDF, 2-D projection)",
    xlabel="component 1",
    ylabel="component 2",
)

# Label a handful of points with their (truncated) subject line. Iterating the
# DataFrame itself would yield column names, not rows, and labelling every
# point would be unreadable, so only the first few rows are annotated.
max_annotations = 20
subjects = extracted_data['subject'].fillna('').astype(str)
for i, txt in enumerate(subjects.head(max_annotations)):
    if txt:
        ax.annotate(txt[:30], (x_axis[i], y_axis[i]))

plt.show()