In [16]:
import pandas as pd
import csv
import re

# Load the dataset with the correct parameters for handling potential parsing issues
emails_df = pd.read_csv('emails.csv', quoting=csv.QUOTE_NONE, on_bad_lines='skip', escapechar="\\")

# Removing quotes from column names if they exist
emails_df.columns = emails_df.columns.str.replace('"', '')

# Handling Missing Values
emails_df.fillna('', inplace=True)

# Text Preprocessing Function
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove email headers or unnecessary metadata (for demonstration, might need customization)
    text = re.sub(r'^[a-z]+:.*$', '', text)  # Remove lines that start with metadata-like patterns
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
    # Remove special characters (customize based on the dataset and needs)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text.strip()

# Apply text preprocessing to the 'message' column
emails_df['message'] = emails_df['message'].apply(preprocess_text)

# Display the first few rows of the cleaned dataframe
emails_df.head(14)


Unnamed: 0,file,message
0,"""allen-p/_sent_mail/1.""",messageid 187829811075855378110javamailevansthyme
1,Date: Mon,14 may 2001 163900 0700 pdt
2,From: phillip.allen@enron.com,
3,To: tim.belden@enron.com,
4,Subject:,
5,Mime-Version: 1.0,
6,Content-Type: text/plain; charset=us-ascii,
7,Content-Transfer-Encoding: 7bit,
8,X-From: Phillip K Allen,
9,X-To: Tim Belden <Tim Belden/Enron@EnronXGate>,


KeyError: 'file'

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Display plots inline in the Jupyter Notebook
%matplotlib inline



In [3]:
df = pd.read_csv('emails.csv', on_bad_lines='skip', quotechar='\x07')  # Using a non-standard character as quotechar

In [4]:
# Display the first few rows of the DataFrame
print(df.head())

                          "file"  \
0        "allen-p/_sent_mail/1."   
1                      Date: Mon   
2  From: phillip.allen@enron.com   
3       To: tim.belden@enron.com   
4                      Subject:    

                                           "message"  
0  "Message-ID: <18782981.1075855378110.JavaMail....  
1                   14 May 2001 16:39:00 -0700 (PDT)  
2                                                NaN  
3                                                NaN  
4                                                NaN  


In [5]:
print(df.describe())

                   "file" "message"
count             8178232   2387290
unique            1691401    632306
top     Mime-Version: 1.0          
freq               147232     25805


In [7]:
import pandas as pd

# Assuming df is your DataFrame after loading 'emails.csv'
# Recalculate chunk size and split the DataFrame
chunk_size = len(df) // 4
dfs = [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

# Ensure that the last chunk gets any remaining rows
if len(df) % 4 != 0:
    dfs[-2] = pd.concat([dfs[-2], dfs[-1]])
    dfs.pop(-1)

# Dynamically generate file names based on the number of chunks
file_names = [f"emails_part{i+1}.csv" for i in range(len(dfs))]

# Save each part to a new CSV file
for i, df_part in enumerate(dfs):
    file_path = f"/mnt/data/{file_names[i]}"  # Adjust path as needed
    df_part.to_csv(file_path, index=False)

# Return file paths for downloading
file_paths = [f"/mnt/data/{name}" for name in file_names]
file_paths


OSError: Cannot save file into a non-existent directory: '/mnt/data'

In [None]:
# Display the DataFrame's shape (number of rows and columns)
print(f'Shape of DataFrame: {df.shape}')

# Display a concise summary of the DataFrame, including the data types of each column
print(df.info())

In [None]:
# Display a concise summary of the DataFrame, including the data types of each column
print(df.info())

In [None]:
print(df.columns)


In [None]:
# Correcting the column name based on the actual column names in the DataFrame
df.dropna(subset=['"message"'], inplace=True)

In [None]:
import nltk
nltk.download('stopwords')


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Ensure nltk resources are available
nltk.download('stopwords')

# Set of English stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove URLs, HTML tags, and symbols
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Assuming you've previously loaded the DataFrame and corrected the KeyError
# Correcting the column name based on the actual column names in the DataFrame
df.dropna(subset=['"message"'], inplace=True)

# Apply the cleaning function to the correctly named 'message' column
df['message_clean'] = df['"message"'].apply(clean_text)


In [None]:
missing_messages = df['"message"'].isnull().sum()
print(f'Number of missing messages: {missing_messages}')


In [None]:
# Count missing values in the 'message' column after preprocessing
missing_messages_after = df['"message"'].isnull().sum()
print(f'Number of missing messages after preprocessing: {missing_messages_after}')


In [None]:
# Display a sample of the cleaned messages
print(df[['"message"', 'message_clean']].sample(5))
