In [1]:
# Download data

!unzip -q data.zip
# --2024-05-02 14:35:03--  https://hr-projects-assets-

--2024-05-03 21:51:26--  https://hr-projects-assets-prod.s3.amazonaws.com/27jm51388tr/c25eedc3c491481b91107b71c114a8e1/data.zip
Resolving hr-projects-assets-prod.s3.amazonaws.com (hr-projects-assets-prod.s3.amazonaws.com)... 54.231.172.241, 52.216.110.155, 3.5.28.103, ...
Connecting to hr-projects-assets-prod.s3.amazonaws.com (hr-projects-assets-prod.s3.amazonaws.com)|54.231.172.241|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 195920296 (187M) [binary/octet-stream]
Saving to: ‘data.zip’


2024-05-03 21:51:31 (39.5 MB/s) - ‘data.zip’ saved [195920296/195920296]



In [2]:
import os
import pandas as pd

print("Train emails: %s" % len(os.listdir("data/train/")))
print("Test emails: %s" % len(os.listdir("data/test/")))

# Jupyter might crash if you go inside the data folders
# we suggest loading some files to check their contents

train_data_folder = "data/train/"
sample_filename = os.listdir(train_data_folder)[0]
sample_file_path = os.path.join(train_data_folder, sample_filename)

with open(sample_file_path, "rb") as fp:
    contents = fp.read()
    print(contents)

# If you'd like to install packages that aren't installed by default, list them here.
# This will ensure your notebook has all the dependencies and works everywhere
import sys
# !{sys.executable} -m pip install sklearn

!{sys.executable} -m pip install scikit-learn

# Import libraries
import pandas as pd
import matplotlib.pyplot as plt

# Read the data
train_df = pd.read_csv('train.csv')

# Build and train the model
# Since the provided data doesn't contain the actual content of emails,
# we need to use other techniques for text classification such as TF-IDF or word embeddings.
# For simplicity, let's start with a TF-IDF approach and use a simple classifier like Logistic Regression.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Preprocessing
vectorizer = TfidfVectorizer(max_features=1000)  # Limiting features to avoid memory issues

# Model
classifier = LogisticRegression()

# Pipeline
pipeline = Pipeline([('vectorizer', vectorizer),
                     ('classifier', classifier)])

# Train the model
pipeline.fit(train_df['email'], train_df['label'])

# Test using the validation set (or part of it) and your own metrics / plots
# Since we don't have a validation set, let's split the training data into train and validation sets.
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train_df['email'], train_df['label'], test_size=0.2, random_state=42)

# Train the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the validation set
val_predictions = pipeline.predict(X_val)

# Evaluate the model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Calculate evaluation metrics
accuracy = accuracy_score(y_val, val_predictions)
precision = precision_score(y_val, val_predictions, zero_division=1)  # Setting zero_division to 1 to handle warning
recall = recall_score(y_val, val_predictions)
f1 = f1_score(y_val, val_predictions)
conf_matrix = confusion_matrix(y_val, val_predictions)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)

# Additional code: use as many cells as needed.
# The ones provided here are only an exemplification of the workflow and they might not be enough / representative for your thought process.


Train emails: 61571
Test emails: 13007
b'From tbarrettknohv@eugegroove.com  Tue Apr 10 12:03:33 2007\nReturn-Path: <tbarrettknohv@eugegroove.com>\nReceived: from plg2.math.uwaterloo.ca (plg2.math.uwaterloo.ca [129.97.186.80])\n\tby speedy.uwaterloo.ca (8.12.8/8.12.5) with ESMTP id l3AG3X0I031389\n\tfor <theplg@speedy.uwaterloo.ca>; Tue, 10 Apr 2007 12:03:33 -0400\nReceived: from eugegroove.com ([58.42.145.189])\n\tby plg2.math.uwaterloo.ca (8.13.8/8.13.8) with SMTP id l3AG2Uqt000561;\n\tTue, 10 Apr 2007 12:02:35 -0400 (EDT)\nMessage-ID: <ccea01c77bcb$d25f31a0$82d8d6bb@tbarrettknohv>\nFrom: "Dewey Palmer" <tbarrettknohv@eugegroove.com>\nTo: "Valerie" <dmason@plg2.math.uwaterloo.ca>\nCc: "Ulysses" <migod@plg2.math.uwaterloo.ca>,\n   "Gregorio Long" <holt@plg2.math.uwaterloo.ca>,\n   "Chantal" <dsvetinovic@plg2.math.uwaterloo.ca>,\n   "Tennie Miller" <y5guo@plg2.math.uwaterloo.ca>,\n   "Juanita Baker" <the00@plg2.math.uwaterloo.ca>,\n   "Lili" <adtrevors@plg2.math.uwaterloo.ca>,\n   "Carm

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Predict on the test set
test_data_folder = "data/test/"
test_filenames = os.listdir(test_data_folder)
test_file_paths = [os.path.join(test_data_folder, filename) for filename in test_filenames]

test_predictions = pipeline.predict(test_file_paths)

# Create DataFrame for test predictions
submission_df = pd.DataFrame({'email': test_filenames, 'label': test_predictions})

# Ensure all emails are included in the submission DataFrame with correct filenames
for filename in os.listdir("data/test/"):
    if filename not in submission_df['email'].values:
        submission_df = submission_df.append({'email': filename, 'label': 0}, ignore_index=True)

# Ensure the order of filenames and predictions is correct
submission_df.sort_values(by='email', inplace=True)

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)

# Verify the submission file
import os
import pandas as pd

sub_df = pd.read_csv('submission.csv')
assert sub_df.columns.tolist() == ["email", "label"]
assert sub_df["label"].dtype == int
assert len(sub_df) == len(os.listdir("data/test/"))
print("saved")


In [None]:
# Importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import re

# Load the data
train_data = pd.read_csv("train.csv")

# Display the first few rows of the dataset
train_data.head()
# Function to clean and preprocess text data
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Apply the clean_text function to the 'email' column
train_data['clean_email'] = train_data['email'].apply(clean_text)

# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_data['clean_email'], train_data['label'], test_size=0.2, random_state=42)

# Convert text data to numerical features using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
# Initialize and train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Make predictions on the validation set
val_predictions = model.predict(X_val_tfidf)

# Calculate F1 score
f1 = f1_score(y_val, val_predictions)

print("F1 Score:", f1)
# Load test data
test_data = pd.read_csv("test.csv")

# Apply preprocessing to test data
test_data['clean_email'] = test_data['email'].apply(clean_text)

# Convert text data to numerical features using TF-IDF
X_test_tfidf = tfidf_vectorizer.transform(test_data['clean_email'])

# Make predictions on test data
test_predictions = model.predict(X_test_tfidf)

# Create submission file
submission_df = pd.DataFrame({'email': test_data['email'], 'label': test_predictions})
submission_df.to_csv("submission.csv", index=False)


In [4]:
import os
import pandas as pd
import mailparser
from bs4 import BeautifulSoup

# Path to the train folder
train_folder = "data/train/"

# List to store email contents and labels
emails = []
labels = []

# Iterate through each email file in the train folder
for filename in os.listdir(train_folder):
    with open(os.path.join(train_folder, filename), "rb") as f:
        email_bytes = f.read()
        # Parse the email using mailparser
        parsed_email = mailparser.parse_from_bytes(email_bytes)
        # Extract email content
        content = parsed_email.body
        # If the email has HTML content, extract text from it
        if parsed_email.content_type == "text/html":
            soup = BeautifulSoup(content, "html.parser")
            content = soup.get_text()
        # Append email content to the list
        emails.append(content)
        # Extract label from the filename (spam or ham)
        label = 1 if "spam" in filename else 0
        labels.append(label)

# Create a DataFrame from the extracted data
train_data = pd.DataFrame({"email": emails, "label": labels})

# Display the first few rows of the DataFrame
train_data.head()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_data['email'], train_data['label'], test_size=0.2, random_state=42)

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the validation data
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Initialize and train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Make predictions on the validation set
val_predictions = model.predict(X_val_tfidf)

# Calculate F1 score
f1 = f1_score(y_val, val_predictions)
print("Validation F1 Score:", f1)

# Path to the test folder
test_folder = "data/test/"

# List to store email contents
test_emails = []

# Iterate through each email file in the test folder
for filename in os.listdir(test_folder):
    with open(os.path.join(test_folder, filename), "rb") as f:
        email_bytes = f.read()
        # Parse the email using mailparser
        parsed_email = mailparser.parse_from_bytes(email_bytes)
        # Extract email content
        content = parsed_email.body
        # If the email has HTML content, extract text from it
        if parsed_email.content_type == "text/html":
            soup = BeautifulSoup(content, "html.parser")
            content = soup.get_text()
        # Append email content to the list
        test_emails.append(content)

# Convert test data to TF-IDF features
X_test_tfidf = tfidf_vectorizer.transform(test_emails)

# Make predictions on test data
test_predictions = model.predict(X_test_tfidf)

# Create submission DataFrame
submission_df = pd.DataFrame({"email": os.listdir(test_folder), "label": test_predictions})

# Save submission file
submission_df.to_csv("submission.csv", index=False)



ModuleNotFoundError: No module named 'mailparser'