# 4. Classifying Emails as Spam Using Decision Trees

Importing the libraries

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

Loading the dataset

In [4]:
data = pd.read_csv('./datasets/email.csv')

In [5]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Data Preprocessing

Encode target variable

In [7]:
label_encoder = LabelEncoder()
data['Category'] = label_encoder.fit_transform(data['Category'])

Transform the text data to TF-IDF features

In [8]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['Message'])
y = data['Category']

Split the dataset into training and testing sets

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision tree classifier

In [10]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Evaluate the model

Predict on the test set

In [11]:
y_pred = clf.predict(X_test)

Calculate evaluation metrics

In [12]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Precision: 0.94
Recall: 0.87
F1 Score: 0.90


Classification report

In [15]:
print(classification_report(y_test, y_pred, target_names=['ham', 'spam']))

              precision    recall  f1-score   support

         ham       0.98      0.99      0.98       958
        spam       0.94      0.87      0.90       157

    accuracy                           0.97      1115
   macro avg       0.96      0.93      0.94      1115
weighted avg       0.97      0.97      0.97      1115

