In [51]:
import pandas as pd

# Read the labelled emails, drop rows whose "email" text repeats an earlier
# row, and renumber the index from zero so positions are contiguous.
df = (
    pd.read_csv("/kaggle/input/multiclass-email/final_combined.csv")
    .drop_duplicates(subset=["email"])
    .reset_index(drop=True)
)
df

Unnamed: 0,email,category
0,Congratulations! You've won a $1000 Walmart gi...,spam
1,Join us for a special event this weekend!,social
2,Limited time offer: 50% off your next purchase!,promotional
3,"Hey, just checking in to see how you're doing.",personal
4,Your bank statement is ready for review.,finance
...,...,...
2998,Please confirm your email address to continue ...,important
2999,Your scheduled payment is due in 2 days. Check...,important
3000,Your account requires immediate attention. Ple...,important
3001,Your recent purchase receipt is available. Che...,finance


In [52]:
def remove_subject_prefix(text):
    """Strip a leading "Subject:" label from an email string.

    The label is matched case-insensitively and may be preceded by
    whitespace, so inputs such as "  subject: hi" are cleaned even when the
    text has not been trimmed beforehand.

    Parameters
    ----------
    text : str
        Raw email text.

    Returns
    -------
    str
        The text after the label, with whitespace that followed the label
        removed. If no label is found, ``text`` is returned unchanged
        (including any surrounding whitespace).
    """
    prefix = "subject:"
    # Ignore leading whitespace only for the purpose of detecting the label;
    # text without a label must come back byte-identical.
    candidate = text.lstrip()
    if candidate[:len(prefix)].lower() == prefix:
        return candidate[len(prefix):].lstrip()
    return text

# Run the prefix remover over every email and write the result back.
emails_without_prefix = df["email"].apply(remove_subject_prefix)
df["email"] = emails_without_prefix

In [53]:
# Trim surrounding whitespace from both text columns using the vectorized
# string accessor. Unlike a per-element lambda calling x.strip(), missing
# values are passed through as NaN instead of raising AttributeError.
df["email"] = df["email"].str.strip()
df["category"] = df["category"].str.strip()

# Show the distinct labels so stray variants (e.g. differing only by
# whitespace before the trim) are easy to spot.
df["category"].unique()


array(['spam', 'social', 'promotional', 'personal', 'finance',
       'important'], dtype=object)

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Labels
y = df['category']

# Split the raw text BEFORE vectorizing. Fitting the vectorizer on the full
# corpus would build its vocabulary from test emails as well, leaking
# information from the held-out set into preprocessing. The same test_size
# and random_state as before are kept.
emails_train, emails_test, y_train, y_test = train_test_split(
    df['email'], y, test_size=0.2, random_state=75
)

# Learn the vocabulary from the training emails only, then reuse it to
# encode the test emails (words unseen in training are ignored).
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(emails_train)
X_test = vectorizer.transform(emails_test)

# Token-count matrix for the whole corpus, encoded with the training
# vocabulary; kept so any cell that references X continues to work.
X = vectorizer.transform(df['email'])


In [55]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree Classifier.
# A fixed random_state makes the fitted tree, and therefore the reported
# accuracy, reproducible across kernel restarts; without it scikit-learn
# may pick differently among equally good splits from run to run.
model = DecisionTreeClassifier(random_state=75)

# Train the model on the training split
model.fit(X_train, y_train)


In [56]:
from sklearn.metrics import accuracy_score, classification_report

# Score the fitted model on the data it was trained on ...
train_y_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_y_pred)

# ... and on the held-out test split, including a per-class breakdown.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print both accuracies together so the train/test gap is visible at a glance.
print(f'Train Accuracy: {train_accuracy:.2f}')
print(f'Test Accuracy: {test_accuracy:.2f}')
print('Classification Report:', report, sep='\n')


Train Accuracy: 1.00
Test Accuracy: 0.79
Classification Report:
              precision    recall  f1-score   support

     finance       0.85      0.79      0.82       105
   important       0.79      0.72      0.75       107
    personal       0.66      0.83      0.74        72
 promotional       0.79      0.90      0.84       120
      social       0.82      0.77      0.80       102
        spam       0.84      0.73      0.78        95

    accuracy                           0.79       601
   macro avg       0.79      0.79      0.79       601
weighted avg       0.80      0.79      0.79       601



In [57]:
# A few hand-written messages to try the classifier on.
new_texts = [
    "You have won a free gift card. Click here to claim!",
    "I am stuck in Africa and I need your help.",
    "Kindly login and reclaim your account.",
]

# Encode with the already-fitted vectorizer (transform only, no refit),
# then ask the model for a label per message.
new_X = vectorizer.transform(new_texts)
predictions = model.predict(new_X)

# Print each message next to its predicted label.
for text, prediction in zip(new_texts, predictions):
    print(f'Text: "{text}" => Predicted Class: {prediction}')


Text: "You have won a free gift card. Click here to claim!" => Predicted Class: spam
Text: "I am stuck in Africa and I need your help." => Predicted Class: personal
Text: "Kindly login and reclaim your account." => Predicted Class: important
