## Installations

In [None]:
!pip install pandas
!pip install scikit-plot

## Import Libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import SVC
import scikitplot
from imblearn.over_sampling import SMOTE

## Read & Split the CSV File

In [None]:
# Load the dataset into a DataFrame. The path below is a placeholder: replace it
# with the location of your CSV file before running.
email_texts = pd.read_csv(filepath_or_buffer="enter your csv")

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed for repeatability.
# Stratifying on CATEGORY keeps the class proportions the same in both splits,
# which matters because the classes are treated as imbalanced later on (SMOTE).
train_email_texts, test_email_texts = train_test_split(
    email_texts,
    test_size=0.3,
    random_state=42,
    stratify=email_texts["CATEGORY"],
)

# Write both splits to disk without the DataFrame index column.
train_email_texts.to_csv('train.csv', index=False)
test_email_texts.to_csv("test.csv", index=False)

## Classification Model Training

In [None]:
# Model input: the MESSAGE column of the training and the testing split.
X_train_text, X_test_text = train_email_texts["MESSAGE"], test_email_texts["MESSAGE"]

# Prediction target: the CATEGORY column of the training and the testing split.
Y_train, Y_test = train_email_texts["CATEGORY"], test_email_texts["CATEGORY"]

## CountVectorizer

In [None]:
# Bag-of-words counts: English stop words are removed and terms occurring in
# fewer than 10 training documents are dropped from the vocabulary.
vectorizer_v1 = CountVectorizer(stop_words='english', min_df=10)

# Learn the vocabulary and build the training count matrix in one step.
X_train_v1 = vectorizer_v1.fit_transform(X_train_text)

# Dense copy of the count matrix with one named column per vocabulary term.
X_train_v1_dense = pd.DataFrame(
    data=X_train_v1.toarray(),
    columns=vectorizer_v1.get_feature_names_out(),
)

In [None]:
# Registry of [fitted_model, display_name] pairs; each model cell appends to it.
models_list = []

## Naive Bayes Model

In [None]:
# Multinomial Naive Bayes classifier fitted on the original (non-resampled)
# training counts; fit() returns the estimator itself.
nb_model = MultinomialNB().fit(X_train_v1_dense, Y_train)

# Register the fitted model under its display name.
models_list += [[nb_model, "nb_model"]]

## Random Forest Model

In [None]:
# Random forest fitted on the original (non-resampled) training counts.
# The forest is randomized, so it is seeded (same seed as the split and SMOTE)
# to make the reported scores repeatable across runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_v1_dense, Y_train)

# Register the fitted model under its display name.
models_list.append([rf_model, "rf_model"])

## SMOTE Oversampling

In [None]:
# Resample the dense training features and their labels with SMOTE; the sampler
# is seeded so the resampled set is repeatable.
sm = SMOTE(random_state=42)
X_train_v1_dense_sm, Y_train_sm = sm.fit_resample(
    X=X_train_v1_dense,
    y=Y_train,
)

## Naive Bayes Model with SMOTE oversampling

In [None]:
# Multinomial Naive Bayes classifier fitted on the SMOTE-resampled training
# data; fit() returns the estimator itself.
nb_smote_model = MultinomialNB().fit(X_train_v1_dense_sm, Y_train_sm)

# Register the fitted model under its display name.
models_list += [[nb_smote_model, "nb_smote_model"]]

## Random Forest Model with SMOTE oversampling

In [None]:
# Random forest fitted on the SMOTE-resampled training data.
# The forest is randomized, so it is seeded (same seed as the split and SMOTE)
# to make the reported scores repeatable across runs.
rf_smote_model = RandomForestClassifier(random_state=42)
rf_smote_model.fit(X_train_v1_dense_sm, Y_train_sm)

# Register the fitted model under its display name.
models_list.append([rf_smote_model, "rf_smote_model"])

## Predictions

In [None]:
# Display names for the confusion matrices; the evaluation loop looks them up by
# CATEGORY value, so position 0 labels CATEGORY 0 and position 1 labels CATEGORY 1.
# NOTE(review): confirm this order matches how CATEGORY is coded in the CSV.
class_names = ["spam", "non spam"]

In [None]:
# Vectorize the test messages once: the transform is identical for every model.
# The result is wrapped in a DataFrame carrying the vocabulary as column names,
# because the models were fitted on a DataFrame with those names; predicting on
# an unnamed matrix makes scikit-learn emit an
# "X does not have valid feature names" warning for each model.
X_test_v1_dense = pd.DataFrame(
    vectorizer_v1.transform(X_test_text).toarray(),
    columns=vectorizer_v1.get_feature_names_out(),
)

# The readable ground-truth labels do not depend on the model, so build them once.
# Assumes CATEGORY values are integers usable as indices into class_names.
actual_labels = [class_names[i] for i in Y_test]

print("Accuracy Scores")
for model, model_name in models_list:
    # Score the fitted model on the held-out test set.
    predictions_v1 = model.predict(X_test_v1_dense)
    acc = accuracy_score(Y_test, predictions_v1)
    print(model_name, "->", acc)

    # One confusion matrix figure per model, labelled with readable class names.
    scikitplot.metrics.plot_confusion_matrix(
        actual_labels,  # actual labels
        [class_names[i] for i in predictions_v1],  # predicted labels
        title="Confusion Matrix " + model_name,  # title to use
        cmap="Purples",  # color palette to use
        figsize=(5, 5),  # figure size
    )