In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Function to fit a model on train data
def fit_model(train_data, model, target='spam'):
    """Fit ``model`` on a labelled DataFrame and return it.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows. Must contain the label column named by ``target``;
        every other column is handed to the model as a feature.
    model : estimator
        Any object exposing ``fit(X, y)``. ``fit`` is called on this very
        object, and the same object is returned.
    target : str, default 'spam'
        Name of the label column. The default keeps existing two-argument
        calls working unchanged.

    Returns
    -------
    estimator
        The ``model`` that was passed in, after ``fit`` has been called.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``train_data``.
    """
    labels = train_data[target]
    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    features = train_data.drop(columns=target)

    model.fit(features, labels)
    return model

In [3]:
# Function to score a model on given data
def score_model(data, model, target='spam'):
    """Return the accuracy of ``model``'s predictions on ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to score. Must contain the label column named by ``target``;
        every other column is passed to ``model.predict`` as a feature.
    model : estimator
        An already fitted object exposing ``predict(X)``.
    target : str, default 'spam'
        Name of the label column. The default keeps existing two-argument
        calls working unchanged.

    Returns
    -------
    The value of ``accuracy_score(true_labels, predictions)``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    true_labels = data[target]
    features = data.drop(columns=target)

    return accuracy_score(true_labels, model.predict(features))

In [4]:
# Function to evaluate the model predictions
def evaluate_model(data, model, target='spam'):
    """Build a classification report for ``model``'s predictions on ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate. Must contain the label column named by ``target``;
        every other column is passed to ``model.predict`` as a feature.
    model : estimator
        An already fitted object exposing ``predict(X)``.
    target : str, default 'spam'
        Name of the label column. The default keeps existing two-argument
        calls working unchanged.

    Returns
    -------
    The value of ``classification_report(true_labels, predictions)``, called
    with its default options.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    true_labels = data[target]
    features = data.drop(columns=target)

    return classification_report(true_labels, model.predict(features))

In [5]:
# Function to validate the model
def validate_model(validation_data, model):
    """Return the classification report of ``model`` on ``validation_data``.

    This simply forwards both arguments to ``evaluate_model`` and hands
    back whatever it returns.
    """
    report = evaluate_model(validation_data, model)
    return report

In [6]:
# Read the train, validation and test CSV files into DataFrames (in that order)
train_set, validation_set, test_set = [
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/validation.csv', 'data/test.csv')
]

# Fitting and scoring the models

## Support Vector Machine (SVM) model

In [17]:
# Train a support vector classifier (default settings) on the training split
svc_model = fit_model(train_data=train_set, model=SVC())

In [18]:
# SVC on the training split: overall accuracy first ...
svc_accuracy = score_model(data=train_set, model=svc_model)
print(f"SVM accuracy on train data: {svc_accuracy}")
# ... then the per-class precision / recall / f1 report
print(evaluate_model(data=train_set, model=svc_model))

SVM accuracy on train data: 0.9921431689218682
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3504
           1       0.99      0.97      0.98      1078

    accuracy                           0.99      4582
   macro avg       0.99      0.99      0.99      4582
weighted avg       0.99      0.99      0.99      4582


In [19]:
# SVC on the test split: overall accuracy first ...
svc_accuracy = score_model(data=test_set, model=svc_model)
print(f"SVM accuracy on test data: {svc_accuracy}")
# ... then the per-class precision / recall / f1 report
print(evaluate_model(data=test_set, model=svc_model))

SVM accuracy on test data: 0.9738219895287958
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       425
           1       0.99      0.91      0.95       148

    accuracy                           0.97       573
   macro avg       0.98      0.95      0.96       573
weighted avg       0.97      0.97      0.97       573


In [20]:
# SVC on the validation split: overall accuracy first ...
svc_accuracy = score_model(data=validation_set, model=svc_model)
print(f"SVM accuracy on validation data: {svc_accuracy}")
# ... then the per-class precision / recall / f1 report
print(evaluate_model(data=validation_set, model=svc_model))

SVM accuracy on validation data: 0.9720767888307156
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       431
           1       0.99      0.89      0.94       142

    accuracy                           0.97       573
   macro avg       0.98      0.95      0.96       573
weighted avg       0.97      0.97      0.97       573


## Naive Bayes model

In [21]:
# Train a multinomial Naive Bayes classifier (default settings) on the training split
naive_bayes_model = fit_model(train_data=train_set, model=MultinomialNB())

In [22]:
# Naive Bayes on the training split: overall accuracy first ...
naive_bayes_accuracy = score_model(data=train_set, model=naive_bayes_model)
print(f"Naive Bayes accuracy on train data: {naive_bayes_accuracy}")
# ... then the per-class precision / recall / f1 report
print(evaluate_model(data=train_set, model=naive_bayes_model))

Naive Bayes accuracy on train data: 0.9965080750763858
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3504
           1       0.99      1.00      0.99      1078

    accuracy                           1.00      4582
   macro avg       0.99      1.00      1.00      4582
weighted avg       1.00      1.00      1.00      4582


In [23]:
# Naive Bayes on the test split: overall accuracy first ...
naive_bayes_accuracy = score_model(data=test_set, model=naive_bayes_model)
print(f"Naive Bayes accuracy on test data: {naive_bayes_accuracy}")
# ... then the per-class precision / recall / f1 report
print(evaluate_model(data=test_set, model=naive_bayes_model))

Naive Bayes accuracy on test data: 0.9895287958115183
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       425
           1       0.97      0.99      0.98       148

    accuracy                           0.99       573
   macro avg       0.98      0.99      0.99       573
weighted avg       0.99      0.99      0.99       573


In [24]:
# Naive Bayes on the validation split: overall accuracy first ...
naive_bayes_accuracy = score_model(data=validation_set, model=naive_bayes_model)
print(f"Naive Bayes accuracy on validation data: {naive_bayes_accuracy}")
# ... then the per-class precision / recall / f1 report
print(evaluate_model(data=validation_set, model=naive_bayes_model))

Naive Bayes accuracy on validation data: 0.9930191972076788
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       431
           1       0.97      1.00      0.99       142

    accuracy                           0.99       573
   macro avg       0.99      1.00      0.99       573
weighted avg       0.99      0.99      0.99       573


## Random Forest model

In [25]:
# Fitting the RandomForest model on train data.
# random_state is pinned because a random forest draws random bootstrap samples
# and feature subsets; without a seed every Restart & Run All can produce a
# slightly different forest and therefore slightly different scores.
# NOTE(review): outputs recorded from an earlier unseeded run may differ
# marginally from a fresh seeded run - re-execute the scoring cells.
random_forest_model = fit_model(train_set, RandomForestClassifier(random_state=42))

In [26]:
# Random Forest on the training split: overall accuracy first ...
random_forest_accuracy = score_model(data=train_set, model=random_forest_model)
print(f"Random Forest accuracy on train data: {random_forest_accuracy}")
# ... then the per-class precision / recall / f1 report
print(evaluate_model(data=train_set, model=random_forest_model))

Random Forest accuracy on train data: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3504
           1       1.00      1.00      1.00      1078

    accuracy                           1.00      4582
   macro avg       1.00      1.00      1.00      4582
weighted avg       1.00      1.00      1.00      4582


In [27]:
# Random Forest on the test split: overall accuracy first ...
random_forest_accuracy = score_model(data=test_set, model=random_forest_model)
print(f"Random Forest accuracy on test data: {random_forest_accuracy}")
# ... then the per-class precision / recall / f1 report
print(evaluate_model(data=test_set, model=random_forest_model))

Random Forest accuracy on test data: 0.9738219895287958
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       425
           1       1.00      0.90      0.95       148

    accuracy                           0.97       573
   macro avg       0.98      0.95      0.96       573
weighted avg       0.97      0.97      0.97       573


In [28]:
# Random Forest on the validation split: overall accuracy first ...
random_forest_accuracy = score_model(data=validation_set, model=random_forest_model)
print(f"Random Forest accuracy on validation data: {random_forest_accuracy}")
# ... then the per-class precision / recall / f1 report
print(evaluate_model(data=validation_set, model=random_forest_model))

Random Forest accuracy on validation data: 0.9755671902268761
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       431
           1       1.00      0.90      0.95       142

    accuracy                           0.98       573
   macro avg       0.98      0.95      0.97       573
weighted avg       0.98      0.98      0.98       573
