# Training

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(file_path)

# Read the three dataset splits from CSV files in the working directory.
train_data, val_data, test_data = (
    load_data(csv_name)
    for csv_name in ('train.csv', 'validation.csv', 'test.csv')
)

# Registry of candidate classifiers, keyed by the display name that is printed
# in the benchmark output and passed to fit_model as `model_type`.
# fit_model fits these estimator instances in place, so after a benchmark run
# the registry holds the trained models.
models = {
    'Naive Bayes': MultinomialNB(),
    # Fixed seed: the forest is stochastic, and without it the reported
    # metrics change on every Restart & Run All.
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVC': SVC(kernel='sigmoid', class_weight='balanced'),
}

def fit_model(train_data=train_data, model_type='naive_bayes'):
    """Fit one registered model on TF-IDF features of ``train_data``.

    A fresh ``TfidfVectorizer`` (English stop words removed) is fitted on
    ``train_data['text']`` and published as the module-level ``vectorizer``.
    It is not part of the return value, so callers score with that global.

    NOTE: every call rebinds ``vectorizer``. Models fitted by earlier calls
    stay consistent with it only while all calls use the same training text.

    Parameters
    ----------
    train_data : DataFrame-like
        Must provide ``'text'`` and ``'label'`` columns. Defaults to the
        module-level ``train_data`` as bound when this function was defined.
    model_type : str
        Name of an entry in the module-level ``models`` registry. Matching
        ignores case and treats underscores as spaces, so the default
        ``'naive_bayes'`` resolves to the ``'Naive Bayes'`` entry.

    Returns
    -------
    tuple
        ``(model, encoder)``: the fitted estimator (the same object stored in
        ``models``) and the ``LabelEncoder`` fitted on ``train_data['label']``.

    Raises
    ------
    KeyError
        If ``model_type`` does not match any registry entry.
    """
    global vectorizer

    def _normalize(name):
        # Canonical form used to compare requested names with registry keys.
        return str(name).replace('_', ' ').strip().lower()

    # Resolve the model before touching any global state, so a bad name does
    # not leave `vectorizer` rebound to a featurizer no model was trained on.
    if model_type in models:
        registry_key = model_type
    else:
        aliases = {_normalize(key): key for key in models}
        registry_key = aliases.get(_normalize(model_type))
    if registry_key is None:
        raise KeyError(
            f"Unknown model_type {model_type!r}; expected one of {list(models)}"
        )

    # Convert text to numerical features
    vectorizer = TfidfVectorizer(stop_words='english')
    X_train = vectorizer.fit_transform(train_data['text'])

    # Convert labels to numerical values
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(train_data['label'])

    # The estimator is fitted in place, so the registry entry is updated too.
    model = models[registry_key]
    model.fit(X_train, y_train)

    return model, encoder

def score_model(model, vectorizer, encoder, data, target_names=None):
    """Print accuracy and a per-class classification report for ``model``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    vectorizer : fitted text vectorizer exposing ``transform``; it must be the
        one the model was trained with.
    encoder : fitted ``LabelEncoder`` used to encode ``data['label']``.
    data : DataFrame-like with ``'text'`` and ``'label'`` columns.
    target_names : sequence of str, optional
        Display names for the encoder's classes, in encoded order. Defaults
        to ``['Ham', 'Spam']`` when the encoder has exactly two classes, and
        otherwise to the encoder's own class values as strings.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Convert text to numerical features
    X = vectorizer.transform(data['text'])
    y_true = encoder.transform(data['label'])
    y_pred = model.predict(X)

    accuracy = accuracy_score(y_true, y_pred)
    print("Accuracy:", accuracy)

    # Pass every encoded class id explicitly so the report keeps one row per
    # class (and the names stay aligned) even when a split lacks a class.
    class_ids = list(range(len(encoder.classes_)))

    if target_names is None:
        # Assumes the encoder's first class is the legitimate (ham) label and
        # the second is spam -- TODO confirm against the dataset's label values.
        default_names = ['Ham', 'Spam']
        if len(default_names) == len(class_ids):
            target_names = default_names
        else:
            target_names = encoder.classes_.astype(str)

    # Print classification report
    print("Classification Report:")
    print(classification_report(y_true, y_pred, labels=class_ids,
                                target_names=target_names))

def evaluate_model(model, vectorizer, encoder, train_data, val_data):
    """Print ``score_model`` results for the training split, then the validation split.

    Each split is introduced by its own heading line so the two reports can be
    told apart in the cell output.
    """
    labelled_splits = (
        ("Train Data:", train_data),
        ("Validation Data:", val_data),
    )
    for heading, split in labelled_splits:
        print(heading)
        score_model(model, vectorizer, encoder, split)


def validate_model(train_data, val_data, model_type='Naive Bayes'):
    """Fit one registered model and print its train/validation metrics.

    Parameters
    ----------
    train_data : DataFrame-like with ``'text'`` and ``'label'`` columns, used
        for fitting and for the training-split report.
    val_data : DataFrame-like with the same columns, used for the
        validation-split report.
    model_type : str
        Key of the module-level ``models`` registry. The default is
        ``'Naive Bayes'``, which is an actual registry key.

    Returns
    -------
    The fitted estimator.
    """
    # fit_model returns exactly (model, encoder); unpacking three values here
    # raised ValueError. The fitted vectorizer is published by fit_model as the
    # module-level `vectorizer`, which is read below.
    model, encoder = fit_model(train_data, model_type)
    evaluate_model(model, vectorizer, encoder, train_data, val_data)
    return model

def score_test_models(test_data, encoder):
    """Print test-set metrics for every model in the module-level ``models`` registry.

    Relies on module-level state: the estimators in ``models`` must already be
    fitted, and ``vectorizer`` must be the featurizer they were trained with.

    Parameters
    ----------
    test_data : DataFrame-like with ``'text'`` and ``'label'`` columns.
    encoder : fitted ``LabelEncoder`` used to encode ``test_data['label']``.
    """
    for name, model in models.items():
        print(f"\n{name}:")
        # score_model vectorizes the text and encodes the labels itself, so
        # nothing needs to be precomputed here.
        score_model(model, vectorizer, encoder, test_data)



def score_benchmark_models(val_data, encoder):
    """Fit every registered model in turn and print its train/validation metrics.

    Each model is fitted through ``fit_model`` on that function's default
    training data, and evaluated against the module-level ``train_data`` and
    the supplied ``val_data``.

    NOTE: the ``encoder`` argument is never read; the encoder returned by each
    ``fit_model`` call is used for scoring instead.
    """
    # Snapshot the names first: fit_model writes back into `models`.
    for name in list(models):
        print(f"\n{name}:")
        fitted_model, encoder = fit_model(model_type = name)
        print(fitted_model)
        # `vectorizer` is the global that fit_model has just rebound.
        evaluate_model(fitted_model, vectorizer, encoder, train_data, val_data)
    

# Module-level label encoder fitted on the training labels; later cells pass it
# to the scoring helpers.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_data['label'])

# Bag-of-words features for the training text. No model is fitted or evaluated
# here. NOTE: fit_model later rebinds the global `vectorizer` to a TF-IDF
# vectorizer.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data['text'])



In [12]:
# Fit every registered model and print its train/validation metrics.
# NOTE: the `encoder` passed here is rebound inside score_benchmark_models by
# each fit_model call.
score_benchmark_models(val_data, encoder)


Naive Bayes:
MultinomialNB()
Train Data:
Accuracy: 0.9388875031179845
Classification Report:
              precision    recall  f1-score   support

         Ham       0.93      1.00      0.96      3052
        Spam       1.00      0.75      0.85       957

    accuracy                           0.94      4009
   macro avg       0.96      0.87      0.91      4009
weighted avg       0.94      0.94      0.94      4009

Validation Data:
Accuracy: 0.9080325960419092
Classification Report:
              precision    recall  f1-score   support

         Ham       0.89      1.00      0.94       654
        Spam       1.00      0.61      0.76       205

    accuracy                           0.91       859
   macro avg       0.95      0.81      0.85       859
weighted avg       0.92      0.91      0.90       859


Random Forest:
RandomForestClassifier()
Train Data:
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

         Ham       1.00      1.00     

In [13]:
# Print test-split metrics for each model in `models`.
# Relies on the models fitted by the previous cell and on the module-level
# `vectorizer`.
score_test_models(test_data, encoder)


Naive Bayes:
Accuracy: 0.8883720930232558
Classification Report:
              precision    recall  f1-score   support

         Ham       0.87      1.00      0.93       654
        Spam       1.00      0.53      0.70       206

    accuracy                           0.89       860
   macro avg       0.94      0.77      0.81       860
weighted avg       0.90      0.89      0.88       860


Random Forest:
Accuracy: 0.9825581395348837
Classification Report:
              precision    recall  f1-score   support

         Ham       0.98      1.00      0.99       654
        Spam       0.98      0.94      0.96       206

    accuracy                           0.98       860
   macro avg       0.98      0.97      0.98       860
weighted avg       0.98      0.98      0.98       860


SVC:
Accuracy: 0.9918604651162791
Classification Report:
              precision    recall  f1-score   support

         Ham       1.00      0.99      0.99       654
        Spam       0.98      0.99      0.98  

SVC performs best in terms of accuracy, precision, and F1-score. 

The Random Forest model achieves a recall of 1.00 for Ham on the test set (i.e., essentially no Ham messages are misclassified as Spam), indicating that it reliably preserves legitimate messages. However, its recall for Spam is lower (0.94, versus 0.99 for SVC), meaning that more Spam messages are incorrectly classified as Ham. Naive Bayes shows the same trade-off far more strongly: its Ham recall is 1.00, but its Spam recall is only 0.53.

Therefore, if the priority is to prevent spam at all costs, the SVC model should be chosen. On the other hand, if the primary concern is to avoid misclassifying any legitimate message as spam, then the Random Forest model may be preferred.