# MLP

In [7]:
import os
import joblib
import json
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

from sklearn.neural_network import MLPClassifier

In [2]:
# All paths are resolved relative to the kernel's current working directory,
# so the notebook must be started from the project root for them to resolve.
base_dir = os.getcwd()
data_dir = os.path.join(base_dir, "data")
datafile = os.path.join(data_dir, "data_cleaned.csv")

In [4]:
# Load the cleaned corpus. No column is used as the index (pandas default), so
# the unnamed first column of the CSV shows up as "Unnamed: 0" in the preview.
# Only the 'label' and 'tokens' columns are used further down.
df = pd.read_csv(datafile)
df.head(5)

Unnamed: 0,label,tokens
0,0,"[""'hpl"", 'nom', 'may', 'see', 'attached', 'fil..."
1,0,"[""'nom"", 'actual', 'vols', 'th', 'forwarded', ..."
2,0,"[""'enron"", 'actuals', 'march', 'april', 'estim..."
3,0,"[""'hpl"", 'nom', 'may', 'see', 'attached', 'fil..."
4,0,"[""'hpl"", 'nom', 'june', 'see', 'attached', 'fi..."


In [5]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=33)

# Features are the token column, targets are the label column.
x_train, y_train = train_df['tokens'], train_df['label']
x_test, y_test = test_df['tokens'], test_df['label']

## Train and grid-search MLP parameters using BOW

In [6]:
# Bag-of-words counts: lowercased, English stop words removed, vocabulary
# capped at the 100,000 most frequent terms.
vectorizer = CountVectorizer(
    lowercase=True,
    stop_words='english',
    max_features=100000,
)

# The vocabulary is learned from the training split only; the test split is
# merely projected onto it.
x_train_bow = vectorizer.fit_transform(x_train)
x_test_bow = vectorizer.transform(x_test)

In [7]:
# Search space for the MLP: 3 architectures x 2 activations x 2 solvers
# x 1 learning-rate schedule = 12 candidates.
# Only the 'constant' schedule is searched; 'adaptive' was left out of the grid.
param_grid = dict(
    hidden_layer_sizes=[(128,), (128, 64), (256, 128)],
    activation=['relu', 'tanh'],
    solver=['adam', 'sgd'],
    learning_rate=['constant'],
)

In [8]:
# Exhaustive 5-fold cross-validated search over `param_grid`, scored by accuracy.
# WARNING: this cell is very expensive. The recorded log below shows single
# fits taking roughly 30-70 minutes each, and there are 60 fits in total.
mlp = MLPClassifier(max_iter=200, random_state=17)
grid = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,   # run the fits in parallel
    verbose=2,   # log one line per finished fit
)
grid.fit(x_train_bow, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END activation=relu, hidden_layer_sizes=(128,), learning_rate=constant, solver=adam; total time=39.6min
[CV] END activation=relu, hidden_layer_sizes=(128,), learning_rate=constant, solver=adam; total time=43.6min
[CV] END activation=relu, hidden_layer_sizes=(128,), learning_rate=constant, solver=adam; total time=47.9min
[CV] END activation=relu, hidden_layer_sizes=(128,), learning_rate=constant, solver=adam; total time=58.5min
[CV] END activation=relu, hidden_layer_sizes=(128,), learning_rate=constant, solver=adam; total time=68.2min
[CV] END activation=relu, hidden_layer_sizes=(128, 64), learning_rate=constant, solver=adam; total time=33.4min
[CV] END activation=relu, hidden_layer_sizes=(128, 64), learning_rate=constant, solver=adam; total time=34.7min
[CV] END activation=relu, hidden_layer_sizes=(128, 64), learning_rate=constant, solver=adam; total time=34.6min
[CV] END activation=relu, hidden_layer_sizes=(128, 64), le

In [9]:
# Summary of the grid search.
print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation Accuracy:", grid.best_score_)

# Score the best estimator found by the search on the held-out BOW test
# features and print the per-class report.
best_mlp = grid.best_estimator_
y_pred = best_mlp.predict(x_test_bow)
report = classification_report(y_test, y_pred)
print(report)

Best Parameters: {'activation': 'relu', 'hidden_layer_sizes': (256, 128), 'learning_rate': 'constant', 'solver': 'adam'}
Best Cross-Validation Accuracy: 0.9880671977831659
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     11951
           1       0.99      0.99      0.99     12795

    accuracy                           0.99     24746
   macro avg       0.99      0.99      0.99     24746
weighted avg       0.99      0.99      0.99     24746



### Save the model and best params

In [10]:
# Persist the best BOW model and the hyper-parameters that produced it.
# The output directory is created first: without it, joblib.dump/open raise
# FileNotFoundError on a fresh checkout and the (very long) grid search
# result would be lost.
models_dir = os.path.join(base_dir, 'models')
os.makedirs(models_dir, exist_ok=True)

# NOTE(review): the fitted `vectorizer` is not saved here, so the dumped model
# cannot be applied to new text without re-fitting it - confirm whether it
# should be persisted as well.
joblib.dump(best_mlp, os.path.join(models_dir, 'MLP.joblib'))

# The best parameters are stored as JSON so the TF-IDF section can reuse them
# without re-running the search. JSON stores tuples as arrays.
with open(os.path.join(models_dir, 'MLP.json'), 'w') as f:
    json.dump(grid.best_params_, f)

## Train an MLP using the best parameters on TF-IDF

In [8]:
# TF-IDF weighted unigrams with the same preprocessing and vocabulary cap
# as the bag-of-words features.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    lowercase=True,
    stop_words='english',
    max_features=100000,
)

# Fit on the training split only, then apply the fitted transform to the test split.
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [9]:
# load best params from file
with open(os.path.join(base_dir, "models", "MLP.json"), "r") as f:
    best_params = json.load(f)

In [10]:
# Train a single MLP on the TF-IDF features, reusing the hyper-parameters
# selected on the BOW features (no new search is run here). Iteration cap and
# seed match the grid-search estimator.
mlp = MLPClassifier(max_iter=200, random_state=17, **best_params).fit(
    x_train_tfidf, y_train
)

# Evaluate on the held-out TF-IDF test features.
y_pred = mlp.predict(x_test_tfidf)
tfidf_report = classification_report(y_test, y_pred)
print(tfidf_report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     11951
           1       0.99      0.99      0.99     12795

    accuracy                           0.99     24746
   macro avg       0.99      0.99      0.99     24746
weighted avg       0.99      0.99      0.99     24746



In [13]:
# Overall accuracy on the test split, at full precision (the classification
# report above rounds to two decimals).
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.9867049220075972


In [12]:
# Persist the TF-IDF-trained MLP alongside the BOW model.
# joblib.dump returns the list of written file names; as the last expression
# of the cell that list would be echoed into the notebook output and embed an
# absolute, machine-specific path. The trailing semicolon suppresses the echo.
# NOTE(review): the fitted `tfidf_vectorizer` is not saved here - confirm
# whether it should be persisted too, since the model needs it for new text.
joblib.dump(mlp, os.path.join(base_dir, 'models', 'MLP_TFIDF.joblib'));

['/Users/aki/Master/Advanced Machine Learning/Project/aml_spam/models/MLP_TFIDF.joblib']