In [1]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import pickle
from joblib import load
import myFunc

In [2]:
df = pd.read_csv('./data/testSampleChatgpt.csv', encoding='ISO-8859-1')
dfClean = pd.read_csv('./data/cleanedData.csv')

In [3]:
df = df.drop(columns=['Unnamed: 6'])
df = myFunc.cleanDataframe(df)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   sender            4 non-null      object
 1   receiver          4 non-null      object
 2   subject           4 non-null      object
 3   body              4 non-null      object
 4   label             4 non-null      int64 
 5   urls              4 non-null      int64 
 6   emailDomain       4 non-null      object
 7   generalConsumer   4 non-null      int64 
 8   govDomain         4 non-null      int64 
 9   eduDomain         4 non-null      int64 
 10  orgDomain         4 non-null      int64 
 11  netDomain         4 non-null      int64 
 12  otherDomain       4 non-null      int64 
 13  html              4 non-null      int64 
 14  fullContent       4 non-null      object
 15  punctuationCount  4 non-null      int64 
 16  subjectLength     4 non-null      int64 
 17  bodyLength        4 

  df['generalConsumer'] = df['emailDomain'].str.contains(publicEmailDomainRegex, regex=True, na=False).astype(int)
  df['govDomain'] = df['emailDomain'].str.contains(govRegex, regex=True, na=False).astype(int)
  df['eduDomain'] = df['emailDomain'].str.contains(eduRegex, regex=True, na=False).astype(int)
  df['orgDomain'] = df['emailDomain'].str.contains(orgRegex, regex=True, na=False).astype(int)
  df['netDomain'] = df['emailDomain'].str.contains(netRegex, regex=True, na=False).astype(int)


In [4]:
# Separate target(label) from predictor columns
y = df.label

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit_transform(dfClean['fullContent'])
tfidf_matrix = tfidf_vectorizer.transform(df['fullContent'])

# Continuous features normalization
scaler = StandardScaler()
contd = scaler.fit_transform(df[['punctuationCount', 'subjectLength', 'bodyLength', 'totalLength']])

# Sparse binary features
sparse_features = csr_matrix(df[["urls", "totalLength", "generalConsumer", "govDomain", "eduDomain", "orgDomain", "netDomain", "otherDomain", "html", "punctuationCount"]].values)

X = hstack([sparse_features, contd, tfidf_matrix])



In [5]:
with open('./model/MLPClassifier_ZiHin.pkl', 'rb') as file: 
    mlpC = pickle.load(file)
# xgbRandomSearch = load('./model/XGBoost_random_sebastian.joblib')
# xgb = XGBClassifier(random_search.best_params_)
print(mlpC)
# print(xgb)

MLPClassifier(early_stopping=True, hidden_layer_sizes=80, max_iter=50,
              random_state=1, verbose=True)


In [7]:
# Evaluate the model
mlpC_pred_prob = mlpC.predict_proba(X)
mlpC_pred = mlpC.predict(X)
print(mlpC_pred_prob)


[[0.31833736 0.68166264]
 [0.04527401 0.95472599]
 [0.85525244 0.14474756]
 [0.96362608 0.03637392]]


In [8]:
print(mlpC_pred)

[1 1 0 0]
