In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, accuracy_score
import matplotlib.pyplot as plt

In [None]:
# Location of the spam/ham corpus, relative to the notebook's working directory.
file_path = "../datasets/spam_ham_dataset.parquet"

# Load the whole parquet file into memory, then print the frame summary
# (column names, dtypes, non-null counts) as a quick sanity check.
df = pd.read_parquet(path=file_path)
df.info()

In [None]:
# Raw message text and its label column; both stay aligned through the
# shared DataFrame index.
X = df["text"]
y = df.loc[:, "label_num"]

# Split the *raw* documents before any feature extraction so the held-out
# messages cannot influence the TF-IDF vocabulary or IDF weights
# (fitting the vectorizer on the full corpus leaks test information).
text_train, text_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn vocabulary and IDF statistics from the training text only, then
# reuse that fitted transformer for the test text.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# C is the inverse regularization strength in scikit-learn, so C=1000
# applies only weak regularization; max_iter is raised to give the solver
# room to converge.
model = LogisticRegression(C=1000, max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Element-wise difference between true and predicted labels: 0 where the
# prediction is correct, non-zero where it is wrong (presumably +/-1,
# assuming label_num is coded 0/1 -- TODO confirm).
residuals = y_test - y_pred

In [None]:
# Confusion matrix of held-out labels vs. model predictions, rendered as a
# heat map and then echoed as a raw array.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["ham", "spam"],
)
disp.plot(cmap=plt.cm.Greens)
plt.show()

print(cm)

# Compute the four summary scores in a fixed order; each scorer takes
# (y_true, y_pred) and is called with its default settings.
precision, recall, f1, accuracy = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, f1_score, accuracy_score)
)

# Report every score with three decimals, one per line.
for metric_name, metric_value in (
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("Accuracy", accuracy),
):
    print(f"{metric_name}: {metric_value:.3f}")

In [None]:
# Two hand-written messages used as a quick smoke test of the fitted model.
spam = """
Get your free trial of our new app now!
"""

ham = """
  I'm going to the gym today. Best regards: Diego
"""

# Use dedicated names for the demo inputs and predictions so the held-out
# X_test / y_pred from the train/test split are not overwritten; otherwise
# re-running the evaluation cell would compare y_test against two samples.
X_samples = vectorizer.transform([spam, ham])
sample_pred = model.predict(X_samples)

# One row per message, in the order [spam, ham]; columns follow
# model.classes_. Left as the last expression so the notebook displays it.
p = model.predict_proba(X_samples)
p