In [26]:
!pip install gensim



Dataset: https://archive.ics.uci.edu/dataset/228/sms+spam+collection

In [27]:
from google.colab import drive
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from gensim.models import Word2Vec
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "SVM": LinearSVC(),
}

models_w2v = {
    "Logistic Regression": LogisticRegression(),
    "SVM": LinearSVC()
}

results = []

In [29]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [30]:
df = pd.read_csv("/content/gdrive/MyDrive/Colab Notebooks/sms/SMSSpamCollection",
                 sep="\t", header=None, names=["label", "message"])

print(df.head())
print(df["label"].value_counts())


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
label
ham     4825
spam     747
Name: count, dtype: int64


In [31]:
stop_words = set(stopwords.words("english"))

In [32]:
def preprocess_text(text):
  text = text.lower()
  for punctuation in string.punctuation:
    text = text.replace(punctuation, '')
  tokens = [word for word in text.split() if word not in stop_words]
  return " ".join(tokens)

In [33]:
df["tokens"] = df["message"].apply(preprocess_text)
df.head()

Unnamed: 0,label,message,tokens
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


# TF-IDF models

In [34]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["tokens"])
y = df["label"].map({"ham": 0, "spam": 1})

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
for name, clf in models.items():
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)

  results.append(
      {
          "technique": "TF-IDF",
          "model": name,
          "accuracy": accuracy_score(y_test, y_pred),
          "Precision": precision_score(y_test, y_pred),
          "Recall": recall_score(y_test, y_pred),
          "F1-score": f1_score(y_test, y_pred)
      }
  )

In [37]:
results_df = pd.DataFrame(results)
print(results_df)

  technique                model  accuracy  Precision    Recall  F1-score
0    TF-IDF          Naive Bayes  0.970404   1.000000  0.778523  0.875472
1    TF-IDF  Logistic Regression  0.954260   0.971154  0.677852  0.798419
2    TF-IDF                  SVM  0.982960   0.992424  0.879195  0.932384


# Word2Vec models

In [39]:
w2v = Word2Vec(
    sentences=df['tokens'],
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    sg = 1
)



In [40]:
def get_text_vector(tokens, model):
  vectors = [model.wv[word] for word in tokens if word in model.wv]
  return np.mean(vectors, axis = 0) if vectors else np.zeros(model.vector_size)

In [41]:
df["vector"] = df["tokens"].apply(lambda x: get_text_vector(x, w2v))

In [42]:
X = np.vstack(df["vector"])
y = df["label"].map({"ham": 0, "spam": 1})

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
for name, clf in models_w2v.items():
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)

  results.append(
      {
          "technique": "Word2Vec",
          "model": name,
          "accuracy": accuracy_score(y_test, y_pred),
          "Precision": precision_score(y_test, y_pred),
          "Recall": recall_score(y_test, y_pred),
          "F1-score": f1_score(y_test, y_pred)
      }
  )

In [45]:
results_df = pd.DataFrame(results)
print(results_df)

  technique                model  accuracy  Precision    Recall  F1-score
0    TF-IDF          Naive Bayes  0.970404   1.000000  0.778523  0.875472
1    TF-IDF  Logistic Regression  0.954260   0.971154  0.677852  0.798419
2    TF-IDF                  SVM  0.982960   0.992424  0.879195  0.932384
3  Word2Vec  Logistic Regression  0.956951   0.954955  0.711409  0.815385
4  Word2Vec                  SVM  0.962332   0.921260  0.785235  0.847826
