In [1]:
import pandas as pd
import numpy as np
import gensim.downloader as api
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Download the NLTK tokenizer data and stop-word corpus that the
# preprocessing step below relies on (a no-op when already present).
nltk.download('punkt')
nltk.download('stopwords')

# Keep only the two useful columns of the dataset, give them descriptive
# names, and encode the textual label as an integer (spam -> 1, ham -> 0).
df = (
    pd.read_csv("spam.csv", encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
)
df['Label'] = df['Label'].map({'spam': 1, 'ham': 0})

# A set gives constant-time membership checks while filtering tokens.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Tokenize *text* and return its cleaned, lowercased word list.

    The text is lowercased and split with NLTK's ``word_tokenize``. A token
    is kept only if it consists purely of alphabetic characters and is not
    in the module-level ``stop_words`` set.

    Args:
        text: The raw message string.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Drop numbers, punctuation and mixed tokens such as "4u".
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Fetch the pretrained "word2vec-google-news-300" embeddings through gensim's
# downloader API. The notice is printed first because the load can be slow.
print("Loading Word2Vec model (this may take time)...")
w2v_model = api.load("word2vec-google-news-300")

def vectorize_message(message, model):
    """Embed *message* as the average of its known word vectors.

    The message is cleaned with ``preprocess``; every resulting token that
    is present in *model* contributes its vector to an element-wise mean.

    Args:
        message: The raw message string.
        model: Embedding lookup that supports ``token in model``,
            ``model[token]`` and a ``vector_size`` attribute (in this
            script, the object returned by gensim's ``api.load``).

    Returns:
        The element-wise mean of the found word vectors, or an all-zero
        vector of length ``model.vector_size`` when no token is found.
    """
    found = []
    for token in preprocess(message):
        if token in model:
            found.append(model[token])

    # Fallback for messages with no in-vocabulary tokens.
    if not found:
        return np.zeros(model.vector_size)
    return np.mean(found, axis=0)

# Feature matrix: one averaged embedding per message, in DataFrame order.
X_vectors = np.array(
    [vectorize_message(text, w2v_model) for text in df['Message']]
)
# Target vector: the integer-encoded labels (1 = spam, 0 = ham).
y = df['Label'].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectors,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression classifier, allowing up to 1000 solver iterations.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Report accuracy on the held-out rows.
y_pred = clf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

def predict_message_class(model, w2v_model, message):
    """Classify a single message as ``"spam"`` or ``"ham"``.

    Args:
        model: A fitted classifier exposing ``predict``.
        w2v_model: Word-embedding model passed on to ``vectorize_message``.
        message: The raw message string to classify.

    Returns:
        ``"spam"`` when the classifier predicts label 1, otherwise ``"ham"``.
    """
    # predict() expects a 2-D batch, so add a leading axis of length one.
    features = vectorize_message(message, w2v_model)[np.newaxis, :]
    label = model.predict(features)[0]
    if label == 1:
        return "spam"
    return "ham"

# Example: classify one hand-written message with the classifier trained
# above and print the resulting label ("spam" or "ham").
sample = "You won a free ticket! Reply now!"
print("Predicted Class:", predict_message_class(clf, w2v_model, sample))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading Word2Vec model (this may take time)...
Test Accuracy: 0.9417040358744395
Predicted Class: spam
