# Step 1: Import Required Libraries

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import gensim.downloader as api
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import re

# Fetch the NLTK data packages this notebook needs: the tokenizer data
# ('punkt' / 'punkt_tab') and the stop-word lists ('stopwords').
# quiet=True keeps the downloader's progress log (which includes the local
# nltk_data path) out of the saved cell output, and looping avoids the stray
# `True` repr that a bare trailing nltk.download(...) call displays.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource, quiet=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aratt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Aratt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aratt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Step 2: Load and Prepare the Dataset

In [13]:
# Load the dataset, reading only the two columns that are used:
# 'v1' (the ham/spam label) and 'v2' (the message text).
raw_df = pd.read_csv('spam.csv', encoding='latin-1', usecols=['v1', 'v2'])

# Fix the column order and rename for clarity. rename() returns a new frame,
# so the column assignment below does not write into a slice of raw_df.
df = raw_df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

# Convert labels to binary (spam=1, ham=0)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Series.map yields NaN for any value missing from the mapping; stop here with
# a clear message instead of letting NaN labels reach the classifier.
unmapped_count = int(df['label'].isna().sum())
if unmapped_count:
    raise ValueError(
        f"{unmapped_count} row(s) in spam.csv have a label other than 'ham' or 'spam'"
    )

# Display first few rows
print(df.head())

   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


# Step 3: Text Preprocessing

In [15]:
def _english_stop_words():
    """Return the English stop-word set, building it on first use only."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """Turn a raw message into a list of lowercase, stop-word-free tokens.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is deleted (not replaced by a space, so "don't" becomes
    "dont"), the result is tokenized with ``word_tokenize``, and English
    stop words are filtered out.

    Args:
        text: The message text. Any non-string value (for example ``None`` or
            a NaN from a missing CSV cell) is treated as an empty message.

    Returns:
        list: The remaining tokens, in their original order. Empty when the
        input is not a string or nothing survives the filtering.
    """
    # Missing values have no .lower(); treat them as "no tokens".
    if not isinstance(text, str):
        return []

    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords; the set is cached so it is not rebuilt for every message
    stop_words = _english_stop_words()
    return [word for word in tokens if word not in stop_words]

# Tokenize every message and keep the token lists in a new 'processed' column
processed_tokens = df['message'].apply(preprocess_text)
df['processed'] = processed_tokens

# Show the raw text next to its token list for the first few rows
preview = df[['message', 'processed']].head()
print(preview)

                                             message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                           processed  
0  [go, jurong, point, crazy, available, bugis, n...  
1                     [ok, lar, joking, wif, u, oni]  
2  [free, entry, wkly, comp, win, fa, cup, final,...  
3      [u, dun, say, early, hor, u, c, already, say]  
4  [nah, dont, think, goes, usf, lives, around, t...  


# Step 4: Load Pre-trained Word2Vec Model

In [17]:
# Name of the pre-trained embedding set requested from gensim's downloader
W2V_MODEL_NAME = 'word2vec-google-news-300'

# Fetch the vectors, announcing the start and end of the load
print("Loading Word2Vec model...")
w2v_model = api.load(W2V_MODEL_NAME)
print("Model loaded successfully!")

Loading Word2Vec model...
Model loaded successfully!


# Step 5: Convert Messages to Fixed-Length Vectors

In [25]:
def message_to_vector(message, model, vector_size=300):
    """Average the word vectors of a tokenized message into one fixed-length vector.

    Args:
        message: Iterable of tokens.
        model: Word-vector lookup exposing ``key_to_index`` (membership test)
            and ``model[word]`` (vector retrieval).
        vector_size: Length of the returned vector; must match the length of
            the vectors stored in ``model``.

    Returns:
        numpy.ndarray: The element-wise mean of the vectors of all tokens found
        in the model's vocabulary, or an all-zero vector when none are found.
    """
    # Look up only the tokens the model knows; everything else is ignored
    known_vectors = [model[token] for token in message if token in model.key_to_index]

    # Accumulate into a zero-initialised array, one word vector at a time
    total = np.zeros(vector_size)
    for word_vector in known_vectors:
        total += word_vector

    # Divide by the number of contributing words (skipped when there are none)
    if known_vectors:
        total /= len(known_vectors)

    return total

# Feature matrix: one averaged word vector per message, stacked row-wise
X = np.array([
    message_to_vector(tokens, w2v_model)
    for tokens in df['processed']
])

# Target vector: the numeric label column as a NumPy array
y = df['label'].to_numpy()

# Report the dimensions of the feature matrix
print(f"Shape of feature matrix: {X.shape}")

Shape of feature matrix: (5572, 300)


# Step 6: Split Data into Training and Testing Sets

In [28]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
TEST_FRACTION = 0.2
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 4457
Testing set size: 1115


# Step 7: Train Logistic Regression Classifier

In [31]:
# Build the logistic-regression model (iteration cap set to 1000) and fit it
# on the training vectors
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Score the held-out test set: predict labels, then compare with the true ones
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9426


# Step 8: Create Prediction Function

In [34]:
def predict_message_class(model, w2v_model, message):
    """Classify a single raw message as "spam" or "ham".

    Args:
        model: Fitted classifier exposing ``predict``.
        w2v_model: Word-vector model handed on to ``message_to_vector``.
        message: Raw message text.

    Returns:
        str: "spam" when the predicted label equals 1, otherwise "ham".
    """
    # Same pipeline as training: tokenize, then average the word vectors
    tokens = preprocess_text(message)
    features = message_to_vector(tokens, w2v_model)

    # predict() is given a single-row 2-D array: shape (1, n_features)
    predicted_label = model.predict(features.reshape(1, -1))[0]

    # Map the numeric label back to a human-readable name
    if predicted_label == 1:
        return "spam"
    return "ham"

# Step 9: Example Usage

In [37]:
# Two sample messages: one promotional, one conversational
test_message = "WINNER!! You've been selected for a free vacation. Call now to claim!"
test_message2 = "Hey, how about meeting for coffee tomorrow?"

# Classify both with the trained model
prediction = predict_message_class(classifier, w2v_model, test_message)
prediction2 = predict_message_class(classifier, w2v_model, test_message2)

# Report each message with its predicted class
print(f"Message: {test_message}")
print(f"Prediction: {prediction}")

print(f"\nMessage: {test_message2}")
print(f"Prediction: {prediction2}")

Message: WINNER!! You've been selected for a free vacation. Call now to claim!
Prediction: spam

Message: Hey, how about meeting for coffee tomorrow?
Prediction: ham
