In [2]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score 
# 2. Load Data directly from GitHub 
url = "https://raw.githubusercontent.com/ramar92/NLP-Dataset/main/spam.csv" 
df = pd.read_csv(url, encoding='latin-1')[['v1', 'v2']]  # Keep only 'v1' (label) and 'v2' 

# Rename columns 
df.columns = ['label', 'message'] 
# 3. Preprocessing 
# Convert 'ham' to 0 and 'spam' to 1 
df['label_num'] = df.label.map({'ham': 0, 'spam': 1}) 
X = df['message'] 
y = df['label_num'] 
# 4. Feature Extraction using CountVectorizer 
vectorizer = CountVectorizer() 
X_vec = vectorizer.fit_transform(X) 
# 5. Train-Test Split 
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, 
random_state=42)

In [5]:
model = MultinomialNB() 
model.fit(X_train, y_train) 
 
# 7. Evaluation 
y_pred = model.predict(X_test) 
print("Accuracy:", accuracy_score(y_test, y_pred)) 
 
# 8. Custom Prediction Function 
def predict_spam(text): 
    text_vec = vectorizer.transform([text]) 
    pred = model.predict(text_vec) 
    return "Spam" if pred[0] else "Not Spam" 
 
# Example Predictions 
print(predict_spam("Congratulations! You've won a free cruise. Reply WIN to claim.")) 
print(predict_spam("Hi John, can we reschedule our meeting?"))

Accuracy: 0.97847533632287
Spam
Not Spam
