In [14]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [15]:
# Fetch the NLTK stop-word corpus only when it is not installed yet, so that
# re-running the notebook does not contact the download server every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # `find` raises LookupError when the resource is missing from every
    # configured nltk_data directory.
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Tiny hand-labelled corpus: three spam and three ham messages, with the
# labels listed in the same order as the messages.
data = dict(
    message=[
        "Congratulations you have won a free lottery",
        "Call me when you are free",
        "Win cash prizes now",
        "Let us meet tomorrow",
        "You have been selected for a prize",
        "Are we still meeting today",
    ],
    label=["Spam", "Ham", "Spam", "Ham", "Spam", "Ham"],
)

# One row per message; columns follow the dict's insertion order.
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,message,label
0,Congratulations you have won a free lottery,Spam
1,Call me when you are free,Ham
2,Win cash prizes now,Spam
3,Let us meet tomorrow,Ham
4,You have been selected for a prize,Spam
5,Are we still meeting today,Ham


In [17]:
def clean_text(text):
    """Normalise a message before it is vectorised.

    The text is lower-cased and every character that is not a letter
    ``a-z`` or whitespace is deleted. Digits and punctuation are removed
    outright rather than replaced by a space, so ``"don't"`` becomes
    ``"dont"``.

    Args:
        text: The raw message string. ``None`` and float ``NaN`` are treated
            as an empty message instead of raising ``AttributeError``.

    Returns:
        The cleaned, lower-cased string, or ``""`` for a missing value.
    """
    # `NaN != NaN` is the dependency-free way to detect a float NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return re.sub(r'[^a-z\s]', '', text.lower())

# Store a normalised copy of each message next to the raw text, then display
# the updated frame.
df["clean_message"] = df["message"].map(clean_text)
df

Unnamed: 0,message,label,clean_message
0,Congratulations you have won a free lottery,Spam,congratulations you have won a free lottery
1,Call me when you are free,Ham,call me when you are free
2,Win cash prizes now,Spam,win cash prizes now
3,Let us meet tomorrow,Ham,let us meet tomorrow
4,You have been selected for a prize,Spam,you have been selected for a prize
5,Are we still meeting today,Ham,are we still meeting today


In [18]:
# Target labels, taken directly from the frame.
y = df["label"]

# Bag-of-words counts over the cleaned messages, ignoring NLTK's English
# stop words.
# NOTE(review): the vocabulary is learned from every row before the
# train/test split in the next cell — confirm that is acceptable here.
vectorizer = CountVectorizer(
    stop_words=stopwords.words('english'),
)
X = vectorizer.fit_transform(df["clean_message"])

In [22]:
# Hold out a test portion (test_size=0.2); the fixed seed makes the split
# reproducible across runs.
# NOTE(review): the split is not stratified, so with this few rows the class
# balance of the training part depends entirely on the seed — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [20]:
# Baseline classifier: logistic regression on the bag-of-words counts.
# `LogisticRegression` comes from `sklearn.linear_model`; it must be imported
# in the imports cell, otherwise this cell raises NameError on a fresh kernel.
# The next cell rebinds `model` to a MultinomialNB, so this fit only matters
# when that cell is skipped.
model = LogisticRegression().fit(X_train, y_train)  # fit() returns the estimator
model

In [23]:
# Train the multinomial Naive Bayes classifier on the training counts.
# fit() returns the estimator itself, so `model` is the fitted classifier and
# the trailing expression displays it as the cell output.
model = MultinomialNB().fit(X_train, y_train)
model

In [24]:
# Classify one unseen message with the fitted pipeline pieces.
sample_message = ["Win a free iPhone now"]

# Apply the same cleaning used on the training data, then reuse the already
# fitted vectorizer (transform only, no re-fitting).
sample_clean = [clean_text(raw) for raw in sample_message]
sample_vector = vectorizer.transform(sample_clean)

# predict() returns one label per input row; there is only one row here.
prediction = model.predict(sample_vector)
print("Message Type:", prediction[0])

Message Type: Spam
