In [14]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder


def _read_sms_tsv(path):
    """Read a headerless, tab-separated SMS file into a (label, text) frame.

    Parameters
    ----------
    path : str
        Location of a TSV file with the class label in the first field and
        the raw message text in the second.

    Returns
    -------
    pandas.DataFrame
        Frame with exactly two columns: 'label' and 'text'.
    """
    return pd.read_csv(
        path,
        sep='\t',
        header=None,
        names=['label', 'text'],
        # 3 == csv.QUOTE_NONE. SMS bodies are free text, so a stray '"' must be
        # kept as an ordinary character; with pandas' default quoting an
        # unbalanced quote makes the parser glue the following rows into one
        # field. NOTE(review): assumes the files are raw, unquoted TSV --
        # confirm by comparing len(df) with the files' line counts.
        quoting=3,
    )


# Load training and validation data
train_df = _read_sms_tsv("train-data.tsv")
valid_df = _read_sms_tsv("valid-data.tsv")

# Check the first few rows
print(train_df.head())


  label                                               text
0   ham  ahhhh...just woken up!had a bad dream about u ...
1   ham                           you can never do nothing
2   ham  now u sound like manky scouse boy steve,like! ...
3   ham  mum say we wan to go then go... then she can s...
4   ham  never y lei... i v lazy... got wat? dat day ü ...


In [11]:
# Sanity check: both frames should expose the same column names.
print("Train dataframe columns:", list(train_df.columns))
print("Valid dataframe columns:", list(valid_df.columns))

Train dataframe columns: ['label', 'text']
Valid dataframe columns: ['label', 'text']


In [12]:
# Separate the message text (model input) from the label (prediction target)
# for the training and validation sets.
train_data, train_target = train_df['text'], train_df['label']
valid_data, valid_target = valid_df['text'], valid_df['label']


In [15]:
# Turn the string labels into integer codes. LabelEncoder numbers the classes
# in sorted order, so 'ham' becomes 0 and 'spam' becomes 1 (assuming those are
# the only two labels present in the training data).
label_encoder = LabelEncoder().fit(train_target)
train_target_enc = label_encoder.transform(train_target)

In [16]:
# Two-step text classifier:
#   'tfidf' - turns raw messages into TF-IDF vectors, with English stop words removed
#   'clf'   - multinomial Naive Bayes fitted on those vectors
model = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', MultinomialNB()),
    ]
)

# Fit the vectorizer and the classifier together on the training messages
model.fit(train_data, train_target_enc)

In [17]:
def predict_message(text, threshold=0.5):
    """Classify a single SMS message as 'ham' or 'spam'.

    Parameters
    ----------
    text : str
        The raw message to classify.
    threshold : float, optional
        The message is labelled 'spam' only when its spam probability is
        strictly greater than this value. Defaults to 0.5.

    Returns
    -------
    list
        ``[prob_spam, label]`` where ``prob_spam`` is a built-in ``float`` and
        ``label`` is either ``"spam"`` or ``"ham"``.
    """
    # predict_proba works on a batch, so wrap the message in a list and take
    # the first (only) row. Column 1 is treated as the spam class, matching
    # the ham -> 0 / spam -> 1 label encoding used for training.
    spam_column = 1
    # Cast to a plain float so the result prints as a bare number rather than
    # as a NumPy scalar repr (e.g. ``np.float64(...)`` under NumPy >= 2).
    prob_spam = float(model.predict_proba([text])[0][spam_column])
    label = "spam" if prob_spam > threshold else "ham"
    return [prob_spam, label]

In [18]:
# Smoke test: one promotional-looking message followed by one casual message.
for message in (
    "Free entry in 2 a weekly competition to win FA Cup final tickets! Text FA to 12345",
    "Hey, want to meet for lunch later?",
):
    print(predict_message(message))

[0.7741848058606982, 'spam']
[0.0013756120339473365, 'ham']
