In [3]:
import pandas as pd

# Load the two useful columns of the SMS corpus, give them readable names,
# encode the label as an integer (ham -> 0, spam -> 1) and drop incomplete rows.
# Labels other than 'ham'/'spam' map to NaN and are removed by dropna() too.
df = (
    pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
    .assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
    .dropna()
)

df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [23]:
import re
from sklearn.model_selection import train_test_split
def clean_text(text):
    """Return a lower-cased, whitespace-normalised version of ``text``.

    Runs of non-word characters (punctuation, whitespace) and runs of digits
    are each replaced by a single space; the remaining whitespace is then
    collapsed and trimmed. Underscores survive because the regex engine
    treats them as word characters.

    Args:
        text: the raw message as a ``str``.

    Returns:
        The cleaned string, with tokens separated by single spaces.
    """
    cleaned = text.lower()
    # Order matters: the digit pass introduces spaces that the final
    # whitespace pass has to collapse.
    for pattern in (r'\W+', r'\d+', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.strip()

# Normalise every message in place of the raw text column.
df['text'] = df['text'].map(clean_text)

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# ham/spam ratio the same in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    stratify=df['label'],
    test_size=0.2,
    random_state=42,
)

In [37]:
# Imported in this cell so that it runs on a fresh kernel.
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

# Learn the vocabulary / IDF weights from the training messages only, then
# project the test messages onto that same vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# `vectorizer` must stay bound to the fitted instance: X_train_vec and
# X_test_vec are only meaningful together with its vocabulary. To prune the
# vocabulary, pass max_features=5000, min_df=5, max_df=0.9 to the constructor
# above and re-run this cell, rather than rebinding the name to a new,
# unfitted vectorizer after the matrices have been built.

In [39]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Multinomial Naive Bayes baseline on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and training chain.
model = MultinomialNB().fit(X_train_vec, y_train)

# Evaluate on the held-out split: overall accuracy plus per-class
# precision / recall / F1.
y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9632286995515695
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [41]:
# Everything this cell needs is imported here so it runs on a fresh kernel.
from scipy.stats import uniform
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV

# Rebuild the TF-IDF features (unigrams + bigrams, English stop words removed)
# so that `vectorizer` is a fitted instance matching the matrices used below.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression()

# Search space: inverse regularisation strength C drawn uniformly from [0, 4],
# with either an L1 or an L2 penalty. Only liblinear is offered as solver, and
# it supports both penalties.
param_dist = {
    'C': uniform(loc=0, scale=4),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

# 5 random candidates x 2-fold cross-validation = 10 fits, run on all cores.
# The fixed random_state makes the sampled candidates repeatable.
random_search = RandomizedSearchCV(
    model, param_distributions=param_dist, n_iter=5, cv=2, verbose=1, n_jobs=-1, random_state=42
)

random_search.fit(X_train_vec, y_train)

print("Best Parameters:", random_search.best_params_)
print("Best CV Score:", random_search.best_score_)

# best_estimator_ is the winning configuration refit on the full training set.
best_model = random_search.best_estimator_

# Final check on the held-out split.
y_pred = best_model.predict(X_test_vec)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Fitting 2 folds for each of 5 candidates, totalling 10 fits
Best Parameters: {'C': 2.3946339367881464, 'penalty': 'l1', 'solver': 'liblinear'}
Best CV Score: 0.9501904066922636
Test Accuracy: 0.9641255605381166
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       966
           1       0.86      0.87      0.87       149

    accuracy                           0.96      1115
   macro avg       0.92      0.93      0.92      1115
weighted avg       0.96      0.96      0.96      1115



In [42]:
from joblib import dump

# Persist the tuned classifier and the fitted vectorizer it was trained with.
# Both files are needed at inference time: raw text has to go through this
# exact vectorizer before it is handed to the model.
dump(value=best_model, filename='spam_model.joblib', compress=3)

dump(value=vectorizer, filename='tfidf_vectorizer.joblib', compress=3)

['tfidf_vectorizer.joblib']