In [1]:
import pandas as pd
import numpy as np

In [69]:
df = pd.read_csv('spam_classification_3col.csv')
df.head()

Unnamed: 0,text_col_1,text_col_2,text_col_3,label
0,meeting schedule schedule discussion review,update meeting report meeting schedule,client meeting project review client,not_spam
1,money prize offer prize urgent,win free limited win offer,offer prize urgent prize credit,spam
2,reminder team client update review,update client discussion meeting schedule,schedule meeting schedule project schedule,not_spam
3,team team schedule schedule reminder,review review project discussion project,team reminder review client review,not_spam
4,discussion schedule meeting team schedule,schedule reminder update team review,team reminder report schedule schedule,not_spam


In [70]:
df[df['label'] == 'spam']

Unnamed: 0,text_col_1,text_col_2,text_col_3,label
1,money prize offer prize urgent,win free limited win offer,offer prize urgent prize credit,spam
9,click money credit click win,limited urgent buy money free,click win money click click,spam
10,urgent click limited prize offer,click urgent click free limited,money buy credit prize free,spam
11,buy buy buy credit urgent,limited offer buy win prize,click free buy buy click,spam
12,limited buy prize money offer,limited prize limited limited prize,free buy click buy offer,spam
...,...,...,...,...
286,limited win win credit buy,credit urgent money click credit,limited click limited buy win,spam
288,offer win free win money,click prize offer win limited,credit credit credit free prize,spam
293,limited urgent limited buy free,offer prize free win credit,prize free offer limited limited,spam
295,money urgent offer free click,credit urgent offer urgent offer,win click prize urgent prize,spam


In [71]:
df.nunique()

text_col_1    300
text_col_2    300
text_col_3    299
label           2
dtype: int64

In [72]:
df.shape

(300, 4)

In [73]:
df['label'] = df['label'].map({'spam': 1, 'not_spam': 0})

In [74]:
for col in ['text_col_1', 'text_col_2', 'text_col_3']:
    df[f'{col}_word_count'] = df[col].str.split().apply(len)
    df[f'{col}_unique_words'] = df[col].apply(lambda x: len(set(x.split())))
    df[f'{col}_has_money'] = df[col].apply(lambda x: 1 if 'money' in x else 0)
    df[f'{col}_has_click'] = df[col].apply(lambda x: 1 if 'click' in x else 0)

In [75]:
df.head()

Unnamed: 0,text_col_1,text_col_2,text_col_3,label,text_col_1_word_count,text_col_1_unique_words,text_col_1_has_money,text_col_1_has_click,text_col_2_word_count,text_col_2_unique_words,text_col_2_has_money,text_col_2_has_click,text_col_3_word_count,text_col_3_unique_words,text_col_3_has_money,text_col_3_has_click
0,meeting schedule schedule discussion review,update meeting report meeting schedule,client meeting project review client,0,5,4,0,0,5,4,0,0,5,4,0,0
1,money prize offer prize urgent,win free limited win offer,offer prize urgent prize credit,1,5,4,1,0,5,4,0,0,5,4,0,0
2,reminder team client update review,update client discussion meeting schedule,schedule meeting schedule project schedule,0,5,5,0,0,5,5,0,0,5,3,0,0
3,team team schedule schedule reminder,review review project discussion project,team reminder review client review,0,5,3,0,0,5,3,0,0,5,4,0,0
4,discussion schedule meeting team schedule,schedule reminder update team review,team reminder report schedule schedule,0,5,4,0,0,5,5,0,0,5,4,0,0


In [58]:
df.isnull().sum()

text_col_1                 0
text_col_2                 0
text_col_3                 0
label                      0
text_col_1_word_count      0
text_col_1_unique_words    0
text_col_1_has_money       0
text_col_1_has_click       0
text_col_2_word_count      0
text_col_2_unique_words    0
text_col_2_has_money       0
text_col_2_has_click       0
text_col_3_word_count      0
text_col_3_unique_words    0
text_col_3_has_money       0
text_col_3_has_click       0
dtype: int64

In [76]:
X = df.drop(columns=['text_col_1','text_col_2','text_col_3','label'])
y = df['label']

In [79]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

In [80]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(210, 12) (90, 12) (210,) (90,)


In [81]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

In [82]:
predictions = model.predict(X_test)

In [89]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score

report = classification_report(y_test, predictions)
print(report)

cm = confusion_matrix(y_test, predictions)
print(cm)

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        49
           1       1.00      0.93      0.96        41

    accuracy                           0.97        90
   macro avg       0.97      0.96      0.97        90
weighted avg       0.97      0.97      0.97        90

[[49  0]
 [ 3 38]]


In [93]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [95]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.01, 0.1, 1, 10],       
    'penalty': ['l1', 'l2'],       
    'solver': ['liblinear'],       
    'max_iter': [100, 200, 300]
}

grid = GridSearchCV(model, param_grid, cv=5, verbose=1)
grid.fit(X_train_scaled, y_train)

print("Best Parameters:", grid.best_params_)

y_pred = grid.predict(X_test_scaled)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'C': 10, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98        49
           1       1.00      0.95      0.97        41

    accuracy                           0.98        90
   macro avg       0.98      0.98      0.98        90
weighted avg       0.98      0.98      0.98        90

