In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv('../data/SMSSpamCollection.csv', sep='\t',header=None,names=['label', 'message'])

In [3]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
df['label'].unique()

array(['ham', 'spam'], dtype=object)

'label' is the target column which is classified as:
- 'ham' for non spam messages
- and 'spam'

In [6]:
df[df.duplicated()]

Unnamed: 0,label,message
103,ham,As per your request 'Melle Melle (Oru Minnamin...
154,ham,As per your request 'Melle Melle (Oru Minnamin...
207,ham,"As I entered my cabin my PA said, '' Happy B'd..."
223,ham,"Sorry, I'll call later"
326,ham,No calls..messages..missed calls
...,...,...
5524,spam,You are awarded a SiPix Digital Camera! call 0...
5535,ham,"I know you are thinkin malaria. But relax, chi..."
5539,ham,Just sleeping..and surfing
5553,ham,Hahaha..use your brain dear


There are duplicated records which can be dropped

In [7]:
df.drop_duplicates(inplace=True)
df=df.reset_index(drop=True)
df.shape

(5169, 2)

In [8]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df['label'].value_counts()

label
ham     4516
spam     653
Name: count, dtype: int64

In [10]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [11]:
lemmatizer=WordNetLemmatizer()

corpus=[]
for i in range(len(df)):
    tokens=re.sub(r'[^a-zA-Z]',' ',df['message'][i]).lower()
    tokens=word_tokenize(tokens)
    tokens=[lemmatizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')]
    tokens=' '.join(tokens)
    corpus.append(tokens)

In [12]:
print('original message : ',df['message'][100])
print('lemmatized message : ',corpus[100])

original message :  Please don't text me anymore. I have nothing else to say.
lemmatized message :  please text anymore nothing else say


In [29]:
target = df['label']

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import optuna
from optuna import Trial

In [33]:
def objective(trial:Trial, data=corpus, target=target):
    vectorizer=trial.suggest_categorical('vectorizer',['Count','Tfidf'])
    max_features=trial.suggest_categorical('max_features',[2500,3000,5000,7000,10000])
    
    if vectorizer=='Count':
        vec=CountVectorizer(max_features=max_features)
    else:
        vec=TfidfVectorizer(max_features=max_features)
    
    X=vec.fit_transform(data).toarray()
    X_train,X_test,y_train,y_test=train_test_split(X,target,test_size=0.25,random_state=20)
    model=MultinomialNB()
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    score=accuracy_score(y_test,y_pred)
    return score

In [35]:
study=optuna.create_study(direction='maximize')
study.optimize(objective,n_trials=30)
study.best_trial.params

[I 2024-01-15 16:43:22,405] A new study created in memory with name: no-name-e6a4f97b-a12f-4f70-a06e-582191127a58


[I 2024-01-15 16:43:22,726] Trial 0 finished with value: 0.9814385150812065 and parameters: {'vectorizer': 'Count', 'max_features': 2500}. Best is trial 0 with value: 0.9814385150812065.
[I 2024-01-15 16:43:23,195] Trial 1 finished with value: 0.9582366589327146 and parameters: {'vectorizer': 'Tfidf', 'max_features': 10000}. Best is trial 0 with value: 0.9814385150812065.
[I 2024-01-15 16:43:23,687] Trial 2 finished with value: 0.9582366589327146 and parameters: {'vectorizer': 'Tfidf', 'max_features': 10000}. Best is trial 0 with value: 0.9814385150812065.
[I 2024-01-15 16:43:24,319] Trial 3 finished with value: 0.9783449342614076 and parameters: {'vectorizer': 'Count', 'max_features': 5000}. Best is trial 0 with value: 0.9814385150812065.
[I 2024-01-15 16:43:24,660] Trial 4 finished with value: 0.9651972157772621 and parameters: {'vectorizer': 'Tfidf', 'max_features': 5000}. Best is trial 0 with value: 0.9814385150812065.
[I 2024-01-15 16:43:24,876] Trial 5 finished with value: 0.9729

{'vectorizer': 'Count', 'max_features': 3000}

using CountVectorizer to train the model

In [37]:
cv=CountVectorizer(max_features=3000)
X=cv.fit_transform(corpus).toarray()
y=target
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25)

model=MultinomialNB()
model.fit(X_train,y_train)

print('training accuracy: ',model.score(X_train,y_train))
y_pred=model.predict(X_test)
print('testing accuracy: ',accuracy_score(y_test,y_pred))

training accuracy:  0.9871001031991744
testing accuracy:  0.982985305491106


In [142]:
df.tail()

Unnamed: 0,label,message
5164,spam,This is the 2nd time we have tried 2 contact u...
5165,ham,Will ü b going to esplanade fr home?
5166,ham,"Pity, * was in mood for that. So...any other s..."
5167,ham,The guy did some bitching but I acted like i'd...
5168,ham,Rofl. Its true to its name


choosing a spam message and predicting using the trained model

In [38]:
# the message is a spam
df['message'][5164]

'This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.'

In [39]:
msg=df['message'][5164]

In [40]:
data=cv.transform([msg]).toarray()
print(model.predict(data))

['spam']
