In [1]:
import pandas as pd
import nltk
import numpy as np

In [2]:
df=pd.read_csv("E:\ml-bootcamp\section-51\Spam-Ham Project\SMSSpamCollection.csv",sep='\t',names=["result","messages"])

In [3]:
df.head()

Unnamed: 0,result,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
messages=df['messages'].to_list()
print(messages)



In [5]:
# next, we'll perform data cleaning and text preprocessing
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer=WordNetLemmatizer()


corpus=[]

for i in range(len(messages)):
    # take each sentence
    sentence=messages[i]
    # then remove non english characters
    sentence=re.sub(pattern="[^a-zA-Z]",repl=" ",string=sentence)
    # then convert the sentence to lower
    sentence=sentence.lower()
    # then take the words into a list
    words=sentence.split()
    # and stem each word if they are not a stopword
    words=[lemmatizer.lemmatize(word,pos='v') for word in words if word not in (stopwords.words('english'))]
    # then join back all the words to form a sentence
    word_sentence=" ".join(words)
    # and add the lemmatized sentence back to the corpus list
    corpus.append(word_sentence)

In [6]:
print(corpus)

['go jurong point crazy available bugis n great world la e buffet cine get amore wat', 'ok lar joke wif u oni', 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply', 'u dun say early hor u c already say', 'nah think go usf live around though', 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv', 'even brother like speak treat like aid patent', 'per request melle melle oru minnaminunginte nurungu vettam set callertune callers press copy friends callertune', 'winner value network customer select receivea prize reward claim call claim code kl valid hours', 'mobile months u r entitle update latest colour mobiles camera free call mobile update co free', 'gonna home soon want talk stuff anymore tonight k cry enough today', 'six chance win cash pound txt csh send cost p day days tsandcs apply reply hl info', 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw', 'search righ

In [17]:
# now create a tf-idf model
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf=TfidfVectorizer(max_features=500,ngram_range=(1,2))

# transform the texts to vectors
X=tf_idf.fit_transform(corpus).toarray()

print(X)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [18]:
# view the unique vocabulary
tf_idf.vocabulary_

{'go': 161,
 'point': 334,
 'great': 170,
 'world': 483,
 'get': 157,
 'wat': 464,
 'ok': 299,
 'lar': 222,
 'wif': 473,
 'free': 148,
 'entry': 132,
 'win': 475,
 'st': 399,
 'may': 256,
 'text': 414,
 'receive': 354,
 'question': 346,
 'txt': 446,
 'rate': 348,
 'apply': 18,
 'dun': 120,
 'say': 366,
 'early': 122,
 'already': 9,
 'think': 422,
 'live': 238,
 'around': 20,
 'though': 424,
 'hey': 192,
 'week': 467,
 'word': 481,
 'back': 29,
 'like': 234,
 'fun': 155,
 'still': 402,
 'xxx': 490,
 'send': 374,
 'even': 134,
 'brother': 46,
 'speak': 396,
 'per': 316,
 'set': 377,
 'friends': 151,
 'network': 283,
 'customer': 90,
 'select': 372,
 'prize': 341,
 'claim': 68,
 'call': 51,
 'code': 74,
 'valid': 454,
 'hours': 201,
 'mobile': 268,
 'update': 449,
 'latest': 226,
 'colour': 77,
 'camera': 54,
 'co': 72,
 'free call': 149,
 'gonna': 165,
 'home': 196,
 'soon': 392,
 'want': 463,
 'talk': 411,
 'stuff': 406,
 'tonight': 434,
 'enough': 130,
 'today': 429,
 'chance': 60,
 'c

In [19]:
# now we need to encode our output feature as well
y=pd.get_dummies(df['result'],dtype=int)
y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [20]:
# select the spam column as output
y=y.iloc[:,1].values
y

array([0, 0, 1, ..., 0, 0, 0])

In [21]:
# now do train test split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

X_train.shape,X_test.shape

((4179, 500), (1393, 500))

In [22]:
# next do the model training
from sklearn.naive_bayes import MultinomialNB

spam_classifier=MultinomialNB()

In [23]:
# train the model
spam_classifier.fit(X_train,y_train)

# and get the predictions
y_pred=spam_classifier.predict(X_test)

In [24]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [25]:
# lets test metrics
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score

cm=confusion_matrix(y_true=y_test,y_pred=y_pred)
print(cm)

[[1201    6]
 [  25  161]]


In [26]:
acc_score=accuracy_score(y_true=y_test,y_pred=y_pred)
precision=precision_score(y_true=y_test,y_pred=y_pred)
recall=recall_score(y_true=y_test,y_pred=y_pred)


print(f"Accuracy: {acc_score}")
print(f"Precision score: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.9777458722182341
Precision score: 0.9640718562874252
Recall: 0.8655913978494624
