In [368]:
import pandas as pd

messages = pd.read_csv('smsspamcollection/SMSSpamCollection', sep='\t',
                           names=["label", "message"])

In [369]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [370]:
messages['message'].loc[451]


'hanks lotsly!'

In [371]:
#Data cleaning and preprocessing
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bhaskaradhikari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [372]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [373]:
corpus=[]
stop_word=set(stopwords.words('english'))
for i in range(len(messages)):
   review=re.sub('[^a-zA-Z0-9]',' ',messages['message'][i])
   review=review.lower()
   review=review.split()
   review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
   review = ' '.join(review)
   corpus.append(review)

In [374]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus)
X

<5572x2500 sparse matrix of type '<class 'numpy.int64'>'
	with 41240 stored elements in Compressed Sparse Row format>

In [375]:
map_arr={'ham':0,'spam':1}
y=messages['label'].map(map_arr)
print(y.shape)


(5572,)


In [376]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [377]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)


In [378]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [379]:
from sklearn.metrics import accuracy_score
score=accuracy_score(y_test,y_pred)
print(score)

0.9874439461883409


In [380]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       955
           1       0.96      0.96      0.96       160

    accuracy                           0.99      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [381]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer(max_features=2500)
X=tf.fit_transform(corpus)


In [382]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [383]:
spam_detect_model.fit(X_train,y_train)

In [384]:
#prediction
y_pred=spam_detect_model.predict(X_test)
score=accuracy_score(y_test,y_pred)
print(score)

0.9847533632286996


In [385]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       972
           1       0.89      1.00      0.94       143

    accuracy                           0.98      1115
   macro avg       0.95      0.99      0.97      1115
weighted avg       0.99      0.98      0.99      1115



In [386]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)

In [387]:
y_pred=classifier.predict(X_test)
score=accuracy_score(y_pred,y_test)
score

0.9838565022421525

In [388]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       973
           1       0.89      1.00      0.94       142

    accuracy                           0.98      1115
   macro avg       0.94      0.99      0.97      1115
weighted avg       0.99      0.98      0.98      1115

