##### Spam Ham project using Bag of Words (BOW)

Steps:
1) Import dataset
2) Preprocess dataset - convert to lowercase, remove special characters using regex, lemmatize/stem
3) Test-Train split (do this before converting words to vectors so that there is no leakage of data between train and test)
4) Convert words to vectors using TF-IDF 
5) Train using ML algorithm like Naive Bayes
6) Predict and check accuracy

In [1]:
import pandas as pd

In [4]:
# Load the dataset
messages = pd.read_csv('./dataset/SMSSpamCollection.csv', sep='\t', names=['label', 'message'])

In [3]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
import nltk


In [10]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [12]:
lemmatizer = WordNetLemmatizer()

In [14]:
# Preprocess the data

corpus = []

for i in range(0,len(messages)):
    words = messages['message'][i].lower() #change all words to lowercase
    words = re.sub('^a-z',' ', words) # remove special characters
    words = words.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    words = ' '.join(words)
    corpus.append(words)

corpus

['go jurong point, crazy.. available bugis n great world la e buffet... cine got amore wat...',
 'ok lar... joking wif u oni...',
 "free entry 2 wkly comp win fa cup final tkts 21st may 2005. text fa 87121 receive entry question(std txt rate)t&c's apply 08452810075over18's",
 'u dun say early hor... u c already say...',
 'nah think go usf, life around though',
 "freemsg hey darling 3 week's word back! i'd like fun still? tb ok! xxx std chgs send, £1.50 rcv",
 'even brother like speak me. treat like aid patent.',
 "per request 'melle melle (oru minnaminunginte nurungu vettam)' set callertune callers. press *9 copy friend callertune",
 'winner!! valued network customer selected receivea £900 prize reward! claim call 09061701461. claim code kl341. valid 12 hour only.',
 'mobile 11 month more? u r entitled update latest colour mobile camera free! call mobile update co free 08002986030',
 "i'm gonna home soon want talk stuff anymore tonight, k? i've cried enough today.",
 'six chance win ca

In [21]:
Y = pd.get_dummies(messages['label'], dtype=int)
Y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [22]:
Y = Y['spam'].values
Y

array([0, 0, 1, ..., 0, 0, 0])

In [31]:
# Test train split
from sklearn.model_selection import train_test_split
X_train, X_test,Y_train,Y_test = train_test_split(corpus, Y, test_size=0.3, random_state=3)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=2000, ngram_range=(1,2))

In [33]:
X_train = tfidf.fit_transform(X_train).toarray()
X_test = tfidf.transform(X_test).toarray()

In [34]:
tfidf.vocabulary_

{'decide': 482,
 'co': 385,
 'si': 1541,
 'going': 729,
 'home': 832,
 'liao': 977,
 'co si': 386,
 'going home': 730,
 'heard': 802,
 'call': 291,
 'night': 1194,
 'on': 1251,
 'make': 1059,
 'like': 982,
 'last': 946,
 'time': 1725,
 'xx': 1970,
 'luv': 1052,
 'net': 1177,
 'pls': 1332,
 'enough': 580,
 'family': 614,
 'hot': 841,
 'sun': 1645,
 'place': 1322,
 'no': 1199,
 'reason': 1414,
 'if': 868,
 'invited': 888,
 'actually': 121,
 'go': 719,
 'wait': 1853,
 'serious': 1519,
 'yup': 1999,
 'wun': 1965,
 'believe': 233,
 'wat': 1874,
 'really': 1413,
 'neva': 1180,
 'msg': 1152,
 'sent': 1517,
 'shuhui': 1540,
 'hi': 815,
 'always': 151,
 'online': 1255,
 'yahoo': 1974,
 'would': 1961,
 'chat': 357,
 'yes': 1984,
 'he': 798,
 'great': 755,
 'told': 1737,
 'kallis': 916,
 'best': 234,
 'world': 1956,
 'tough': 1754,
 'get': 699,
 'out': 1277,
 'die': 502,
 'want': 1864,
 'hey': 812,
 'horny': 838,
 'see': 1500,
 'naked': 1171,
 'text': 1684,
 'charged': 355,
 '150pm': 42,
 'unsubs

In [35]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train,Y_train)



In [38]:
Y_pred = nb.predict(X_test) 

In [39]:
from sklearn.metrics import accuracy_score, classification_report

accuracy_score(Y_test, Y_pred)

0.9808612440191388

In [40]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1440
           1       1.00      0.87      0.93       232

    accuracy                           0.98      1672
   macro avg       0.99      0.93      0.96      1672
weighted avg       0.98      0.98      0.98      1672

