In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
messages = pd.read_csv('./data/spam.csv', encoding='latin1')
messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
messages = messages[['v1', 'v2']]
messages.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
import re
import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\AGLI
[nltk_data]     IT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [24]:
corpus = []
for i in tqdm(range(len(messages))):
    review = messages['v2'][i].split()
    review = [lemmatizer.lemmatize(word.lower()) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

100%|██████████| 5572/5572 [00:38<00:00, 145.55it/s]


In [25]:
corpus[:5]

['go jurong point, crazy.. available bugis n great world la e buffet... cine got amore wat...',
 'ok lar... joking wif u oni...',
 "free entry 2 wkly comp win fa cup final tkts 21st may 2005. text fa 87121 receive entry question(std txt rate)t&c's apply 08452810075over18's",
 'u dun say early hor... u c already say...',
 'nah i think go usf, life around though']

In [26]:
%%time
tfidf = TfidfVectorizer(max_features=100)
X = tfidf.fit_transform(corpus).toarray()
X

CPU times: total: 109 ms
Wall time: 194 ms


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
X[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.43619427, 0.        , 0.        ,
       0.46142751, 0.54314377, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

# N-Grams

In [33]:
%%time
tfidf = TfidfVectorizer(max_features=200, ngram_range=(2, 2))
X = tfidf.fit_transform(corpus).toarray()
X

CPU times: total: 312 ms
Wall time: 360 ms


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [34]:
tfidf.vocabulary_

{'free entry': 57,
 'rate apply': 134,
 'to claim': 161,
 'claim call': 35,
 'claim code': 36,
 'had mobile': 75,
 'free call': 56,
 'chance win': 33,
 'txt word': 169,
 'we re': 186,
 'let know': 97,
 'feel like': 54,
 'please call': 127,
 'call 08000930705': 19,
 'lt gt': 107,
 'missed call': 109,
 'want go': 181,
 'first time': 55,
 'like lt': 98,
 'sms ac': 149,
 'sorry ll': 150,
 'll call': 102,
 'call later': 22,
 'ur awarded': 171,
 'call now': 25,
 'call free': 21,
 'thats cool': 158,
 'how much': 89,
 'hi hi': 84,
 'call customer': 20,
 'customer service': 42,
 'service representative': 147,
 'won guaranteed': 192,
 'guaranteed 1000': 71,
 '1000 cash': 1,
 'you re': 196,
 'we trying': 187,
 'trying contact': 165,
 'contact you': 41,
 'draw show': 49,
 'prize guaranteed': 133,
 'guaranteed call': 72,
 'valid 12hrs': 177,
 'we went': 188,
 'selected receive': 144,
 'speak live': 151,
 'live operator': 101,
 'private your': 131,
 'account statement': 12,
 'identifier code': 92,
 

In [35]:
sorted_vocab = sorted(tfidf.vocabulary_.items(), key=lambda x: x[1])
sorted_vocab

[('08000839402 call2optout', 0),
 ('1000 cash', 1),
 ('10p min', 2),
 ('150p msg', 3),
 ('1st week', 4),
 ('2000 prize', 5),
 ('2003 account', 6),
 ('2lands row', 7),
 ('2nd attempt', 8),
 ('350 award', 9),
 ('750 anytime', 10),
 ('800 un', 11),
 ('account statement', 12),
 ('across sea', 13),
 ('anytime network', 14),
 ('attempt contact', 15),
 ('await collection', 16),
 ('bonus caller', 17),
 ('bt national', 18),
 ('call 08000930705', 19),
 ('call customer', 20),
 ('call free', 21),
 ('call later', 22),
 ('call me', 23),
 ('call mobileupd8', 24),
 ('call now', 25),
 ('caller prize', 26),
 ('camcorder reply', 27),
 ('camera phone', 28),
 ('can get', 29),
 ('can it', 30),
 ('cant pick', 31),
 ('cash await', 32),
 ('chance win', 33),
 ('chat 86688', 34),
 ('claim call', 35),
 ('claim code', 36),
 ('claim ur', 37),
 ('co uk', 38),
 ('come back', 39),
 ('come home', 40),
 ('contact you', 41),
 ('customer service', 42),
 ('dating service', 43),
 ('decimal gt', 44),
 ('do want', 45),
 ('don

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import  train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, messages['v1'], test_size=0.2)


In [38]:
model = LogisticRegression(verbose=1)
model.fit(X_train, y_train)

In [41]:
y_preds = model.predict(X_test)

In [46]:
np.unique(y_preds, return_counts=True)

(array(['ham', 'spam'], dtype=object), array([1025,   90], dtype=int64))

In [50]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

0.9372197309417041