In [1]:
import pandas as pd

messages = pd.read_csv('/content/SMSSpamCollection.txt', sep='\t',
                           names=["label", "message"])

In [2]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
messages['message'].loc[231]

'Get down in gandhipuram and walk to cross cut road. Right side &lt;#&gt; street road and turn at first right.'

In [4]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [14]:
corpus = []

for i in range(0, len(messages)):
  review = re.sub('[^a-zA-Z]', ' ', messages["message"][i])
  review = review.lower()
  review = review.split()

  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)

  corpus.append(review)

In [62]:
corpus[1]

'ok lar joke wif u oni'

In [27]:
# creating bow model

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, binary=True)
X = cv.fit_transform(corpus).toarray()

In [17]:
y = pd.get_dummies(messages['label'])

In [19]:
y = y.iloc[:,1].values

In [21]:
y = y.astype('int')

In [22]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [23]:
from sklearn.model_selection import train_test_split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [31]:
y_train.shape

(4457,)

In [32]:
from sklearn.naive_bayes import MultinomialNB
spam_detector = MultinomialNB()
spam_detector.fit(X_train, y_train)

In [34]:
#prediction
y_pred=spam_detector.predict(X_test)
from sklearn.metrics import accuracy_score,classification_report
score=accuracy_score(y_test,y_pred)
print(score)

0.9865470852017937


In [36]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       960
           1       0.94      0.97      0.95       155

    accuracy                           0.99      1115
   macro avg       0.97      0.98      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [49]:
from sklearn.naive_bayes import MultinomialNB
spam_detector = MultinomialNB()
spam_detector.fit(X_train, y_train)

In [50]:
#prediction
y_pred=spam_detector.predict(X_test)
from sklearn.metrics import accuracy_score,classification_report
score=accuracy_score(y_test,y_pred)
print(score)

0.9811659192825112


**Word2Vec**

In [51]:
!pip install gensim



In [52]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')



In [53]:
vec_ = wv['king']

In [55]:
vec_.shape

(300,)

In [56]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [58]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [64]:
corpu = []

for i in range(0, len(messages)):
  review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
  review = review.lower()
  review = review.split()

  review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)

  corpu.append(review)

In [67]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [68]:
words = []

for sent in corpus:
  sent_token = sent_tokenize(sent)
  for send in sent_token:
    words.append(simple_preprocess(send))

In [70]:
words[1]

['ok', 'lar', 'joke', 'wif', 'oni']

In [71]:
import gensim

In [72]:
# train word2vec from scratch
model = gensim.models.Word2Vec(words, window=5, min_count=2)

In [75]:
model.wv.index_to_key[1]

'go'

In [76]:
model.corpus_count

5564

In [77]:
model.epochs

5

In [78]:
model.wv.most_similar('good')

[('thing', 0.9997411966323853),
 ('amp', 0.9996916055679321),
 ('love', 0.9996747970581055),
 ('said', 0.9996510148048401),
 ('feel', 0.9996461868286133),
 ('im', 0.9996460676193237),
 ('like', 0.9996415376663208),
 ('make', 0.999640703201294),
 ('day', 0.9996342658996582),
 ('much', 0.9996215105056763)]

In [79]:
model.wv.similarity('good', 'bad')

0.99924254

In [80]:
model.wv['good'].shape

(100,)

In [81]:
import numpy as np

In [82]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)

    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)



In [83]:
!pip install tqdm
from tqdm import tqdm



In [89]:
words[72]

['hi', 'babe', 'im', 'home', 'wanna', 'someth', 'xx']

In [90]:
# for all sentences
X = []

for i in tqdm(range(len(words))):
  X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5564/5564 [00:00<00:00, 7787.87it/s]


In [91]:
type(X)

list

In [93]:
X_new = np.array(X, dtype=object)

In [94]:
X_new[3]

array([-0.15999818,  0.3349836 ,  0.05681113, -0.06844743,  0.00136097,
       -0.5842459 ,  0.10774758,  0.6164934 , -0.22353071, -0.21462844,
       -0.17961109, -0.46830764, -0.06124707,  0.21660928,  0.12690243,
       -0.22082818, -0.02644612, -0.28381202,  0.02732611, -0.560147  ,
        0.19878538,  0.23982044,  0.0876468 , -0.17384493, -0.0483872 ,
        0.00191054, -0.17610861, -0.38171518, -0.29908136,  0.1159063 ,
        0.25121096,  0.147217  ,  0.12720968, -0.09970454, -0.16393954,
        0.3791065 , -0.05714875, -0.24836887, -0.19365759, -0.6140062 ,
       -0.01794248, -0.25946367, -0.04758632,  0.08839929,  0.32513493,
       -0.15470581, -0.26174456, -0.0109652 ,  0.13788189,  0.21711029,
        0.1644689 , -0.345886  , -0.16756608, -0.11507108, -0.23241423,
        0.14738953,  0.22335948, -0.0100357 , -0.35984543,  0.08063518,
        0.11123413,  0.10754061, -0.08591316, -0.11078011, -0.41618654,
        0.22961068,  0.1855566 ,  0.2724086 , -0.2849926 ,  0.34