Dataset Link: https://www.kaggle.com/datasets/team-ai/spam-text-message-classification

In [3]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
# Kaggle API key

!kaggle datasets download -d team-ai/spam-text-message-classification

spam-text-message-classification.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
import zipfile
zip_ref = zipfile.ZipFile(file="/content/spam-text-message-classification.zip", mode='r')
zip_ref.extractall('/content')
zip_ref.close()

In [6]:
import pandas as pd
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [7]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [8]:
path = r"/content/SPAM text message 20170820 - Data.csv"
df = pd.read_csv(path)

In [9]:
df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [10]:
df.shape

(5572, 2)

In [11]:
# Check null values

df.isnull().sum()

Category    0
Message     0
dtype: int64

In [12]:
# See the distribuation of target column

df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [13]:
df.loc[3, 'Message']

'U dun say so early hor... U c already then say...'

In [14]:
re.sub(pattern=r"[^a-zA-Z]", repl=" ", string=df.loc[3, 'Message'])

'U dun say so early hor    U c already then say   '

In [15]:
lemmatizer = WordNetLemmatizer()

In [16]:
# text preprocessing

corpus = []

for i in range(len(df)):
    text = re.sub(pattern=r"[^a-zA-Z]", repl=" ", string=df.loc[i, 'Message'])
    text = text.lower()
    text = text.split() # return list of words

    text = [lemmatizer.lemmatize(word=word) for word in text if word not in stopwords.words('english')]
    text = " ".join(text)

    corpus.append(text)

In [17]:
# List of preprocess sentences

corpus[0:10]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free']

In [18]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [19]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [21]:
print(df.loc[5, 'Message'])

FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv


In [20]:
# it will apply some preprocess and return word tokenize

simple_preprocess(doc=df.loc[5, 'Message'])

['freemsg',
 'hey',
 'there',
 'darling',
 'it',
 'been',
 'week',
 'now',
 'and',
 'no',
 'word',
 'back',
 'like',
 'some',
 'fun',
 'you',
 'up',
 'for',
 'it',
 'still',
 'tb',
 'ok',
 'xxx',
 'std',
 'chgs',
 'to',
 'send',
 'to',
 'rcv']

In [22]:
sent_tokenize(text=corpus[0])

['go jurong point crazy available bugis n great world la e buffet cine got amore wat']

In [23]:
words = []

for i in corpus:
    word = simple_preprocess(doc=i)
    words.append(word)

In [24]:
len(words)

5572

In [25]:
# List of list. Inside list contain every sentence as a tokenize form.
words[0:10]

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receivea',
 

In [26]:
import gensim
import gensim.downloader as api

In [26]:
# This is word2vec pre-trained model which is trained on google news
# evey word have 300 dimension features vector

wv = api.load('word2vec-google-news-300')



In [27]:
vec_king = wv['king']

In [28]:
vec_king

array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
       -1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
        3.46679688e-02,  5.21850586e-03,  4.63867188e-02,  1.28906250e-01,
        1.36718750e-01,  1.12792969e-01,  5.95703125e-02,  1.36718750e-01,
        1.01074219e-01, -1.76757812e-01, -2.51953125e-01,  5.98144531e-02,
        3.41796875e-01, -3.11279297e-02,  1.04492188e-01,  6.17675781e-02,
        1.24511719e-01,  4.00390625e-01, -3.22265625e-01,  8.39843750e-02,
        3.90625000e-02,  5.85937500e-03,  7.03125000e-02,  1.72851562e-01,
        1.38671875e-01, -2.31445312e-01,  2.83203125e-01,  1.42578125e-01,
        3.41796875e-01, -2.39257812e-02, -1.09863281e-01,  3.32031250e-02,
       -5.46875000e-02,  1.53198242e-02, -1.62109375e-01,  1.58203125e-01,
       -2.59765625e-01,  

In [29]:
vec_king.shape

(300,)

In [None]:
# def __init__(sentences=None, 
#              corpus_file=None, 
#              size=100, alpha=0.025, 
#              window=5, min_count=5, 
#              max_vocab_size=None, 
#              sample=0.001, seed=1, workers=3, 
#              min_alpha=0.0001, sg=0, hs=0, negative=5, 
#              ns_exponent=0.75, cbow_mean=1, hashfxn=hash, 
#              iter=5, null_word=0, trim_rule=None, 
#              sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, 
#              compute_loss=False, callbacks=(), max_final_vocab=None)

In [27]:
### Lets train Word2vec from scratch for our own corpus

# here, words is our list of list. inner list contains word tokenizer 
# that means all words of a sentence

# min_count=2 means a word will be consider if the word exists minimum 2 times in our corpus
model = gensim.models.Word2Vec(sentences=words, window=5, min_count=2)



In [28]:
# Number of unique words in our corpus

len(model.wv.index2entity)

3540

In [29]:
model.wv.index2entity[0:10]

['call', 'get', 'ur', 'gt', 'go', 'lt', 'ok', 'day', 'free', 'know']

In [30]:
len(model.wv.index2word)

3540

In [31]:
model.wv.index2word[0:10]

['call', 'get', 'ur', 'gt', 'go', 'lt', 'ok', 'day', 'free', 'know']

In [32]:
model.corpus_count

5572

In [33]:
model.epochs

5

In [34]:
model.wv.similar_by_word(word="kid", topn=15)

[('age', 0.9990001916885376),
 ('game', 0.9989858865737915),
 ('best', 0.9989431500434875),
 ('yes', 0.9989081621170044),
 ('work', 0.9988887906074524),
 ('plus', 0.9988856315612793),
 ('yo', 0.9988827109336853),
 ('customer', 0.9988759160041809),
 ('cost', 0.9988712072372437),
 ('box', 0.9988601207733154),
 ('get', 0.9988587498664856),
 ('went', 0.9988584518432617),
 ('cash', 0.9988583922386169),
 ('please', 0.9988570213317871),
 ('oh', 0.9988563060760498)]

In [35]:
# Every word is represented 100 dimensional features vector

model.wv['kid']

array([-0.10516834, -0.02269057, -0.04734379, -0.09557092, -0.03819584,
        0.03429196,  0.07479838,  0.01577865, -0.10699267, -0.01907971,
       -0.02248671,  0.08691894,  0.01420994,  0.05650796, -0.13937494,
       -0.11310227,  0.0049842 , -0.05414366, -0.11004584, -0.06439323,
       -0.06720372, -0.04364338,  0.06025408,  0.0199883 , -0.01786945,
        0.08050638,  0.09764443, -0.0193694 ,  0.04264391,  0.05703606,
        0.02076781, -0.06735208, -0.01111178, -0.07442364,  0.00482114,
        0.06553613, -0.13074227,  0.01136053,  0.04279895,  0.06627662,
       -0.02496226, -0.03185632, -0.02617375,  0.07285105,  0.05578927,
       -0.05288224, -0.06501235, -0.08693227, -0.10971463,  0.00824292,
       -0.07883868, -0.0349388 ,  0.0573584 , -0.05145945,  0.06997936,
        0.06049981,  0.00191342,  0.08025155, -0.16477615, -0.08639488,
        0.01249336, -0.04320706, -0.0430557 ,  0.08058712,  0.09568198,
        0.05096142,  0.06229369, -0.0991145 ,  0.10107716,  0.06

In [36]:
model.wv['kid'].shape

(100,)

In [37]:
# keep in mind if your word not present in your own corpus or vocabulary
# then it will give you error
model.wv['alamin']

KeyError: ignored

In [38]:
# what if we apply 'alamin' on pre-trained model also give you error

In [39]:
import numpy as np

In [44]:
words[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [43]:
'go' in model.wv.index2word

True

In [45]:
def avg_word2vec(doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv.index2word], axis=0)
        

In [46]:
from tqdm import tqdm

In [47]:
len(words)

5572

In [48]:
# apply for the entire sentences

X = []

for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5572/5572 [00:00<00:00, 6053.33it/s]


In [50]:
type(X)

list

In [51]:
len(X)

5572

In [53]:
X[0:3]

[array([-0.28086072, -0.07450065, -0.1414689 , -0.2716724 , -0.10589484,
         0.10520037,  0.21387501,  0.03530839, -0.28152874, -0.04076406,
        -0.0655301 ,  0.23947178,  0.04389069,  0.16564603, -0.38271973,
        -0.3236284 ,  0.00184614, -0.15150341, -0.29030845, -0.17494808,
        -0.1920072 , -0.11423212,  0.16833234,  0.06105424, -0.04452452,
         0.22437726,  0.28089687, -0.06703775,  0.10765088,  0.14861007,
         0.05527568, -0.19373685, -0.04238017, -0.21182996,  0.01571254,
         0.16689126, -0.3546991 ,  0.03364808,  0.11758693,  0.17601685,
        -0.05546058, -0.0998951 , -0.06570843,  0.191144  ,  0.16788332,
        -0.14032246, -0.18546484, -0.24435943, -0.29418814,  0.02515033,
        -0.22282608, -0.10027228,  0.15577203, -0.14960046,  0.18916456,
         0.15212975,  0.01523546,  0.23370767, -0.45681083, -0.22715016,
         0.04005737, -0.11618716, -0.12211725,  0.21050508,  0.2572911 ,
         0.12874807,  0.16753392, -0.28457236,  0.2

In [54]:
X[0]

array([-0.28086072, -0.07450065, -0.1414689 , -0.2716724 , -0.10589484,
        0.10520037,  0.21387501,  0.03530839, -0.28152874, -0.04076406,
       -0.0655301 ,  0.23947178,  0.04389069,  0.16564603, -0.38271973,
       -0.3236284 ,  0.00184614, -0.15150341, -0.29030845, -0.17494808,
       -0.1920072 , -0.11423212,  0.16833234,  0.06105424, -0.04452452,
        0.22437726,  0.28089687, -0.06703775,  0.10765088,  0.14861007,
        0.05527568, -0.19373685, -0.04238017, -0.21182996,  0.01571254,
        0.16689126, -0.3546991 ,  0.03364808,  0.11758693,  0.17601685,
       -0.05546058, -0.0998951 , -0.06570843,  0.191144  ,  0.16788332,
       -0.14032246, -0.18546484, -0.24435943, -0.29418814,  0.02515033,
       -0.22282608, -0.10027228,  0.15577203, -0.14960046,  0.18916456,
        0.15212975,  0.01523546,  0.23370767, -0.45681083, -0.22715016,
        0.04005737, -0.11618716, -0.12211725,  0.21050508,  0.2572911 ,
        0.12874807,  0.16753392, -0.28457236,  0.2747394 ,  0.16

In [55]:
# now every full sentence is represented 100 dimension vector space
X[0].shape

(100,)