### Import Libraries

In [4]:
import numpy as np
import pandas as pd


In [15]:
df=pd.read_csv("spam.csv",encoding='latin1')
df=df.rename(columns={'v1':'label','v2':'text'})
df.head(2)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [8]:
df.shape
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


### Text preprocessing
* tokenization
* stopwords
* stemming/lemmetaization

In [10]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maste\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
#lets check for missing data
df.isna().sum()

v1    0
v2    0
dtype: int64

In [26]:
pattern=r"\b\w*t\w*'t\b" #checking words like won't, haven't , isn't etc
matches=df['text'].apply(lambda x: re.findall(pattern,x) if len(re.findall(pattern,x))>0 else None)
matches.notna().sum()


0

In [33]:
df['text']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: text, Length: 5572, dtype: object

In [48]:
#nltk.download('punkt')
import string
def text_processing(message):
    try:
        corpus=[]
        message=message.lower()
        message=re.sub('[^a-zA-Z0-9]+',' ',message) #specail character regex
        #message=re.sub('\S*\d\S*','',message).strip() #only alphabet regex
        
        #message=message.split() and nltk.word_tokenize does same same work 
        #tokenization -sentence to words
        message=nltk.word_tokenize(message)
    
        for i in message:
            if i.isalnum():
                corpus.append(i)
        message=corpus[:]
        corpus.clear()
    
    #removing stopwords and punctuations
        for i in message:
            if i not in stopwords.words('english') and i not in string.punctuation:
                corpus.append(i.strip())
    
        message=corpus[:]
        corpus.clear()
        #stemming applied on text
        for i in message:
            corpus.append(ps.stem(i))
    
        return corpus
        
        
        
    except Exception as e:
        print(message)
        raise e
df['processed_text']=df['text'].apply(text_processing)
    
            
    
    

In [49]:
df.head()


Unnamed: 0,label,text,processed_text
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazi, avail, bugi, n, gre..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, think, goe, usf, live, around, though]"


In [64]:
#creating the bag of words (words to vectors)
from wordcloud import WordCloud 
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv=CountVectorizer(ngram_range=(2,2),binary=True)
X=cv.fit_transform(df['processed_text'].astype('str')).toarray()
print(X.shape)

(5572, 30638)


In [62]:
#converting categorical value of label column to number
y=pd.get_dummies(df['label'])
y=y.iloc[:,1].values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [65]:
#X and y are converted to numerical of array , lets check X features 
X_features=pd.DataFrame(X,columns=cv.get_feature_names())
X_features.head(5)




Unnamed: 0,00 easter,00 per,00 sub,000 bonu,000 cash,000 homeown,000 pound,000 price,000 prize,000 xma,...,zed pobox,zero save,zhong se,zindgi wo,zoe 18,zoe hit,zogtoriu stare,zoom cine,zouk nichol,zyada kisi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2024)


In [67]:
spam_detect_model=MultinomialNB().fit(X_train,y_train)

In [70]:
y_pred=spam_detect_model.predict(X_test)

In [72]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print(accuracy_score(y_test,y_pred))

0.8663677130044843


## Word2Vec

In [79]:
import gensim
from gensim.models import Word2Vec,keyedvectors


In [77]:
#nltk.download('punkt')
import string
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
def text_processing(message):
    try:
        corpus=[]
        message=message.lower()
        message=re.sub('[^a-zA-Z0-9]+',' ',message) #specail character regex
        #message=re.sub('\S*\d\S*','',message).strip() #only alphabet regex
        
        #message=message.split() and nltk.word_tokenize does same same work 
        #tokenization -sentence to words
        message=nltk.word_tokenize(message)
    
        for i in message:
            if i.isalnum():
                corpus.append(i)
        message=corpus[:]
        corpus.clear()
    
    #removing stopwords and punctuations
        for i in message:
            if i not in stopwords.words('english') and i not in string.punctuation:
                corpus.append(i.strip())
    
        message=corpus[:]
        corpus.clear()
        #lemmatization  applied on text
        for i in message:
            corpus.append(lemmatizer.lemmatize(i))
    
        return corpus
        
        
        
    except Exception as e:
        print(message)
        raise e
df['processed_text_Lemmatize']=df['text'].apply(text_processing)
    
            
    
    

In [84]:
#wordtovec - create one single list for all messages
i=0
list_of_messsages=[]
for i , mesg in enumerate(df['processed_text_Lemmatize']):
    if len(mesg)==0:  #printing index where list in empty
        print(i)
    list_of_messsages.append(mesg)
print(len(list_of_messsages))


252
959
2805
3374
4115
4573
4822
5572


In [87]:
#lets create word2vec model from all the words present in list_of_messsages
ram_gt_16g=True
use_google_w2v = False
train_w2v=True
if train_w2v:
    w2vec_model=Word2Vec(list_of_messsages,min_count=5,vector_size=50,window=5)
    
    

In [95]:
w2vec_model.wv.index_to_key #this will print list of words

['u',
 'call',
 '2',
 'get',
 'ur',
 '4',
 'gt',
 'lt',
 'go',
 'ok',
 'free',
 'day',
 'know',
 'got',
 'come',
 'like',
 'good',
 'time',
 'text',
 'love',
 'want',
 'send',
 'need',
 'one',
 'today',
 'txt',
 'r',
 '1',
 'going',
 'home',
 'c',
 'stop',
 'lor',
 'sorry',
 'see',
 'still',
 'take',
 'mobile',
 'n',
 'back',
 'da',
 'reply',
 'k',
 'think',
 'dont',
 'tell',
 'week',
 'hi',
 'phone',
 'new',
 'later',
 'pls',
 'please',
 'co',
 'msg',
 'make',
 'dear',
 'night',
 'message',
 'say',
 'well',
 'thing',
 'much',
 'min',
 'claim',
 'great',
 'hope',
 'oh',
 'hey',
 'number',
 'na',
 '3',
 'happy',
 'friend',
 'wat',
 'work',
 'give',
 'yes',
 'way',
 'www',
 'let',
 'e',
 'prize',
 'right',
 'tomorrow',
 'wan',
 'already',
 'ask',
 'said',
 'cash',
 'yeah',
 'really',
 'tone',
 'life',
 'b',
 'amp',
 'meet',
 'babe',
 'im',
 'find',
 'win',
 'morning',
 'miss',
 'last',
 'uk',
 'service',
 'thanks',
 'would',
 'year',
 'also',
 'anything',
 'lol',
 'feel',
 'care',
 'keep

In [96]:
#w2vec_model.corpus_count  #total words count
#w2vec_model.epochs  #how many epochs model used

In [98]:
w2vec_model.wv.similar_by_word('call') #vectors & words releated to call

[('cash', 0.9990086555480957),
 ('prize', 0.9988747239112854),
 ('offer', 0.9988082051277161),
 ('awarded', 0.9988008737564087),
 ('customer', 0.998798668384552),
 ('mobile', 0.9987077116966248),
 ('service', 0.9986602663993835),
 ('landline', 0.998612105846405),
 ('latest', 0.9986038208007812),
 ('reply', 0.998502790927887)]