In [1]:
import pandas as pd
messages=pd.read_csv('SMSSpamCollection.txt',sep='\t',names=["label","message"])

In [2]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
import re
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [5]:
corpus=[]
for i in range(len(messages)):
  review=re.sub('[^a-zA-Z]',' ',messages['message'][i])
  review=review.lower()
  review=review.split()
  review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
  review=' '.join(review)
  corpus.append(review)

## Bag of Words Model

In [39]:
# creating a Bag of words model
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=2500,binary=True,ngram_range=(2,2))
X=cv.fit_transform(corpus).toarray()

In [40]:
X.shape

(5572, 2500)

In [8]:
y=pd.get_dummies(messages['label']).iloc[:,1].values.astype(int)

In [9]:
#Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report
spam_detect_model=MultinomialNB().fit(X_train,y_train)
y_pred=spam_detect_model.predict(X_test)
score=accuracy_score(y_test,y_pred)
score

0.9730941704035875

In [11]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98       985
           1       0.81      1.00      0.90       130

    accuracy                           0.97      1115
   macro avg       0.91      0.98      0.94      1115
weighted avg       0.98      0.97      0.97      1115



## TFIDF Model

In [12]:
# Creating the TFIDF Model
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(max_features=2500,ngram_range=(1,2))
X=tv.fit_transform(corpus).toarray()

In [13]:
#Train_test_split
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
spam_detect_model=MultinomialNB().fit(X_train,y_train)

In [14]:
y_pred=spam_detect_model.predict(X_test)
score=accuracy_score(y_test,y_pred)
score

0.97847533632287

In [15]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       979
           1       0.85      1.00      0.92       136

    accuracy                           0.98      1115
   macro avg       0.93      0.99      0.95      1115
weighted avg       0.98      0.98      0.98      1115



## WORD2VEC

In [None]:
!pip install gensim



In [None]:
import gensim.downloader as api

wv=api.load('word2vec-google-news-300')



In [16]:
from nltk.stem import WordNetLemmatizer
import nltk
import re
nltk.download('wordnet')
lemmatizer=WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [43]:
corpus=[]
for i in range(len(messages)):
  review=re.sub('[^a-zA-Z]',' ',messages['message'][i])
  review=review.lower()
  review=review.split()
  review=[lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
  review=' '.join(review)
  corpus.append(review)

In [18]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [54]:
words=[]
for sent in corpus:
  words.append(sent.split())

In [57]:
len(words)

5572

In [58]:
import gensim
model=gensim.models.Word2Vec(words,window=5,min_count=2)
len(model.wv.index_to_key)

3571

In [59]:
model.corpus_count

5572

In [60]:
model.wv.similar_by_word('free')
model.wv['kid'].shape

(100,)

In [61]:
import numpy as np
def avg_word2vec(doc):
  return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)

In [None]:
!pip install tqdm



In [62]:
#apply for the entire sentences
from tqdm import tqdm
X=[]
for i in tqdm(range(len(words))):
  X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5572/5572 [00:01<00:00, 3566.96it/s]


In [63]:
# Fix single numeric values
X_fixed = [xi if isinstance(xi, (list, np.ndarray)) else [xi] for xi in X]

# Pad sequences to ensure consistent length
max_len = max(len(xi) for xi in X_fixed)
X_new = np.array([np.pad(xi, (0, max_len - len(xi))) for xi in X_fixed])

In [64]:
y=pd.get_dummies(messages['label']).iloc[:,1].values.astype(int)

In [74]:
#Train_test_split
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
X_train,X_test,y_train,y_test=train_test_split(X_new,y,test_size=0.2,random_state=0)


In [75]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')  # Replace 'mean' with 'median' or 'most_frequent' as needed
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)


In [76]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()  # Scales data to a range of [0, 1]
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [78]:
spam_detect_w2v_model=MultinomialNB().fit(X_train,y_train)

In [81]:
y_pred=spam_detect_w2v_model.predict(X_test)
score=accuracy_score(y_test,y_pred)
score

0.8466367713004485

In [82]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.86      0.92      1102
           1       0.01      0.08      0.01        13

    accuracy                           0.85      1115
   macro avg       0.50      0.47      0.46      1115
weighted avg       0.98      0.85      0.91      1115

