In [8]:
import pandas as pd 
import nltk 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB, BernoulliNB 
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, auc 
import string 
from nltk.corpus import wordnet

In [9]:
data = pd.read_csv('text messages for spam mail.csv')


In [31]:
data.head()

Unnamed: 0,Category,Message
0,ham,go jurong point crazy .. available bugis n gre...
1,ham,ok lar ... joke wif u oni ...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor ... u c already say ...
4,ham,nah n't think go usf life around though


In [11]:
data.Message[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

***Ways to display sample stopwards and punctuations***

In [12]:
#to display stopwords available file
stopwords.fileids()

['arabic',
 'azerbaijani',
 'basque',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [None]:
#stopwords.words('bengali')
#stopwords.words('french')
#stopwords.words('nepali')

In [13]:
#stopwords
stopwords.words('english')


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
#nltk.download('stopwords')
#download if not working

In [14]:
#to display punctuation
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# Step 1 : Remove punctuation and stopwards

In [15]:
stopword = set(stopwords.words('english'))
stopword

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [16]:
def preprocess_text(text):
    if isinstance(text, str):
        # Tokenize the text into individual words
        words = nltk.word_tokenize(text.lower())

        # Remove punctuation
        words = [word for word in words if word not in string.punctuation]

        # Remove stopwords and return the filtered words as a string
        filtered_words = [word for word in words if word not in stopword]

        # Join the filtered words back into a sentence
        filtered_text = ' '.join(filtered_words)

        return filtered_text
    else:
        return ''

In [17]:
data['Message'] = data['Message'].apply(preprocess_text)

In [18]:
data['Message']

0       go jurong point crazy .. available bugis n gre...
1                         ok lar ... joking wif u oni ...
2       free entry 2 wkly comp win fa cup final tkts 2...
3             u dun say early hor ... u c already say ...
4              nah n't think goes usf lives around though
                              ...                        
5567    2nd time tried 2 contact u. u £750 pound prize...
5568                          ü b going esplanade fr home
5569                            pity mood ... suggestions
5570    guy bitching acted like 'd interested buying s...
5571                                       rofl true name
Name: Message, Length: 5572, dtype: object

# Step 2: Lemmatization

In [25]:
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to NOUN if the POS tag is not recognized

def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(words)
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(pos_tag)) for word, pos_tag in pos_tags]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text



In [26]:
data['Message'] = data['Message'].apply(lemmatize_text)

In [30]:
data['Message']

0       go jurong point crazy .. available bugis n gre...
1                           ok lar ... joke wif u oni ...
2       free entry 2 wkly comp win fa cup final tkts 2...
3             u dun say early hor ... u c already say ...
4                 nah n't think go usf life around though
                              ...                        
5567    2nd time try 2 contact u. u £750 pound prize 2...
5568                             ü b go esplanade fr home
5569                             pity mood ... suggestion
5570    guy bitch act like 'd interested buying someth...
5571                                       rofl true name
Name: Message, Length: 5572, dtype: object

# Step 3: TF-IDF vectorizer

In [32]:
vectorize = TfidfVectorizer()
x = vectorize.fit_transform(data['Message'])
y = data['Category']

# Step 4: Fit models

In [33]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size = .2, random_state = 42)

# Step 5: Performance evaluation

In [34]:
models = [
    MultinomialNB(),
    BernoulliNB()
]

for model in models:
    model.fit(xtrain, ytrain)

    ypred = model.predict(xtest)
    ypred_proba = model.predict_proba(xtest)[:, 1]

    print(f"Model: {type(model).__name__}")
    print('Accuracy Score =',model.score(xtest, ytest))
    print("Confusion Matrix:")
    print(confusion_matrix(ytest, ypred))
    print("AUC Score:", roc_auc_score(ytest, ypred_proba))

    print('\n')

Model: MultinomialNB
Accuracy Score = 0.9695067264573991
Confusion Matrix:
[[966   0]
 [ 34 115]]
AUC Score: 0.9813873025136521


Model: BernoulliNB
Accuracy Score = 0.9775784753363229
Confusion Matrix:
[[960   6]
 [ 19 130]]
AUC Score: 0.987674906554393




# Step 6: Make Prediction on Sample text 

In [36]:
random_text = 'These messages claim that the recipient has won a lottery of $100000000 and request personal information or payment to receive the supposed winnings. Remember, you have to send $1000 to claim this lottery. Legitimate lotteries and contests do not ask for upfront fees to claim prizes before the deadline'

preprocessed_text = preprocess_text(random_text)
lemmatized_text = lemmatize_text(preprocessed_text)
text_vector = vectorize.transform([lemmatized_text])

for model in models:
    prediction = model.predict(text_vector)
    print(f"Model: {type(model).__name__}")
    print("Prediction:", prediction)
    print('\n')

Model: MultinomialNB
Prediction: ['spam']


Model: BernoulliNB
Prediction: ['spam']


