In [1]:
import pandas as pd 
import nltk
from nltk.corpus import stopwords
from collections import Counter

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sugita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# https://www.kaggle.com/code/mfaisalqureshi/email-spam-detection-98-accuracy
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Filter out stopwords, special characters, and symbols
df['Message'] = df['Message'].str.lower().apply(nltk.word_tokenize)
stop_words = set(stopwords.words('english'))
df['Message'] = df['Message'].apply(lambda x: [word for word in x if word.isalpha() and word not in stop_words])

# Count the frequency of every word in the messages and get the 20 most common for both spam and non-spam
spam_messages = df[df['Category'] == 'spam']['Message']
non_spam_messages = df[df['Category'] == 'ham']['Message']

def count_words(messages):
  res = Counter()

  for message in messages:
    for word in message:
      res[word] += 1
    
  return res

top_spam_words = count_words(spam_messages).most_common(20)
top_non_spam_words = count_words(non_spam_messages).most_common(20)

In [4]:
for word, count in top_spam_words:
    print(f"{word}: {count}") 

call: 343
free: 219
txt: 156
ur: 144
u: 138
mobile: 123
text: 121
stop: 118
claim: 111
reply: 104
prize: 90
get: 84
send: 69
new: 69
nokia: 65
cash: 62
urgent: 62
win: 60
service: 55
contact: 55


In [5]:
for word, count in top_non_spam_words:
    print(f"{word}: {count}") 

u: 994
gt: 318
lt: 316
get: 302
go: 251
ur: 247
ok: 246
got: 245
know: 237
like: 233
call: 232
good: 231
come: 230
time: 198
love: 197
day: 191
ü: 173
going: 169
one: 168
want: 164
