In [52]:
# import necessary packages
import numpy as np
import pandas as pd
import string
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\24746\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\24746\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\24746\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [53]:
# read data
data = pd.read_csv('data/spam.csv', encoding='ISO-8859-1')

# drop extra column
data = data.dropna(how="any", axis=1)
data.columns = ['label', 'text']

data = data.dropna(how="any", axis=1)

In [54]:
data

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [55]:
data['text'] = data['text'].apply(lambda s: s.lower()).str.strip()
data['text'] = data['text'].apply(lambda s: s.translate(str.maketrans(string.whitespace, ' '*len(string.whitespace),'')))
data['text'] = data['text'].drop_duplicates()
# replace non alphabeths with spaces, and collapse spaces
data['text'] = data['text'].replace(r'[^A-Za-z\d]', ' ', regex=True)
data['text'] = data['text'].replace(r'\s+', ' ', regex=True)

In [56]:
# message length
data['length'] = data['text'].astype(str).apply(lambda x: len(x.split(' '))-1)
data.head()

Unnamed: 0,label,text,length
0,ham,go until jurong point crazy available only in ...,20
1,ham,ok lar joking wif u oni,6
2,spam,free entry in 2 a wkly comp to win fa cup fina...,32
3,ham,u dun say so early hor u c already then say,11
4,ham,nah i don t think he goes to usf he lives arou...,13


In [57]:
# tokenization
data['sentence'] = data['text'].astype(str).map(sent_tokenize)

data['words'] = data['sentence'].apply(lambda sentences: [word_tokenize(sentence) for sentence in sentences])
data['words_cleaned'] = data['words'].apply(lambda word_lists: [word for sublist in word_lists for word in sublist if word.isalnum()])

# remove stop words
stop_words = set(stopwords.words('english'))
words_cleaned = data['words_cleaned']

new_filtered_words = [word for word in words_cleaned if word not in stopwords.words('english')]

# emoticons, symbols
emoticon_pattern = re.compile("["  
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002500-\U00002BEF"  # Chinese characters
    u"\U00002702-\U000027B0"
    "]+", flags=re.UNICODE)

data = data.replace(emoticon_pattern, '', regex=True)

# remove extra columns which save tokenized sent and word
# data = data.drop(['words','sentence'], axis=1) 

# stemming
ps = PorterStemmer()
data['stemmed_words'] = data['words_cleaned'].map(lambda words: [ps.stem(word) for word in words])

# lemmatization
lemmatizer = WordNetLemmatizer()
data['lemmatized_words'] = data['words_cleaned'].map(lambda words: [lemmatizer.lemmatize(word) for word in words])

data.to_csv('data/results.csv', index=False)
data

Unnamed: 0,label,text,length,sentence,words,words_cleaned,stemmed_words,lemmatized_words
0,ham,go until jurong point crazy available only in ...,20,[go until jurong point crazy available only in...,"[[go, until, jurong, point, crazy, available, ...","[go, until, jurong, point, crazy, available, o...","[go, until, jurong, point, crazi, avail, onli,...","[go, until, jurong, point, crazy, available, o..."
1,ham,ok lar joking wif u oni,6,[ok lar joking wif u oni],"[[ok, lar, joking, wif, u, oni]]","[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,free entry in 2 a wkly comp to win fa cup fina...,32,[free entry in 2 a wkly comp to win fa cup fin...,"[[free, entry, in, 2, a, wkly, comp, to, win, ...","[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entri, in, 2, a, wkli, comp, to, win, f...","[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,u dun say so early hor u c already then say,11,[u dun say so early hor u c already then say],"[[u, dun, say, so, early, hor, u, c, already, ...","[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, so, earli, hor, u, c, alreadi, t...","[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,nah i don t think he goes to usf he lives arou...,13,[nah i don t think he goes to usf he lives aro...,"[[nah, i, don, t, think, he, goes, to, usf, he...","[nah, i, don, t, think, he, goes, to, usf, he,...","[nah, i, don, t, think, he, goe, to, usf, he, ...","[nah, i, don, t, think, he, go, to, usf, he, l..."
...,...,...,...,...,...,...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...,32,[this is the 2nd time we have tried 2 contact ...,"[[this, is, the, 2nd, time, we, have, tried, 2...","[this, is, the, 2nd, time, we, have, tried, 2,...","[thi, is, the, 2nd, time, we, have, tri, 2, co...","[this, is, the, 2nd, time, we, have, tried, 2,..."
5568,ham,will b going to esplanade fr home,7,[will b going to esplanade fr home],"[[will, b, going, to, esplanade, fr, home]]","[will, b, going, to, esplanade, fr, home]","[will, b, go, to, esplanad, fr, home]","[will, b, going, to, esplanade, fr, home]"
5569,ham,pity was in mood for that so any other suggest...,10,[pity was in mood for that so any other sugges...,"[[pity, was, in, mood, for, that, so, any, oth...","[pity, was, in, mood, for, that, so, any, othe...","[piti, wa, in, mood, for, that, so, ani, other...","[pity, wa, in, mood, for, that, so, any, other..."
5570,ham,the guy did some bitching but i acted like i d...,26,[the guy did some bitching but i acted like i ...,"[[the, guy, did, some, bitching, but, i, acted...","[the, guy, did, some, bitching, but, i, acted,...","[the, guy, did, some, bitch, but, i, act, like...","[the, guy, did, some, bitching, but, i, acted,..."


In [58]:
data['text2'] = pd.Series([
    'Todays Voda numbers ending 1225 are selected to receive a å£50award. \'',
    '     If you have a match please call 08712300220 quoting claim code 3100 standard rates app")',
    'todays voda numbers ending selected receive award match quoting claim code standard rates app'
    ])

# remove extra columns which save tokenized sent and word
data = data.drop(['words','sentence','stemmed_words', 'lemmatized_words'], axis=1) 
data

Unnamed: 0,label,text,length,words_cleaned,text2
0,ham,go until jurong point crazy available only in ...,20,"[go, until, jurong, point, crazy, available, o...",Todays Voda numbers ending 1225 are selected t...
1,ham,ok lar joking wif u oni,6,"[ok, lar, joking, wif, u, oni]",If you have a match please call 087123002...
2,spam,free entry in 2 a wkly comp to win fa cup fina...,32,"[free, entry, in, 2, a, wkly, comp, to, win, f...",todays voda numbers ending selected receive aw...
3,ham,u dun say so early hor u c already then say,11,"[u, dun, say, so, early, hor, u, c, already, t...",
4,ham,nah i don t think he goes to usf he lives arou...,13,"[nah, i, don, t, think, he, goes, to, usf, he,...",
...,...,...,...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...,32,"[this, is, the, 2nd, time, we, have, tried, 2,...",
5568,ham,will b going to esplanade fr home,7,"[will, b, going, to, esplanade, fr, home]",
5569,ham,pity was in mood for that so any other suggest...,10,"[pity, was, in, mood, for, that, so, any, othe...",
5570,ham,the guy did some bitching but i acted like i d...,26,"[the, guy, did, some, bitching, but, i, acted,...",


In [59]:
# Use labelEncoder method to convert class target
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

le.fit_transform(data['text'].astype(str))

array([1174, 3263, 1075, ..., 3418, 4063, 3558])

In [60]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['text'].astype(str))
vectorizer.get_feature_names_out()
print(X.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [61]:
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X2 = vectorizer2.fit_transform(data['text'].astype(str))
vectorizer2.get_feature_names_out()

array(['00 in', '00 per', '00 sub', ..., 'zoom to', 'zouk with',
       'zyada kisi'], dtype=object)

In [62]:
print(X2.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [63]:
df_bow = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df_bow

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zebra,zed,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
sorted_terms = df_bow.sum(axis=0).sort_values(ascending=False)
print("\nTop 5 terms:")
print(sorted_terms.head(5))

print("\nBottom 5 terms:")
print(sorted_terms.tail(5))


Top 5 terms:
you    2133
to     2055
the    1232
and     929
in      825
dtype: int64

Bottom 5 terms:
young          1
younger        1
youphone       1
02072069400    1
0125698789     1
dtype: int64
