In [1]:
import pandas as pd
import chardet
# Sniff the character encoding from the file's raw bytes so it can be passed to read_csv.
with open('SMSSpamCollection.csv', 'rb') as raw_file:
    raw_bytes = raw_file.read()
encoding = chardet.detect(raw_bytes)['encoding']


In [2]:
# The file is read as tab-separated; column names are supplied explicitly via `names`.
df = pd.read_csv(
    'SMSSpamCollection.csv',
    sep="\t",
    names=["label", "message"],
    encoding=encoding,
)

In [3]:
# Preview the first rows to check that the label/message columns parsed as expected.
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [5]:
# Show one raw message (row label 100) in full, un-truncated.
df.loc[100, 'message']

"Please don't text me anymore. I have nothing else to say."


**Data Cleaning and Preprocessing**


1.   Text Preprocessing
  *   Tokenization
  *   Stop Words
  *   Stemming
  *   Lemmatization
  *   NLTK Library

2.   Word Embeddings
  *   BOW
  *   TFIDF
  *   Word2Vec
  *   AvgWord2Vec





In [6]:

import nltk
import string
# Download the NLTK resources used by the preprocessing cells below.
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords
# Tokenizer models — presumably what word_tokenize relies on.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead — confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')  # WordNet data for WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [8]:
# Split every raw message into a list of tokens with NLTK's word tokenizer.
df["message_preprocess"] = df["message"].apply(word_tokenize)
df["message_preprocess"].head()

0    [Go, until, jurong, point, ,, crazy, .., Avail...
1             [Ok, lar, ..., Joking, wif, u, oni, ...]
2    [Free, entry, in, 2, a, wkly, comp, to, win, F...
3    [U, dun, say, so, early, hor, ..., U, c, alrea...
4    [Nah, I, do, n't, think, he, goes, to, usf, ,,...
Name: message_preprocess, dtype: object

In [9]:
# Token list produced for the message at row label 100.
df.loc[100, "message_preprocess"]



['Please',
 'do',
 "n't",
 'text',
 'me',
 'anymore',
 '.',
 'I',
 'have',
 'nothing',
 'else',
 'to',
 'say',
 '.']

In [10]:
# Fetch NLTK's English stop words, then show them alongside the punctuation
# characters from the `string` module (one per output line).
stopwords_list = stopwords.words('english')
print(stopwords_list, string.punctuation, sep="\n")

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
# Combined exclusion list: English stop words followed by each punctuation character.
stop_words = [*stopwords.words('english'), *string.punctuation]


In [12]:
# Lowercasing, stop-word and punctuation removal, number removal
def filter_tokens(tokens, excluded=None):
    """Lowercase tokens and drop stop words, punctuation and non-alphabetic tokens.

    Args:
        tokens: Iterable of string tokens (e.g. the output of ``word_tokenize``).
        excluded: Optional collection of lowercase tokens to drop. When omitted,
            the notebook-level ``stop_words`` list is used.

    Returns:
        A new list of lowercase, purely alphabetic tokens in their original
        order, with every excluded token removed.
    """
    # A set gives constant-time membership checks instead of scanning the list per token.
    blocked = set(stop_words if excluded is None else excluded)
    kept = []
    for word in tokens:
        lowered = word.lower()  # lowercase once and reuse it for both checks
        # isalpha() rejects numbers, punctuation and mixed tokens such as "n't".
        if lowered.isalpha() and lowered not in blocked:
            kept.append(lowered)
    return kept

In [13]:
# Replace each token list with its lowercase, stop-word-free, alphabetic-only version.
df["message_preprocess"] = df["message_preprocess"].apply(filter_tokens)
df["message_preprocess"].head()

0    [go, jurong, point, crazy, available, bugis, n...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, wkly, comp, win, fa, cup, final,...
3        [u, dun, say, early, hor, u, c, already, say]
4       [nah, think, goes, usf, lives, around, though]
Name: message_preprocess, dtype: object

In [14]:
# Filtered token list for the message at row label 1100.
df.loc[1100, "message_preprocess"]

['ne',
 'thing',
 'interesting',
 'good',
 'birthday',
 'u',
 'wrking',
 'nxt',
 'started',
 'uni',
 'today']

In [27]:
#stemming vs lemmatization
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()


def stemming(tokens):
    """Return a list holding the Porter stem of each token, in input order."""
    return list(map(stemmer.stem, tokens))

In [28]:
# df["message_preprocess_new"]=df['message_preprocess'].apply(lambda x: stemming(x))
# df["message_preprocess_new"].head()

0    [go, jurong, point, crazi, avail, bugi, n, gre...
1                         [ok, lar, joke, wif, u, oni]
2    [free, entri, wkli, comp, win, fa, cup, final,...
3        [u, dun, say, earli, hor, u, c, alreadi, say]
4         [nah, think, goe, usf, live, around, though]
Name: message_preprocess_new, dtype: object

In [29]:
# Quick check of the stemmer on a small hand-written token list.
# Note: this rebinds the notebook-level name `tokens`.
tokens =["ok", "lar", "jokinging", "wif", "u", "oni","joining"]
new=stemming(tokens)
print(new)

['ok', 'lar', 'joking', 'wif', 'u', 'oni', 'join']


In [31]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Tagger model needed by nltk.pos_tag, which get_wordnet_pos calls below.
nltk.download('averaged_perceptron_tagger')
# WordNet data for the lemmatizer (an earlier cell already requests this package).
nltk.download('wordnet')

def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``, defaulting to noun.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is used to choose the WordNet category.
    """
    tagged_pair = nltk.pos_tag([word])[0]
    initial = tagged_pair[1][0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both resolve to noun.
    return wordnet.NOUN

def lemmatization(tokens):
    """Lemmatize each token using the POS category chosen by ``get_wordnet_pos``.

    Returns a new list of lemmas in the same order as ``tokens``.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for tok in tokens:
        pos = get_wordnet_pos(tok)
        lemmas.append(wordnet_lemmatizer.lemmatize(tok, pos))
    return lemmas

# Example usage: run the lemmatizer on a few inflected words and print the result.
# Note: this rebinds the notebook-level name `tokens`.
tokens = ["running", "cars", "swimming"]
lemmatized_tokens = lemmatization(tokens)
print(lemmatized_tokens)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['run', 'car', 'swim']


In [32]:
# Reduce every remaining token to its Porter stem (stemming, not lemmatization, is applied here).
# NOTE(review): the column is overwritten in place, so re-running this cell
# stems tokens that have already been stemmed.
df["message_preprocess"] = df["message_preprocess"].apply(stemming)
df["message_preprocess"].head()

0    [go, jurong, point, crazi, avail, bugi, n, gre...
1                         [ok, lar, joke, wif, u, oni]
2    [free, entri, wkli, comp, win, fa, cup, final,...
3        [u, dun, say, earli, hor, u, c, alreadi, say]
4         [nah, think, goe, usf, live, around, though]
Name: message_preprocess, dtype: object