In [73]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
df = pd.read_csv('sms_spam_collection.csv')
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Text Preprocessing 1

In [26]:
# Preprocessing 1: Lowercase

df['message'] = df['message'].apply(lambda x: x.lower())
df['message']

0       go until jurong point, crazy.. available only ...
1                           ok lar... joking wif u oni...
2       free entry in 2 a wkly comp to win fa cup fina...
3       u dun say so early hor... u c already then say...
4       nah i don't think he goes to usf, he lives aro...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                 will ü b going to esplanade fr home?
5569    pity, * was in mood for that. so...any other s...
5570    the guy did some bitching but i acted like i'd...
5571                           rofl. its true to its name
Name: message, Length: 5572, dtype: object

In [27]:
# Preprocessing 2: Remove Contraction

contractions_dict ={     
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i had",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "iit will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that had",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they had",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

In [28]:
contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())), flags=re.IGNORECASE | re.DOTALL)

In [29]:
text_contoh = "i'm going to try for 2 months ha ha only joking don't"

print(contractions_pattern.sub('test', text_contoh))

test going to try for 2 months ha ha only joking test


In [30]:
def contraction_replacement(input_text):
    temp_contractions_extracted = contractions_pattern.findall(string=input_text) #melakukan pencarian untuk contractions/singkatan
    if len(temp_contractions_extracted)>0:
        for contractions in temp_contractions_extracted: #untuk setiap singkatan yang ditemukan, akan di replace
            if contractions in contractions_dict.keys(): #kalau kita punya penggantinya, akan kita replace.
                return(contractions_pattern.sub(contractions_dict[contractions], input_text)) #prosedur penggantian
            else:
                return input_text
    else:
        return input_text

In [32]:
df['message'] = df['message'].apply(contraction_replacement)
df

Unnamed: 0,label,message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i do not think he goes to usf, he lives ar..."
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will ü b going to esplanade fr home?
5569,ham,"pity, * was in mood for that. so...any other s..."
5570,ham,the guy did some bitching but i acted like i h...


In [34]:
# Preprocessing 3: Remove or convert number into text

text_contoh = 'Saya lahir di tahun 1997'
print(re.sub(pattern = r'[^a-zA-Z ]', repl='', string=text_contoh))

Saya lahir di tahun 


In [37]:
# Preprocessing 4: Remove punctuation, remove white spaces

df['message'] = df['message'].apply(lambda x: re.sub(pattern= r'[^a-zA-Z ]', repl='', string=x))
df['message'] = df['message'].apply(lambda x: re.sub(pattern= r'\s{2,}', repl=' ', string=x))
df['message'] = df['message'].apply(lambda x: x.strip())
df

Unnamed: 0,label,message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final ...
3,ham,u dun say so early hor u c already then say
4,ham,nah i do not think he goes to usf he lives aro...
...,...,...
5567,spam,this is the nd time we have tried contact u u ...
5568,ham,will b going to esplanade fr home
5569,ham,pity was in mood for that soany other suggestions
5570,ham,the guy did some bitching but i acted like i h...


In [39]:
# Preprocessing 5: Rremove stpo words and particular words

nltk.download('stopwords')
nltk.download('punkt_tab')

def remove_stopwords(sentence):
    stop_words = stopwords.words('english')
    return ' '.join([w for w in nltk.word_tokenize(sentence) if not w in stop_words])

In [42]:
df['message'] = df['message'].apply(lambda x: remove_stopwords(x))
df

Unnamed: 0,label,message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah think goes usf lives around though
...,...,...
5567,spam,nd time tried contact u u pound prize claim ea...
5568,ham,b going esplanade fr home
5569,ham,pity mood soany suggestions
5570,ham,guy bitching acted like interested buying some...


Text Preprocessing 2

In [45]:
stemmer = SnowballStemmer(language='english', ignore_stopwords=True) # untuk mengganti ke kata bakunya

In [61]:
text_contoh = 'care caring cares cared careful daring successfully goes going went gone went'
print(stemmer.stem(text_contoh))

care caring cares cared careful daring successfully goes going went gone w


In [62]:
temp_text_contoh = text_contoh.split(sep=" ")
print(temp_text_contoh)

text_hasil_stem = " ".join([stemmer.stem(x) for x in temp_text_contoh])
print(text_hasil_stem)

['care', 'caring', 'cares', 'cared', 'careful', 'daring', 'successfully', 'goes', 'going', 'went', 'gone', 'went']
care care care care care dare success goe go went gone went


In [None]:
nltk.download('wordnet')

lematizer = WordNetLemmatizer() # untuk ke kata dasarnya

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alhamfebianrinaldy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [64]:
print(lematizer.lemmatize(text_contoh))

care caring cares cared careful daring successfully goes going went gone went


In [65]:
text_hasil_lemmatizer = " ".join([lematizer.lemmatize(x) for x in temp_text_contoh])
print(text_hasil_lemmatizer)

care caring care cared careful daring successfully go going went gone went


In [68]:
def fungsi_stem(sentence):
    return " ".join([stemmer.stem(x) for x in sentence])

def fungsi_lemmatizer(sentence):
    return " ".join([lematizer.lemmatize(x) for x in sentence])

In [72]:
df['message_stem'] = df['message'].apply(lambda x: fungsi_stem(word_tokenize(x)))
df['message_lematizer'] = df['message'].apply(lambda x: fungsi_lemmatizer(word_tokenize(x)))
df

Unnamed: 0,label,message,message_stem,message_lematizer
0,ham,go jurong point crazy available bugis n great ...,go jurong point crazi avail bugi n great world...,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni,ok lar joke wif u oni,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...,free entri wkli comp win fa cup final tkts st ...,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say,u dun say earli hor u c alreadi say,u dun say early hor u c already say
4,ham,nah think goes usf lives around though,nah think goe usf live around though,nah think go usf life around though
...,...,...,...,...
5567,spam,nd time tried contact u u pound prize claim ea...,nd time tri contact u u pound prize claim easi...,nd time tried contact u u pound prize claim ea...
5568,ham,b going esplanade fr home,b go esplanad fr home,b going esplanade fr home
5569,ham,pity mood soany suggestions,piti mood soani suggest,pity mood soany suggestion
5570,ham,guy bitching acted like interested buying some...,guy bitch act like interest buy someth els nex...,guy bitching acted like interested buying some...


Preprocessing: Bag of words

In [74]:
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

test_countvectorizer = CountVectorizer(ngram_range=(1,3)).fit(corpus)
test_countvectorizer.get_feature_names_out()

array(['and', 'and this', 'and this is', 'document', 'document is',
       'document is the', 'first', 'first document', 'is', 'is the',
       'is the first', 'is the second', 'is the third', 'is this',
       'is this the', 'one', 'second', 'second document', 'the',
       'the first', 'the first document', 'the second',
       'the second document', 'the third', 'the third one', 'third',
       'third one', 'this', 'this document', 'this document is',
       'this is', 'this is the', 'this the', 'this the first'],
      dtype=object)

In [75]:
hasil_countvectorizer = test_countvectorizer.transform(corpus)
hasil_countvectorizer.toarray()

array([[0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 2, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
        1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1]])

In [78]:
df_countvectorizer = pd.DataFrame(
    data=hasil_countvectorizer.toarray(),
    columns=test_countvectorizer.get_feature_names_out()
)
df_countvectorizer['input'] = corpus
df_countvectorizer

Unnamed: 0,and,and this,and this is,document,document is,document is the,first,first document,is,is the,...,third,third one,this,this document,this document is,this is,this is the,this the,this the first,input
0,0,0,0,1,0,0,1,1,1,1,...,0,0,1,0,0,1,1,0,0,This is the first document.
1,0,0,0,2,1,1,0,0,1,1,...,0,0,1,1,1,0,0,0,0,This document is the second document.
2,1,1,1,0,0,0,0,0,1,1,...,1,1,1,0,0,1,1,0,0,And this is the third one.
3,0,0,0,1,0,0,1,1,1,0,...,0,0,1,0,0,0,0,1,1,Is this the first document?
