In [1]:
import numpy as np 
import pandas as pd 
import os
import re
import string
import numpy as np 
import random
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from tqdm import tqdm
import os
import nltk
import spacy
import random

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [4]:
df = pd.read_csv("spam.csv", encoding="latin-1")

df = df.dropna(how="any", axis=1)
df.columns = ['target', 'message']

df.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df['message_len'] = df['message'].apply(lambda x: len(x.split(' ')))
df.head()

Unnamed: 0,target,message,message_len
0,ham,"Go until jurong point, crazy.. Available only ...",20
1,ham,Ok lar... Joking wif u oni...,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28
3,ham,U dun say so early hor... U c already then say...,11
4,ham,"Nah I don't think he goes to usf, he lives aro...",13


In [6]:
df['message_clean'] = df['message'].apply(clean_text)
df.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entry in a wkly comp to win fa cup final...
3,ham,U dun say so early hor... U c already then say...,11,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah i dont think he goes to usf he lives aroun...


In [7]:
stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords

def remove_stopwords(text):
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    return text
    
df['message_clean'] = df['message_clean'].apply(remove_stopwords)
df.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entry wkly comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,11,dun say early hor already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goes usf lives around though


In [8]:
stemmer = nltk.SnowballStemmer("english")

def stemm_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

In [9]:
df['message_clean'] = df['message_clean'].apply(stemm_text)
df.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entri wkli comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,11,dun say earli hor alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goe usf live around though


In [10]:
def preprocess_data(text):
    # Clean puntuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    # Stemm all the words in the sentence
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    
    return text

In [11]:
df['message_clean'] = df['message_clean'].apply(preprocess_data)
df.head()

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entri wkli comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,11,dun say ear hor alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goe usf live around though


In [12]:
df.head(-5)

Unnamed: 0,target,message,message_len,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",20,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,6,ok lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entri wkli comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,11,dun say ear hor alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah dont think goe usf live around though
...,...,...,...,...
5562,ham,Ok lor... Sony ericsson salesman... I ask shuh...,18,ok lor soni ericsson salesman ask shuhui say q...
5563,ham,Ard 6 like dat lor.,5,ard like dat lor
5564,ham,Why don't you wait 'til at least wednesday to ...,15,dont wait til least wednesday see get
5565,ham,Huh y lei...,3,huh lei


In [13]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Document Expansion

In [14]:
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
def wordnet_doc_expansion(doc,k=3):
    ps = PorterStemmer()
    upd_doc = doc.split()
    synonyms =[]
    res=[w for w in upd_doc]
    for q in upd_doc:
        q_stem=ps.stem(q)
        for syn in wordnet.synsets(q_stem):
            for l in syn.lemmas():
                synonyms.append(l.name())
        synonyms=list(set(synonyms))
        synonyms=synonyms[:k]
        for w in synonyms:
            w_stem=ps.stem(w)
            if  w_stem!=q_stem:
                  res.append(w)
            synonyms=[]
    return ' '.join(res)

In [15]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [16]:
i = 'mean fat head'
upd_doca = ['mean', 'fat', 'head']
resa =[w for w in upd_doca]
resa

['mean', 'fat', 'head']

In [17]:
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
ps = PorterStemmer()

ps.stem('meaning')

'mean'

In [18]:
documents = df['message_clean']
docs_expansion = []
for i in documents:
  docs_expansion.append(wordnet_doc_expansion(i,1))
# docs_expansion = wordnet_doc_expansion(documents,2)

In [19]:
docs_expansion

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat crack direct help northward with_child earthly_concern Pelican_State east sideboard grow Cupid',
 'ok lar joke wif oni all_right laugh Office_of_Naval_Intelligence',
 'free entri wkli comp win fa cup final tkts may text fa receiv entri questionstd txt ratetc appli free_people advance transfuse terminal',
 'dun say ear hor alreadi say grayish_brown pronounce auricle pronounce',
 'nah dont think goe usf live around though imagine endure close_to',
 'freemsg hey darl week word back id like fun still tb ok xxx std chgs send å£ rcv workweek binding same merriment inactive T.B. all_right 30 social_disease direct',
 'even brother like speak treat like aid patent fifty-fifty comrade same verbalize goody same help manifest',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun bespeak go_under telephoner weigh supporter',
 'winner valu network custom select receiv

In [20]:
#Word Embeddings: GloVe

In [21]:
texts = docs_expansion

# GLOVE

In [22]:
!pip install Tokenizer  

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Tokenizer
  Downloading tokenizer-3.4.1-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 3.3 MB/s 
[?25hInstalling collected packages: Tokenizer
Successfully installed Tokenizer-3.4.1


In [23]:
# Calculate the length of our vocabulary
from keras.preprocessing.text import Tokenizer
word_tokenizer = Tokenizer()

In [24]:
word_tokenizer.fit_on_texts(texts)

vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

7998

In [25]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [26]:
import tensorflow as tf
def embed(corpus): 
    return word_tokenizer.texts_to_sequences(corpus)

In [27]:
longest_train = max(texts, key=lambda sentence: len(word_tokenize(sentence)))
length_long_sentence = len(word_tokenize(longest_train))

train_padded_sentences = tf.keras.preprocessing.sequence.pad_sequences(
    embed(texts), 
    length_long_sentence, 
    padding='post'
)

X = train_padded_sentences

In [28]:
print(X)

[[   4 4202  496 ...    0    0    0]
 [  23  444  869 ...    0    0    0]
 [   8  624  944 ...    0    0    0]
 ...
 [7996 1531 7997 ...    0    0    0]
 [ 283 1867  119 ...    0    0    0]
 [2802  636  335 ...    0    0    0]]


# TFIDF
Menghitung TF IDF untuk setiap dokumen dan hasilnya digunakan untuk clustering dokumen

In [29]:
import pandas as pd
import numpy as np

In [30]:
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(X)
print(tfidf.shape )                        

(5572, 114)
