In [1]:
import pandas as pd 
import numpy as np

TWEET_DATA = pd.read_csv("Result_Judul_Pustaka_Text_Preprocessing.csv")

TWEET_DATA.head()

Unnamed: 0,no,judul
0,3,usaha kerajinan gerabah untuk meningkatakan pe...
1,3,peranan industri gerabah keramik sebagai suatu...
2,3,industri kerajinan gerabah dan peranannya seba...
3,3,peranan industri kecil warangka keris dalam me...
4,3,bimbingan agama islam terhadap remaja dalam ke...


In [2]:
# ------ Case Folding --------
# gunakan fungsi Series.str.lower() pada Pandas
TWEET_DATA['judul'] = TWEET_DATA['judul'].str.lower()


print('Case Folding Result : \n')
print(TWEET_DATA['judul'].head(5))
print('\n\n\n')

Case Folding Result : 

0    usaha kerajinan gerabah untuk meningkatakan pe...
1    peranan industri gerabah keramik sebagai suatu...
2    industri kerajinan gerabah dan peranannya seba...
3    peranan industri kecil warangka keris dalam me...
4    bimbingan agama islam terhadap remaja dalam ke...
Name: judul, dtype: object






In [3]:
import string 
import re #regex library

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

# ------ Tokenizing ---------

def remove_tweet_special(text):
    # remove tab, new line, ans back slice
    text = str(text).replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # remove non ASCII (emoticon, chinese word, .etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")
                
TWEET_DATA['judul'] = TWEET_DATA['judul'].apply(remove_tweet_special)

#remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

TWEET_DATA['judul'] = TWEET_DATA['judul'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

TWEET_DATA['judul'] = TWEET_DATA['judul'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

TWEET_DATA['judul'] = TWEET_DATA['judul'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

TWEET_DATA['judul'] = TWEET_DATA['judul'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

TWEET_DATA['judul'] = TWEET_DATA['judul'].apply(remove_singl_char)

# NLTK word rokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

TWEET_DATA['judul_tokens'] = TWEET_DATA['judul'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(TWEET_DATA['judul_tokens'].head())
print('\n\n\n')

Tokenizing Result : 

0    [usaha, kerajinan, gerabah, untuk, meningkatak...
1    [peranan, industri, gerabah, keramik, sebagai,...
2    [industri, kerajinan, gerabah, dan, peranannya...
3    [peranan, industri, kecil, warangka, keris, da...
4    [bimbingan, agama, islam, terhadap, remaja, da...
Name: judul_tokens, dtype: object






In [4]:
# NLTK calc frequency distribution
def freqDist_wrapper(text):
    return FreqDist(text)

TWEET_DATA['judul_tokens_fdist'] = TWEET_DATA['judul_tokens'].apply(freqDist_wrapper)

print('Frequency Tokens : \n') 
print(TWEET_DATA['judul_tokens_fdist'].head().apply(lambda x : x.most_common()))

Frequency Tokens : 

0    [(usaha, 1), (kerajinan, 1), (gerabah, 1), (un...
1    [(peranan, 1), (industri, 1), (gerabah, 1), (k...
2    [(industri, 1), (kerajinan, 1), (gerabah, 1), ...
3    [(peranan, 1), (industri, 1), (kecil, 1), (war...
4    [(bimbingan, 1), (agama, 1), (islam, 1), (terh...
Name: judul_tokens_fdist, dtype: object


In [9]:
from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# get stopword indonesia
list_stopwords = stopwords.words('indonesian')


# ---------------------------- manualy add stopword  ------------------------------------
# append additional stopword
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])

# ----------------------- add stopword from txt file ------------------------------------
# read txt stopword using pandas
txt_stopword = pd.read_csv("stopwords-id.csv", names= ["stopwords"], header = None)

# convert stopword string to list & append additional stopword
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# ---------------------------------------------------------------------------------------

# convert list to dictionary
list_stopwords = set(list_stopwords)


#remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

TWEET_DATA['judul_tokens_WSW'] = TWEET_DATA['judul_tokens'].apply(stopwords_removal) 


print(TWEET_DATA['judul_tokens_WSW'].head())

0    [usaha, kerajinan, gerabah, meningkatakan, pen...
1    [peranan, industri, gerabah, keramik, sisi, da...
2    [industri, kerajinan, gerabah, peranannya, sum...
3    [peranan, industri, warangka, keris, meningkat...
4    [bimbingan, agama, islam, remaja, keluarga, mu...
Name: judul_tokens_WSW, dtype: object


In [11]:
pip install swifter

Collecting swifter
  Downloading swifter-1.1.2.tar.gz (633 kB)
Collecting psutil>=5.6.6
  Downloading psutil-5.9.0-cp39-cp39-win_amd64.whl (245 kB)
Collecting dask[dataframe]>=2.10.0
  Downloading dask-2022.2.0-py3-none-any.whl (1.1 MB)
Collecting ipywidgets>=7.0.0Note: you may need to restart the kernel to use updated packages.
  Downloading ipywidgets-7.6.5-py2.py3-none-any.whl (121 kB)
Collecting cloudpickle>=0.2.2
  Downloading cloudpickle-2.0.0-py3-none-any.whl (25 kB)
Collecting fsspec>=0.6.0
  Downloading fsspec-2022.1.0-py3-none-any.whl (133 kB)
Collecting toolz>=0.8.2
  Downloading toolz-0.11.2-py3-none-any.whl (55 kB)
Collecting pyyaml>=5.3.1
  Downloading PyYAML-6.0-cp39-cp39-win_amd64.whl (151 kB)

Collecting partd>=0.3.10
  Downloading partd-1.2.0-py3-none-any.whl (19 kB)


You should consider upgrading via the 'C:\Users\ACER\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


Collecting jupyterlab-widgets>=1.0.0
  Downloading jupyterlab_widgets-1.0.2-py3-none-any.whl (243 kB)
Collecting widgetsnbextension~=3.5.0
  Downloading widgetsnbextension-3.5.2-py2.py3-none-any.whl (1.6 MB)
Collecting locket
  Downloading locket-0.2.1-py2.py3-none-any.whl (4.1 kB)
Building wheels for collected packages: swifter
  Building wheel for swifter (setup.py): started
  Building wheel for swifter (setup.py): finished with status 'done'
  Created wheel for swifter: filename=swifter-1.1.2-py3-none-any.whl size=13210 sha256=1dc6f1eddd99ce1af1c108ce9f2a212fdbde9144fe1608feff8b4201f6fe5aab
  Stored in directory: c:\users\acer\appdata\local\pip\cache\wheels\9a\0f\37\510c65f041d0aa1a38dc53c67d46c0dbee660816f0c9d1ad17
Successfully built swifter
Installing collected packages: toolz, locket, pyyaml, partd, fsspec, cloudpickle, widgetsnbextension, jupyterlab-widgets, dask, psutil, ipywidgets, swifter
Successfully installed cloudpickle-2.0.0 dask-2022.2.0 fsspec-2022.1.0 ipywidgets-7.6.5 

In [12]:
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

In [13]:
# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in TWEET_DATA['judul_tokens_WSW']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

TWEET_DATA['judul_tokens_stemmed'] = TWEET_DATA['judul_tokens_WSW'].swifter.apply(get_stemmed_term)
print(TWEET_DATA['judul_tokens_stemmed'])

25392
------------------------
usaha : usaha
kerajinan : rajin
gerabah : gerabah
meningkatakan : meningkatakan
pendapatan : dapat
masyarakat : masyarakat
desa : desa
kasongan : kasongan
kalurahan : kalurahan
bangunjiwo : bangunjiwo
kecamatan : camat
kasihan : kasihan
kabupaten : kabupaten
bantul : bantul
daerah : daerah
istimewa : istimewa
yogyakarta : yogyakarta
peranan : peran
industri : industri
keramik : keramik
sisi : sisi
daya : daya
tarik : tarik
pengembangan : kembang
pariwisata : pariwisata
diy : diy
peranannya : peran
sumber : sumber
rumah : rumah
tangga : tangga
warangka : warangka
keris : keris
meningkatkan : tingkat
bimbingan : bimbing
agama : agama
islam : islam
remaja : remaja
keluarga : keluarga
muslim : muslim
panajang : panajang
rejo : rejo
pundong : pundong
hubungan : hubung
lokasi : lokasi
penggembalaan : gembala
kejadian : jadi
infeksi : infeksi
cacing : cacing
sapi : sapi
siti : siti
mulyo : mulyo
piyungan : piyungan
pelaksanaan : laksana
ibadah : ibadah
buruh : b

Pandas Apply:   0%|          | 0/40136 [00:00<?, ?it/s]

0        [usaha, rajin, gerabah, meningkatakan, dapat, ...
1        [peran, industri, gerabah, keramik, sisi, daya...
2        [industri, rajin, gerabah, peran, sumber, dapa...
3        [peran, industri, warangka, keris, tingkat, da...
4        [bimbing, agama, islam, remaja, keluarga, musl...
                               ...                        
40131                               [angel, in, the, rain]
40132                                 [the, hunger, games]
40133                              [marrying, mr, perfect]
40134                        [bad, boy, for, little, girl]
40135    [tinjau, yuridis, laksana, program, jamin, seh...
Name: judul_tokens_stemmed, Length: 40136, dtype: object


In [14]:
TWEET_DATA.to_csv("Text_Preprocessing.csv")

In [15]:
TWEET_DATA.to_excel("Text_Preprocessing.xlsx")

In [None]:
# Source 
# https://yunusmuhammad007.medium.com/text-preprocessing-menggunakan-pandas-nltk-dan-sastrawi-untuk-large-dataset-5fb3c0a88571