# Preparing Data

In [1]:
import pandas as pd 
import numpy as np
tweet = pd.read_excel("tweet.xlsx")

tweet.head()

Unnamed: 0,Text
0,The Mandalika Circuit became a favorite of wor...
1,Mandalika Circuit Receives World Class Racer P...
2,Mandalika Circuit is a favorite of world -clas...
3,"The world -class motor racing event, MotoGP, h..."
4,Mandalika Circuit Receives World Class Racer P...


# Case Folding

In [2]:
# ------ Case Folding --------
# Using Series.str.lower() function on Pandas
tweet['Text Case Folding'] = tweet['Text'].str.lower()


print('Case Folding Result : \n')
print(tweet['Text Case Folding'].head(5))
print('\n\n\n')

Case Folding Result : 

0    the mandalika circuit became a favorite of wor...
1    mandalika circuit receives world class racer p...
2    mandalika circuit is a favorite of world -clas...
3    the world -class motor racing event, motogp, h...
4    mandalika circuit receives world class racer p...
Name: Text Case Folding, dtype: object






# Tokenizing

In [3]:
# ------ Tokenizing ---------

import string 
import re #regex library

# import word_tokenize
from nltk.tokenize import word_tokenize 

def remove_tweet_special(text):
    # remove tab, new line, ans back slice
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # remove non ASCII (emoticon, chinese word, .etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")
                
tweet['Text Case Folding'] = tweet['Text Case Folding'].apply(remove_tweet_special)

#remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

tweet['Text Case Folding'] = tweet['Text Case Folding'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

tweet['Text Case Folding'] = tweet['Text Case Folding'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

tweet['Text Case Folding'] = tweet['Text Case Folding'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

tweet['Text Case Folding'] = tweet['Text Case Folding'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

tweet['Text Case Folding'] = tweet['Text Case Folding'].apply(remove_singl_char)

# NLTK word rokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

tweet['Text Tokenizing'] = tweet['Text Case Folding'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(tweet['Text Tokenizing'].head())
print('\n\n\n')

Tokenizing Result : 

0    [the, mandalika, circuit, became, favorite, of...
1    [mandalika, circuit, receives, world, class, r...
2    [mandalika, circuit, is, favorite, of, world, ...
3    [the, world, class, motor, racing, event, moto...
4    [mandalika, circuit, receives, world, class, r...
Name: Text Tokenizing, dtype: object






# Normalization

In [4]:
# ------ Normalization ---------
normalizad_word = pd.read_excel("kata_baku.xlsx")

normalizad_word_dict = {}

for index, row in normalizad_word.iterrows():
    if row[0] not in normalizad_word_dict:
        normalizad_word_dict[row[0]] = row[1] 

def normalized_term(document):
    return [normalizad_word_dict[term] if term in normalizad_word_dict else term for term in document]

tweet['Text Normalization'] = tweet['Text Tokenizing'].apply(normalized_term)

tweet['Text Normalization'].head(10)

0    [the, mandalika, circuit, became, favorite, of...
1    [mandalika, circuit, receives, world, class, r...
2    [mandalika, circuit, is, favorite, of, world, ...
3    [the, world, class, motor, racing, event, moto...
4    [mandalika, circuit, receives, world, class, r...
5    [in, the, same, boat, marquez, in, mandalika, ...
6    [different, motogp, with, formula, the, motogp...
7    [mandalika, circuit, is, favorite, of, world, ...
8    [mandalika, circuit, is, favorite, racing, are...
9    [ewopang, ombabikinoten, is, clearly, cool, ma...
Name: Text Normalization, dtype: object

# Stemming

In [5]:
# ------ Stemming ---------
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
import swifter


# stemmed
def stemmed_wrapper(term):
    return ps.stem(term)

term_dict = {}

for document in tweet['Text Normalization']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

tweet['Text Stemming'] = tweet['Text Normalization'].swifter.apply(get_stemmed_term)
print(tweet['Text Stemming'])

21661
------------------------
the : the
mandalika : mandalika
circuit : circuit
became : becam
favorite : favorit
of : of
world : world
class : class
racers : racer
united : unit
with : with
criminal : crimin
investigation : investig
receives : receiv
racer : racer
praise : prais
is : is
motor : motor
racing : race
event : event
motogp : motogp
has : ha
been : been
successfully : success
held : held
at : at
after : after
this : thi
what : what
big : big
events : event
will : will
be : be
follow : follow
discussion : discuss
sapaindonesia : sapaindonesia
sweet : sweet
moment : moment
lag : lag
streamine : streamin
in : in
same : same
boat : boat
marquez : marquez
video : video
mick : mick
schumacher : schumach
accident : accid
jeddah : jeddah
training : train
session : session
different : differ
formula : formula
tau : tau
magnificent : magnific
have : have
long : long
for : for
time : time
but : but
if : if
it : it
complicated : complic
suspected : suspect
that : that
there : there
bu

pa : pa
strict : strict
boster : boster
mdhnjava : mdhnjava
indo : indo
bil : bil
arewamine : arewamin
kobkan : kobkan
repaired : repair
adequate : adequ
surprise : surpris
loris : lori
capirossi : capirossi
min : min
binta : binta
islands : island
riau : riau
haphazard : haphazard
ureijiollie : ureijiolli
comes : come
strikes : strike
bless : bless
itenjin : itenjin
zoom : zoom
aelakovalskia : aelakovalskia
yooo : yooo
hander : hander
ik : ik
requirement : requir
lebaran : lebaran
accomplished : accomplish
rionaji : rionaji
rmtroysuryyo : rmtroysuryyo
tries : tri
cook : cook
repeatedly : repeatedli
disposable : dispos
mandalikandalika : mandalikandalika
risk : risk
exists : exist
loss : loss
post : post
footage : footag
exactly : exactli
focus : focu
jawsumator : jawsum
sycophant : sycoph
astra : astra
bigbike : bigbik
mystery : mysteri
simplicity : simplic
promotional : promot
opinion : opinion
personally : person
aya : aya
oekarthho : oekarthho
ifsembiring : ifsembir
rmtrysuryyo : r

clothes : cloth
hats : hat
sporadically : sporad
adorn : adorn
inside : insid
inealaine : inealain
google : googl
understood : understood
ambalangam : ambalangam
rt : rt
furniture : furnitur
kemenpora : kemenpora
vr : vr
identical : ident
izieqdivist : izieqdivist
heard : heard
kls : kl
moyo : moyo
aziz : aziz
yanuar : yanuar
wulandaris : wulandari
pawanghujanmbakrara : pawanghujanmbakrara
dik : dik
klo : klo
widely : wide
season : season
nitip : nitip
enlightenment : enlighten
dana : dana
protest : protest
bolsters : bolster
cpt : cpt
tuwir : tuwir
piaggio : piaggio
sr : sr
scooter : scooter
undergoing : undergo
fp : fp
irsangherman : irsangherman
wisata : wisata
suryana : suryana
loud : loud
dennysuryana : dennysuryana
okezone : okezon
angry : angri
thor : thor
smitten : smitten
aklejauh : aklejauh
rush : rush
worry : worri
stalls : stall
biscuits : biscuit
transport : transport
insarnixo : insarnixo
saying : say
masinmasing : masinmas
forbidden : forbidden
acts : act
discovered : di

owes : owe
limit : limit
idcorner : idcorn
exglenniza : exglenniza
dimandalika : dimandalika
tuha : tuha
ediaindonesia : ediaindonesia
hujango : hujango
lensakabinet : lensakabinet
keeeeenn : keeeeenn
include : includ
ciputat : ciputat
prefers : prefer
iawec : iawec
upergjp : upergjp
bnpb : bnpb
consistent : consist
packages : packag
pikap : pikap
helping : help
correct : correct
paaaak : paaaak
accommodated : accommod
san : san
puanmaharani : puanmaharani
bandits : bandit
blunders : blunder
basically : basic
resistant : resist
collisions : collis
hotmix : hotmix
splendor : splendor
selection : select
dreams : dream
eaten : eaten
underestimated : underestim
emos : emo
iconic : icon
sngaja : sngaja
bt : bt
pstants : pstant
detertice : detertic
hers : her
descendant : descend
nim : nim
siis : sii
mosques : mosqu
fostered : foster
uckhss : uckhss
indonesiahebat : indonesiahebat
resounded : resound
achievers : achiev
lensaberita : lensaberita
dagged : dag
pecco : pecco
vinales : vinal
dete

impatient : impati
buks : buk
opa : opa
ops : op
panti : panti
idodogroho : idodogroho
idhwaskito : idhwaskito
safely : safe
explode : explod
ivogaraldus : ivogaraldu
jamin : jamin
eboralaksmii : eboralaksmii
winds : wind
facilitating : facilit
gabatan : gabatan
gasabar : gasabar
gasa : gasa
drakor : drakor
consulting : consult
gameindozone : gameindozon
riburibut : riburibut
menang : menang
lesbi : lesbi
dangdut : dangdut
tarp : tarp
dragged : drag
sphere : sphere
batbatu : batbatu
kecil : kecil
custom : custom
rasa : rasa
riemrn : riemrn
ripras : ripra
plotted : plot
hampered : hamper
showcase : showcas
gokss : gokss
ondteaaasia : ondteaaasia
motivate : motiv
fanzone : fanzon
bocahkomentato : bocahkomentato
gedubrak : gedubrak
focusing : focus
hamaludinr : hamaludinr
cruel : cruel
differentiate : differenti
tmii : tmii
gifts : gift
prime : prime
datra : datra
datrainsusaa : datrainsusaa
bpks : bpk
pickpocketing : pickpocket
standing : stand
approached : approach
marshalls : marshal
v

damramriindonesia : damramriindonesia
jokowigaspolinfrastructure : jokowigaspolinfrastructur
priandi : priandi
satri : satri
wanda : wanda
indki : indki
satan : satan
alas : ala
offend : offend
atnaintankurn : atnaintankurn
assumptions : assumpt
kompolnas : kompolna
motoran : motoran
parisfashionweek : parisfashionweek
geprekbensu : geprekbensu
msglowtakeoverparis : msglowtakeoverpari
hssme : hssme
ibkadel : ibkadel
vogue : vogu
nights : night
deletes : delet
sequence : sequenc
beemly : beemli
staring : stare
vonedarm : vonedarm
consistently : consist
kasiha : kasiha
deleted : delet
pingingen : pingingen
odjohan : odjohan
yogya : yogya
ambyarrr : ambyarrr
ludes : lude
sewot : sewot
telengleng : telengleng
atuhaticoffee : atuhaticoffe
praldo : praldo
fathun : fathun
estyponi : estyponi
faubing : faub
isniscom : isniscom
jayadi : jayadi
irkuit : irkuit
rumps : rump
unravel : unravel
aseeek : aseeek
anthropfest : anthropfest
odblessrocks : odblessrock
iskorielekta : iskorielekta
omleo : o

keretan : keretan
ngeeng : ngeeng
ghosh : ghosh
bumping : bump
trabas : traba
smpe : smpe
floated : float
ciba : ciba
shuck : shuck
sodial : sodial
hollywood : hollywood
mandorcantik : mandorcantik
bintanghollywood : bintanghollywood
ruoff : ruoff
itrimawagusony : itrimawagusoni
ivalditirta : ivalditirta
enthouzzz : enthouzzz
bagnania : bagnania
banyk : banyk
komplain : komplain
hil : hil
booty : booti
languages : languag
apuk : apuk
uayabanan : uayabanan
umbrellaa : umbrellaa
pelecing : pelec
marquezpernahmakandisini : marquezpernahmakandisini
mgtmtmu : mgtmtmu
ngresiki : ngresiki
ryfirdaus : ryfirdau
blunder : blunder
lvafila : lvafila
mpang : mpang
atzejammer : atzejamm
sebagai : sebagai
saya : saya
malu : malu
dicemooh : dicemooh
dan : dan
apalagi : apalagi
patung : patung
mau : mau
dipajang : dipajang
ampun : ampun
itriiyanr : itriiyanr
physically : physic
sengs : seng
sak : sak
kudune : kudun
threaten : threaten
siloak : siloak
loak : loak
akka : akka
flea : flea
goin : goin
pret

wearpack : wearpack
rudi : rudi
incised : incis
daka : daka
ralion : ralion
utej : utej
sipalingmotogp : sipalingmotogp
gpam : gpam
bombastic : bombast
dalem : dalem
pertamini : pertamini
tambel : tambel
jepara : jepara
twitterjepara : twitterjepara
engaransajak : engaransajak
pracinid : pracinid
videosir : videosir
fingers : finger
fairness : fair
riview : riview
influencers : influenc
jalal : jalal
jimbonk : jimbonk
gfyfpgfcwub : gfyfpgfcwub
particles : particl
binde : bind
seasson : seasson
baeeek : baeeek
komayaupdate : komayaupd
tracer : tracer
ayas : aya
resonancers : resonanc
pulkam : pulkam
migrant : migrant
grouped : group
janitors : janitor
ketemba : ketemba
scorching : scorch
emefess : emefess
motogpadatitiket : motogpadatitiket
nitial : nitial
triputr : triputr
kamseltibcarlantas : kamseltibcarlanta
ranmor : ranmor
hahahhaha : hahahhaha
aso : aso
dega : dega
uketi : uketi
ahdlatululama : ahdlatululama
iyamaufamin : iyamaufamin
otajajarut : otajajarut
flows : flow
mandaltest

Pandas Apply:   0%|          | 0/21687 [00:00<?, ?it/s]

0        [the, mandalika, circuit, becam, favorit, of, ...
1        [mandalika, circuit, receiv, world, class, rac...
2        [mandalika, circuit, is, favorit, of, world, c...
3        [the, world, class, motor, race, event, motogp...
4        [mandalika, circuit, receiv, world, class, rac...
                               ...                        
21682    [the, mandalika, circuit, will, host, the, gt,...
21683    [the, mandalika, circuit, will, hold, the, gt,...
21684    [he, use, the, same, motorbik, while, tri, out...
21685    [the, statu, of, pak, de, will, be, instal, at...
21686    [acceler, the, construct, of, the, mandalika, ...
Name: Text Stemming, Length: 21687, dtype: object


# Filtering

In [6]:
# ------ Filtering ---------
from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# get stopword
list_stopwords = stopwords.words('english')


# ---------------------------- manualy add stopword  ------------------------------------
# append additional stopword
list_stopwords.extend(['the', 'Aspiyuaja', 'ewopang', 'ombabikinoten',  'wkwkwk', 'jgdqcxnjo', 'rtqwsxznu', 
                       'pembslap','mnjdi','rt','&amp',
                       'allah', 'brb', 'btw', 'cod', 'cmiiw', 'fyi',
                       'gg', 'ggwp', 'idk', 'ikr', 'lol', 'ootd', 'lmao', 'oot',
                       'pap', 'otw', 'tfl', 'vc', 'ygy'])

# ----------------------- add stopword from txt file ------------------------------------
# read txt stopword using pandas
txt_stopword = pd.read_csv("stopword inggris.txt", names= ["stopwords"], header = None)

# convert stopword string to list & append additional stopword
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# ---------------------------------------------------------------------------------------

# convert list to dictionary
list_stopwords = set(list_stopwords)


#remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

tweet['Text Filtering'] = tweet['Text Stemming'].apply(stopwords_removal) 


print(tweet['Text Filtering'].head())

0    [mandalika, circuit, becam, favorit, world, cl...
1    [mandalika, circuit, receiv, world, class, rac...
2    [mandalika, circuit, favorit, world, class, ra...
3    [world, class, motor, race, event, motogp, ha,...
4    [mandalika, circuit, receiv, world, class, rac...
Name: Text Filtering, dtype: object


# View Results

In [7]:
tweet.head()

Unnamed: 0,Text,Text Case Folding,Text Tokenizing,Text Normalization,Text Stemming,Text Filtering
0,The Mandalika Circuit became a favorite of wor...,the mandalika circuit became favorite of worl...,"[the, mandalika, circuit, became, favorite, of...","[the, mandalika, circuit, became, favorite, of...","[the, mandalika, circuit, becam, favorit, of, ...","[mandalika, circuit, becam, favorit, world, cl..."
1,Mandalika Circuit Receives World Class Racer P...,mandalika circuit receives world class racer p...,"[mandalika, circuit, receives, world, class, r...","[mandalika, circuit, receives, world, class, r...","[mandalika, circuit, receiv, world, class, rac...","[mandalika, circuit, receiv, world, class, rac..."
2,Mandalika Circuit is a favorite of world -clas...,mandalika circuit is favorite of world class ...,"[mandalika, circuit, is, favorite, of, world, ...","[mandalika, circuit, is, favorite, of, world, ...","[mandalika, circuit, is, favorit, of, world, c...","[mandalika, circuit, favorit, world, class, ra..."
3,"The world -class motor racing event, MotoGP, h...",the world class motor racing event motogp has ...,"[the, world, class, motor, racing, event, moto...","[the, world, class, motor, racing, event, moto...","[the, world, class, motor, race, event, motogp...","[world, class, motor, race, event, motogp, ha,..."
4,Mandalika Circuit Receives World Class Racer P...,mandalika circuit receives world class racer p...,"[mandalika, circuit, receives, world, class, r...","[mandalika, circuit, receives, world, class, r...","[mandalika, circuit, receiv, world, class, rac...","[mandalika, circuit, receiv, world, class, rac..."


# Save Results

In [8]:
tweet.to_csv("preprocessing results", index=False)