<a href="https://colab.research.google.com/github/allexanderprastya/PROJECT-TA-JURNAL/blob/main/pre_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Import Library

In [1]:
!pip install swifter
!pip install Sastrawi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import swifter
import regex as re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

###Data Cleansing

In [3]:
df = pd.read_csv('/content/review_15k_label.csv')

In [4]:
def word_tokenize_wrapper(text):
    return word_tokenize(text)

df['content'] = df['content'].swifter.apply(word_tokenize_wrapper)

Pandas Apply:   0%|          | 0/15000 [00:00<?, ?it/s]

In [5]:
df

Unnamed: 0,content,sentiment
0,"[Bangke, gw, turun, nyangkut, kenok, langsung,...",positive
1,"[Manga, pa, pubg, selalu, loding, ketika, logi...",negative
2,"[Bag, nya, dong, developer, perbaiki, semua, n...",positive
3,"[Emang, berat, putaran, kenapa, gak, seperti, ...",neutral
4,"[P, :, v]",negative
...,...,...
14995,"[Game, ini, sangat, burik, dan, sangat, hd, de...",positive
14996,[Lemot],neutral
14997,"[Bagus, banget]",positive
14998,"[Game, seru, ,, keren, ,, hd, ;, tapi, banyak,...",neutral


In [6]:
def cleansing(text):
    # Mengubah tipe data menjadi string
    text = str(text)
    #hapus non ASCII (hapus emoticon, chinese word, dll)
    text = text.encode("ascii", "replace").decode("ascii")    
    #menghapus number
    text = re.sub(r"\d+", " ", text) 
    # Menghapus Link Dengan Pattern http/https dan www
    text = re.sub(r"http\S+", " ", text)
    text = re.sub("(@\w+|#\w+)", " ", text)
    # Menghapus Tag HTML
    text = re.sub("<.*?>", " ", text)
    # Menghapus Tanda Baca Seperti Titik Dan Koma
    text = text.translate(str.maketrans(" ", " ", string.punctuation))
    # Menghapus Karakter Selain Huruf a-z dan A-Z
    text = re.sub("[^a-zA-Z]", " ", text)
    # Mengganti baris baru (enter) dengan spasi
    text = text.replace("\t"," ").replace("\n"," ").replace("\\"," ")
    # Menghapus Karakter Berulang (Contoh: Horeeee!!!! menjadi Hore!)
    text = re.sub(r"(\w)(\1{2,})", r"\1 ", text)
    text = re.sub(r"\b[a-zA-Z]\b"," ",text)
    text = text.strip()
    # Menghapus Spasi Yang Lebih Dari Satu
    text = re.sub("\s+", " ",text)
    # Mengubah setiap kata menjadi lowercase
    return text.lower()

In [7]:
df["content"] = df["content"].swifter.apply(cleansing)

Pandas Apply:   0%|          | 0/15000 [00:00<?, ?it/s]

In [8]:
df

Unnamed: 0,content,sentiment
0,bangke gw turun nyangkut kenok langsung tolong...,positive
1,manga pa pubg selalu loding ketika login padah...,negative
2,bag nya dong developer perbaiki semua ngeluh k...,positive
3,emang berat putaran kenapa gak seperti biasany...,neutral
4,,negative
...,...,...
14995,game ini sangat burik dan sangat hd dengan keb...,positive
14996,lemot,neutral
14997,bagus banget,positive
14998,game seru keren hd tapi banyak bug pemakaian m...,neutral


###Slang Words

In [9]:
normalized_word = pd.read_csv("/content/new_kamus_alay_fix.csv", sep=";", header=None)

In [10]:
normalized_word

Unnamed: 0,0,1
0,aaau,
1,aadfdd,
2,aaj,
3,aal,
4,aapa,apa
...,...,...
7269,zombi,zombie
7270,zone,zona
7271,zonk,
7272,zz,


In [11]:
def word_tokenize_wrapper(text):
    return word_tokenize(text)

df['content'] = df['content'].swifter.apply(word_tokenize_wrapper)

Pandas Apply:   0%|          | 0/15000 [00:00<?, ?it/s]

In [12]:
normalized_word_dict = {}

for index, row in normalized_word.iterrows():
    if row[0] not in normalized_word_dict:
        normalized_word_dict[row[0]] = row[1] 

def normalized_term(document):
    return [normalized_word_dict[term] if term in normalized_word_dict else term for term in document]

In [13]:
df['content'] = df['content'].swifter.apply(normalized_term) 

Pandas Apply:   0%|          | 0/15000 [00:00<?, ?it/s]

In [14]:
df['content'] = [' '.join(map(str, l)) for l in df['content']]

In [15]:
df

Unnamed: 0,content,sentiment
0,bangkai aku turun terjebak knock langsung tolo...,positive
1,nan nan pubg selalu memuat ketika login padaha...,negative
2,bug nan dong pengembang perbaiki semua mengelu...,positive
3,memang berat putaran kenapa tidak seperti bias...,neutral
4,,negative
...,...,...
14995,game ini sangat grafis buruk dan sangat high d...,positive
14996,lambat,neutral
14997,bagus banget,positive
14998,game seru keren high definition tapi banyak bu...,neutral


###Stopwords Removal

In [16]:
stopwords_id = set(stopwords.words('indonesian'))
stopwords_en = set(stopwords.words('english'))
stopwords_list = {"nan"}

In [17]:
def stopwords_removal(text):
    text = word_tokenize(text)
    text = [word for word in text if word not in stopwords_id]
    return text

df['content'] = df['content'].swifter.apply(stopwords_removal)

Pandas Apply:   0%|          | 0/15000 [00:00<?, ?it/s]

In [18]:
def stopwords_removal_2(text):
    text = [word for word in text if word not in stopwords_en]
    return text

df['content'] = df['content'].swifter.apply(stopwords_removal_2)

Pandas Apply:   0%|          | 0/15000 [00:00<?, ?it/s]

In [19]:
def stopwords_removal_3(text):
    text = [word for word in text if word not in stopwords_list]
    return text

df['content'] = df['content'].swifter.apply(stopwords_removal_3)

Pandas Apply:   0%|          | 0/15000 [00:00<?, ?it/s]

In [20]:
df

Unnamed: 0,content,sentiment
0,"[bangkai, turun, terjebak, knock, langsung, to...",positive
1,"[pubg, memuat, login, kuota, beli]",negative
2,"[bug, pengembang, perbaiki, mengeluh, bug, lag]",positive
3,"[berat, putaran, enak, mainnya, bagus]",neutral
4,[],negative
...,...,...
14995,"[game, grafis, buruk, high, definition, grafis...",positive
14996,[lambat],neutral
14997,"[bagus, banget]",positive
14998,"[game, seru, keren, high, definition, bug, pem...",neutral


###Stemming

In [21]:
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# create stemmer
def stemmed_wrapper(term):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(term)

term_dict = {}

for document in df['content']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

df['content'] = df['content'].swifter.apply(get_stemmed_term)

3708
------------------------
bangkai : bangkai
turun : turun
terjebak : jebak
knock : knock
langsung : langsung
tolong : tolong
tencent : tencent
salah : salah
pubg : pubg
memuat : muat
login : login
kuota : kuota
beli : beli
bug : bug
pengembang : kembang
perbaiki : baik
mengeluh : keluh
lag : lag
berat : berat
putaran : putar
enak : enak
mainnya : main
bagus : bagus
game : game
sayang : sayang
sinyal : sinyal
play : play
sulit : sulit
main : main
mati : mati
grafis : grafis
terputus : putus
putus : putus
mengunduh : unduh
peta : peta
perbarui : baru
kouta : kouta
boros : boros
lambat : lambat
aneh : aneh
kesadaran : sadar
memperbaiki : baik
kalah : kalah
samping : samping
ya : ya
versi : versi
habis : habis
buka : buka
suruh : suruh
puas : puas
tahan : tahan
perangkat : perangkat
pisau : pisau
karambit : karambit
perbaruinya : baru
kecewa : kecewa
kasih : kasih
bintang : bintang
terjun : terjun
mendarat : darat
memegang : pegang
tali : tali
payung : payung
tanah : tanah
senjata : se

Pandas Apply:   0%|          | 0/15000 [00:00<?, ?it/s]

In [22]:
def toSentence(text): # Convert list of words into sentence
    text = ' '.join(word for word in text)
    return text 

In [23]:
df['content'] = df['content'].swifter.apply(toSentence)

Pandas Apply:   0%|          | 0/15000 [00:00<?, ?it/s]

In [24]:
df

Unnamed: 0,content,sentiment
0,bangkai turun jebak knock langsung tolong tenc...,positive
1,pubg muat login kuota beli,negative
2,bug kembang baik keluh bug lag,positive
3,berat putar enak main bagus,neutral
4,,negative
...,...,...
14995,game grafis buruk high definition grafis buruk...,positive
14996,lambat,neutral
14997,bagus banget,positive
14998,game seru keren high definition bug pakai memo...,neutral


###Save Csv 

In [25]:
df.to_csv('review_15k_fix.csv',  encoding='utf-8', index=None, header=True)

In [27]:
df = pd.read_csv('/content/review_15k_fix.csv')

In [28]:
df

Unnamed: 0,content,sentiment
0,bangkai turun jebak knock langsung tolong tenc...,positive
1,pubg muat login kuota beli,negative
2,bug kembang baik keluh bug lag,positive
3,berat putar enak main bagus,neutral
4,,negative
...,...,...
14995,game grafis buruk high definition grafis buruk...,positive
14996,lambat,neutral
14997,bagus banget,positive
14998,game seru keren high definition bug pakai memo...,neutral


In [31]:
df.reset_index(inplace=True)

In [32]:
df.isna().sum() 

index          0
content      857
sentiment      0
dtype: int64

In [33]:
df.drop('index',axis=1,inplace=True)

In [34]:
df2 = df.dropna()
df2.isna().sum()

content      0
sentiment    0
dtype: int64

In [35]:
df2

Unnamed: 0,content,sentiment
0,bangkai turun jebak knock langsung tolong tenc...,positive
1,pubg muat login kuota beli,negative
2,bug kembang baik keluh bug lag,positive
3,berat putar enak main bagus,neutral
5,game bagus sayang sinyal turun game play sulit...,positive
...,...,...
14995,game grafis buruk high definition grafis buruk...,positive
14996,lambat,neutral
14997,bagus banget,positive
14998,game seru keren high definition bug pakai memo...,neutral


In [36]:
df2.to_csv('review_15k_stem_lematiz.csv', encoding='utf-8', index=None, header=True)