In [9]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import nltk
import json
import pandas as pd
import re
import emoji
import contractions
import openpyxl
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from pprint import pprint

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\canti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load Dataset

In [10]:
# Read the crawled tweets, then keep only rows whose username is not 'pln_123'.
tweets_raw = pd.read_csv("craw_pln123_new.csv")
is_pln_account = tweets_raw['username'] == 'pln_123'
df = tweets_raw[~is_pln_account]
df

Unnamed: 0,date,username,tweet
0,2022-07-01 06:59:09,rahinahu461,"@pln_123 Mohon maaf, sy tidak mencantumkan per..."
1,2022-07-01 06:55:51,nandosbs_,@pln_123 Tindaklanjuti hanya dengan tulisan ka...
2,2022-07-01 06:54:10,BennyKeef,"@pln_123 Min listrik di rmh ada masalah, saya ..."
3,2022-07-01 06:36:42,aleendy_,@pln_123 halo min boleh cek DM?
5,2022-07-01 06:25:03,RudySigitPurwa1,@pln_123 Mudah mudahan 24 jam kedepan dan sete...
...,...,...,...
300,2022-06-30 14:02:04,UdinSykes,"@pln_123 daerah balekambang, kramat jati, jaka..."
304,2022-06-30 13:54:13,ruangkata3,@pln_123 2200 VA berarti masuknya ke R1 kan? D...
309,2022-06-30 13:46:30,Dwikodarmawan,"Assalamualaikum @pln_123 , kira kira sampe kap..."
310,2022-06-30 13:45:05,azeezy84,"@pln_123 listrik area Salam, Magelang mati dar..."


## Preprocessing

### Case Folding & Cleansing text

In [11]:
# Regular expressions used by the cleansing step. All patterns are raw strings
# so that backslashes reach the regex engine untouched (no invalid-escape warnings).
twitter_handle = r'@(\w+)'                                 # Twitter mentions (@username)
url_handle = r'http[^ ]+'                                  # anything starting with "http" up to the next space
combined_handle = r'|'.join((twitter_handle, url_handle))  # mention OR url as a single alternation
www_handle = r'(http|https)://\S+'                         # URLs that start with http:// or https://
punctuation_handle = r'\W+'                                # runs of non-word characters (punctuation, whitespace)
hastag_handle = r"#(\w+)"                                  # hashtags (#topic)
number_handle = r'[0-9]'                                   # a single ASCII digit

def case_folding(text):
    """Lower-case a tweet and strip everything that is not plain word text.

    Steps, in order: lower-casing; removal of @mentions, #hashtags, emoji and
    http(s) URLs; contraction expansion via ``contractions.fix``; replacement
    of punctuation with spaces; removal of digits; whitespace normalisation.

    Args:
        text (str): raw tweet text.

    Returns:
        str: the cleaned, lower-cased text with words separated by single
        spaces and no leading/trailing whitespace.
    """
    lowered_text = text.lower()
    no_mentions = re.sub(twitter_handle, '', lowered_text)
    no_hashtags = re.sub(hastag_handle, '', no_mentions)
    no_emoji = emoji.replace_emoji(no_hashtags)
    # www_handle matches URLs beginning with http:// or https://
    no_urls = re.sub(www_handle, '', no_emoji)
    # Contractions must be expanded while apostrophes are still in the text;
    # once punctuation is stripped ("can't" -> "can t") they can no longer be
    # recognised. Lower-case again in case the expansion introduces capitals.
    expanded = contractions.fix(no_urls).lower()
    no_punctuation = re.sub(punctuation_handle, ' ', expanded)
    no_digits = re.sub(number_handle, '', no_punctuation)
    # Removing digits leaves doubled/leading spaces behind; collapse them.
    return ' '.join(no_digits.split())


### Slang Word

In [12]:
# Load the slang lookup table: a JSON object stored in a UTF-8 text file.
# The context manager guarantees the file is closed even if parsing fails.
with open("combined_slang_words.txt", "r", encoding="utf-8") as slang_file:
    slang_words = json.load(slang_file)

def slang_word(text):
    """Replace slang tokens in ``text`` using the ``slang_words`` lookup.

    The text is tokenized with ``word_tokenize``; every token that is a key of
    ``slang_words`` is swapped for its mapped value and all other tokens are
    kept unchanged. The tokens are then joined back with single spaces.
    """
    normalised_tokens = [
        slang_words[token] if token in slang_words else token
        for token in word_tokenize(text)
    ]
    return ' '.join(normalised_tokens)

### Stopword

In [1]:
# Negation words are deliberately never treated as stop words.
_NEGATION_WORDS = frozenset(['tidak', 'jangan', 'enggak', 'belum', 'blm'])
_stop_word_cache = None


def _load_stop_words():
    """Return the Indonesian stop words minus the negation words, built once."""
    global _stop_word_cache
    if _stop_word_cache is None:
        # A (frozen)set gives O(1) membership tests instead of scanning a list.
        _stop_word_cache = frozenset(stopwords.words('indonesian')) - _NEGATION_WORDS
    return _stop_word_cache


def stopword(text):
    """Remove Indonesian stop words from ``text`` while keeping negations.

    Args:
        text (str): text to filter; it is tokenized with ``word_tokenize``.

    Returns:
        str: the remaining tokens joined by single spaces.
    """
    stop_words = _load_stop_words()
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return ' '.join(kept_tokens)

# Quick manual check: the sample sentence contains the negation words "tidak"
# and "jangan", which stopword() is written to keep.
print(stopword("yang di rumah yang tidak jangan disitu"))

NameError: name 'stopwords' is not defined

### Stemming

In [24]:
# Build one stemmer from the Sastrawi factory up front; the module-level
# `stemmer` is the instance used by stemming() below.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming(text):
    """Return ``text`` after running it through the shared ``stemmer``'s ``stem`` method."""
    return stemmer.stem(text)

### Tokenize

In [25]:
def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = word_tokenize(text)
    return tokens

## Apply Preprocessing to Data

In [26]:
# Run every tweet through the whole pipeline and keep the output of each
# stage, so the effect of every step can be inspected side by side.
result_list = []

for tweet in df['tweet']:
    case_folding_text = case_folding(tweet)
    slang_word_text = slang_word(case_folding_text)
    stopword_text = stopword(slang_word_text)
    stemmed_text = stemming(stopword_text)
    tokenize_text = tokenize(stemmed_text)

    result_list.append([case_folding_text,
                        slang_word_text,
                        stopword_text,
                        stemmed_text,
                        tokenize_text])

# Turn the per-tweet rows into one list per stage (position in the row == stage).
case_folding_texts = [row[0] for row in result_list]
slang_word_texts = [row[1] for row in result_list]
stopword_texts = [row[2] for row in result_list]
stemmed_texts = [row[3] for row in result_list]
tokenize_texts = [row[4] for row in result_list]

# The two Series keep df's index; the stage lists are in the same row order.
table_data = pd.DataFrame({
    'username' : df['username'],
    'tweet' : df['tweet'],
    'Case Folding': case_folding_texts,
    'Slang Word': slang_word_texts,
    'Stopword': stopword_texts,
    'Stemmed Text': stemmed_texts,
    'Tokenize Text': tokenize_texts
})

# Persist the result in both formats (index omitted from the files).
table_data.to_csv('preprocessing-final.csv',index=False)
table_data.to_excel('preprocessing-final.xlsx',index=False)

table_data

Unnamed: 0,username,tweet,Case Folding,Slang Word,Stopword,Stemmed Text,Tokenize Text
0,rahinahu461,"@pln_123 Mohon maaf, sy tidak mencantumkan per...",mohon maaf sy tidak mencantumkan permasalahan...,mohon maaf saya tidak mencantumkan permasalaha...,mohon maaf tidak mencantumkan permasalahan tuj...,mohon maaf tidak cantum masalah tuju lapor dug...,"[mohon, maaf, tidak, cantum, masalah, tuju, la..."
1,nandosbs_,@pln_123 Tindaklanjuti hanya dengan tulisan ka...,tindaklanjuti hanya dengan tulisan kan bukan ...,tindaklanjuti hanya dengan tulisan kan bukan d...,tindaklanjuti tulisan aksi perubahan komplain ...,tindaklanjuti tulis aksi ubah komplain jam dir...,"[tindaklanjuti, tulis, aksi, ubah, komplain, j..."
2,BennyKeef,"@pln_123 Min listrik di rmh ada masalah, saya ...",min listrik di rmh ada masalah saya coba hubu...,admin listrik di rumah ada masalah saya coba h...,admin listrik rumah coba hubungi pln terdekat ...,admin listrik rumah coba hubung pln dekat tida...,"[admin, listrik, rumah, coba, hubung, pln, dek..."
3,aleendy_,@pln_123 halo min boleh cek DM?,halo min boleh cek dm,halo admin boleh cek dm,halo admin cek dm,halo admin cek dm,"[halo, admin, cek, dm]"
5,RudySigitPurwa1,@pln_123 Mudah mudahan 24 jam kedepan dan sete...,mudah mudahan jam kedepan dan seterusnya ga ...,mudah mudahan jam kedepan dan seterusnya tidak...,mudah mudahan jam kedepan tidak pemadaman,mudah mudah jam depan tidak madam,"[mudah, mudah, jam, depan, tidak, madam]"
...,...,...,...,...,...,...,...
300,UdinSykes,"@pln_123 daerah balekambang, kramat jati, jaka...",daerah balekambang kramat jati jakarta timur ...,daerah balekambang kramat jati jakarta timur s...,daerah balekambang kramat jati jakarta timur p...,daerah balekambang kramat jati jakarta timur p...,"[daerah, balekambang, kramat, jati, jakarta, t..."
304,ruangkata3,@pln_123 2200 VA berarti masuknya ke R1 kan? D...,va berarti masuknya ke r kan daya saya va d...,va berarti masuknya ke r kan daya saya va dan ...,va masuknya r daya va mengalami kenaikan tidak,va masuk r daya va alami naik tidak,"[va, masuk, r, daya, va, alami, naik, tidak]"
309,Dwikodarmawan,"Assalamualaikum @pln_123 , kira kira sampe kap...",assalamualaikum kira kira sampe kapan listrikn...,assalamualaikum kira kira sampai kapan listrik...,assalamualaikum listriknya nyala daerah baleka...,assalamualaikum listrik nyala daerah balekamba...,"[assalamualaikum, listrik, nyala, daerah, bale..."
310,azeezy84,"@pln_123 listrik area Salam, Magelang mati dar...",listrik area salam magelang mati dari pukul ...,listrik area salam magelang mati dari pukul an...,listrik area salam magelang mati an pagi siang...,listrik area salam magelang mati an pagi siang...,"[listrik, area, salam, magelang, mati, an, pag..."
