In [23]:
import matplotlib.pyplot as plt
import pandas as pd
import emoji
import re
from typing import Any
from textblob import TextBlob
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import textblob.exceptions

In [17]:

data = pd.read_excel('data oktober-des 2022 etilang tanpa lang dan etle.xlsx')
df = pd.DataFrame(data)
df.head()

Unnamed: 0,text_id
0,"Kembangkan ETLE, 7 Polda Dapat Penghargaan di ..."
1,Sosialisasi etle dan app smart city\n\n#polsek...
2,"Berita populer kumparanOTO, Selasa (13/12) det..."
3,"Mimin kasih info ya gaes, kita mau uji coba ET..."
4,personil lalu lintas polres badung melaksanaka...


In [38]:
def remove_enter(val):
    return ' '.join(val.split())

def remove_emoji():
    emojis = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
    pattern = u'(' + u'|'.join(re.escape(u) for u in emojis) + u')'
    return re.compile(pattern)

begone_emoji = remove_emoji()

def preprocessing_data(val):
    result = val.lower().strip()
    result = re.sub(r'(@|https?)\S+|#[A-Za-z0-9_]+', '', result).replace("&amp;", "dan")
    result = re.sub(r'RT[\s]+','',result)
    result = re.sub('[^.,a-zA-Z0-9 \n\.]', '', result)
    result = re.sub(r'\d', '', result)
    # result = re.sub(r'[^\w\s]', ' ', result)
    result = re.sub(r'\.+', " . ", result)
    result = re.sub(r'\,+', " , ", result)
    result = remove_enter(result)
    result = begone_emoji.sub(repl='', string=result)
    return result

def final_remove(val):
    result = re.sub(r'\.+', " . ", val)
    result = re.sub(r'\,+', " , ", result)
    result = remove_stopword(result)
    result = stemming(result)
    return result

factory = StopWordRemoverFactory()
stopwords = factory.get_stop_words()
stopword = factory.create_stop_word_remover()
stemmer = StemmerFactory().create_stemmer()
def remove_stopword(val):
    removed = stopword.remove(val)
    return removed

def stemming(val):
    hasil = stemmer.stem(val)
    return hasil

def polarity_decider(val):
    analize = TextBlob(val)
    processed = analize.translate(from_lang='id', to='en')
    if processed.polarity > 0:
        sentimen = 1
    elif processed.polarity < 0:
        sentimen = -1
    else:
        sentimen = 0
    return sentimen, str(processed).lower()


In [37]:
test = preprocessing_data("kembangkan etle, polda dapat penghargaan di rakernis fungsi gakkum")
test

'kembangkan etle , polda dapat penghargaan di rakernis fungsi gakkum'

In [4]:
from langdetect import detect
text = df[0][0]
cleaned = preprocessing_data(text)
test = detect(cleaned)
print(test)
print(cleaned)


id
kembangkan etle , polda dapat penghargaan di rakernis fungsi gakkum


In [35]:
import csv
with open('data oktober-des 2022 etilang tanpa lang dan etle.csv', encoding='utf-8') as csv_data:
    load_data = list(csv.reader(csv_data, delimiter=';'))
    dictionary = {}
    list_of_dictionary = []

    for item in load_data[1:5]:
        # print(item[0])
        # break
        try:
            sentiment, translated = polarity_decider(preprocessing_data(item[0]))
            dictionary = {
                'origin_text' : item[0],
                'text_id': final_remove(preprocessing_data(item[0])),
                'text_en': translated,
                'label' : sentiment
            }
            list_of_dictionary.append(dictionary)
        except textblob.exceptions.NotTranslated:
            pass
len(list_of_dictionary)



4

In [36]:
list_of_dictionary

[{'origin_text': 'Kembangkan ETLE, 7 Polda Dapat Penghargaan di Rakernis Fungsi Gakkum https://t.co/FmqJ5SAFLE',
  'text_id': 'kembang etle polda harga rakernis fungsi gakkum',
  'text_en': 'develop ethle, polda can be awarded at the rakernis function gakkum',
  'label': 0},
 {'origin_text': 'Sosialisasi etle dan app smart city\n\n#polsekrambangtengah https://t.co/d8BUmEZZJ5',
  'text_id': 'sosialisasi etle app smart city',
  'text_en': 'etle socialization and app smart city',
  'label': 1},
 {'origin_text': 'Berita populer kumparanOTO, Selasa (13/12) detail Vespa Batik seharga Rp 77 juta, cara kerja dan jenis pelanggaran tilang ETLE mobile. #kumparanOTO https://t.co/YtXCinS9NK',
  'text_id': 'berita populer kumparanoto selasa detail vespa batik harga rp juta cara kerja jenis langgar tilang etle mobile',
  'text_en': 'popular news kumparanoto, tuesday vespa batik details for rp',
  'label': 1},
 {'origin_text': 'Mimin kasih info ya gaes, kita mau uji coba ETLE di wilayah Tangerang Kota

In [21]:
df_clean = pd.DataFrame(list_of_dictionary)
df_clean.head()

Unnamed: 0,text_id,text_en,label
0,kembang etle polda harga rakernis fungsi gakkum,kembang etle polda price rakernis gakkum function,0
1,sosialisasi etle app smart city,etle app smart city socialization,1
2,berita populer kumparanoto selasa detail vespa...,popular news kumparanoto tuesday vespa batik d...,1
3,mimin kasih info gaes mau uji coba etle wilaya...,mimin gives info gaes want to test the ethle o...,1
4,personil lalu lintas polres badung laksana teg...,badung police traffic personnel like the human...,0


In [22]:
df_clean.to_csv('clear_dataset_stemming_stopword.csv', index=False, sep=';')

In [6]:
fields_names = ['text_id', 'text_en','label']
with open('clear_dataset_3.csv', 'w', encoding='utf-8') as csvfile:
    write = csv.DictWriter(csvfile, fieldnames= fields_names)
    write.writeheader()
    write.writerows(list_of_dictionary)

In [12]:
df_dataset_1 = pd.read_csv('clear_dataset.csv')
# df_dataset_2 = pd.read_csv('clear_dataset_2.csv')
# df_dataset_3 = pd.read_csv('clear_dataset_3_fix.csv')
df_dataset_1.head()

ParserError: Error tokenizing data. C error: Expected 3 fields in line 313, saw 4
