In [103]:
import pandas as pd
import emoji
import re
from textblob import TextBlob
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import textblob.exceptions
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from deep_translator import GoogleTranslator
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Fetch the NLTK data this notebook relies on: the stopword corpus (read via
# stopwords.words) and the punkt tokenizer models (presumably needed by word_tokenize).
# The captured output shows both packages are already up to date on this machine.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anand\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [110]:
def remove_enter(val):
    """Collapse newlines and any other whitespace runs into single spaces.

    Leading and trailing whitespace is dropped as a side effect of splitting.
    """
    words = val.split()
    return ' '.join(words)

def remove_emoji():
    """Build a compiled regex that matches any emoji known to the `emoji` package.

    Returns:
        A compiled pattern made of one capturing group that alternates over every
        key of ``emoji.EMOJI_DATA``.
    """
    # Longest sequences first, so multi-codepoint emoji win over their own prefixes.
    longest_first = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
    alternatives = [re.escape(symbol) for symbol in longest_first]
    return re.compile('(' + '|'.join(alternatives) + ')')


# Compiled once at definition time; preprocessing_data() reuses it for every text.
begone_emoji = remove_emoji()

def preprocessing_data(val):
    """
    Normalise a raw tweet for translation and sentiment labelling.

    Steps, in order: lower-case and strip; drop @mentions, http(s) links and
    #hashtags; replace the HTML entity "&amp;" with "dan"; drop a leading retweet
    marker; delete every character other than ASCII letters, digits, '.', ',',
    space and newline; pad runs of dots / commas with spaces; collapse whitespace;
    finally remove any remaining emoji.

    Args:
        val: the raw text (str).

    Returns:
        The cleaned, lower-cased, single-line string.
    """
    result = val.lower().strip()
    result = re.sub(r'(@|https?)\S+|#[A-Za-z0-9_]+', '', result).replace("&amp;", "dan")
    # The text is already lower-cased here, so the retweet marker has to be matched
    # as "rt" (the former upper-case pattern could never match). It is anchored to
    # the start so that words merely ending in "rt" are left alone.
    result = re.sub(r'^rt\s+', '', result)
    # Raw string: avoids the invalid "\." escape of the previous non-raw pattern.
    result = re.sub(r'[^.,a-zA-Z0-9 \n]', '', result)
    result = re.sub(r'\.+', " . ", result)
    result = re.sub(r',+', " , ", result)
    result = remove_enter(result)
    # Non-ASCII characters were already removed above, so this is only a safety net.
    result = begone_emoji.sub(repl='', string=result)
    return result

def final_remove(val):
    """
    Final clean-up pass for the source-language text.

    Pads runs of dots and commas with spaces, turns every remaining character that
    is neither a word character nor whitespace into a space, then applies
    remove_stopword() followed by stemming().
    """
    substitutions = (
        (r'\.+', " . "),
        (r'\,+', " , "),
        (r'[^\w\s]', ' '),
    )
    cleaned = val
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return stemming(remove_stopword(cleaned))

# Shared Sastrawi helpers, built once at cell execution and reused by the functions below.
factory = StopWordRemoverFactory()
stopwords_id = factory.get_stop_words()  # stop-word list; not referenced elsewhere in the visible cells
stopword_id = factory.create_stop_word_remover()  # remover object used by remove_stopword()
stemmer = StemmerFactory().create_stemmer()  # stemmer object used by stemming()
def remove_stopword(val):
    """Return ``val`` with stop words removed by the shared Sastrawi remover ``stopword_id``."""
    return stopword_id.remove(val)

def stemming(val):
    """Reduce the words in ``val`` to their base form using the shared Sastrawi ``stemmer``."""
    return stemmer.stem(val)

def polarity_decider(val):
    """Translate ``val`` from Indonesian to English with TextBlob and label its polarity.

    Returns:
        A tuple ``(sentimen, text)``: ``sentimen`` is 1, -1 or 0 for positive,
        negative or zero polarity, and ``text`` is the lower-cased translation.

    NOTE(review): TextBlob.translate() appears to be deprecated/removed in recent
    TextBlob releases -- confirm the installed version still provides it.
    """
    translated = TextBlob(val).translate(from_lang='id', to='en')
    score = translated.polarity
    sentimen = 1 if score > 0 else (-1 if score < 0 else 0)
    return sentimen, str(translated).lower()

eng_stemmer = PorterStemmer()
# Filled on first use by eng_final_remove(); a frozenset gives O(1) membership tests.
_english_stopwords = None

def eng_final_remove(val):
    """Final clean-up pass for English text.

    Lower-cases ``val``, replaces every character that is neither a word character
    nor whitespace with a space, tokenizes, drops NLTK English stopwords and applies
    the Porter stemmer to what is left.

    Args:
        val: English text (str).

    Returns:
        The stemmed, stopword-free tokens joined by single spaces.
    """
    global _english_stopwords
    if _english_stopwords is None:
        # Load the corpus once instead of on every call, which previously also
        # forced a linear list scan for each token.
        _english_stopwords = frozenset(stopwords.words('english'))
    remove_non_char = re.sub(r'[^\w\s]', ' ', val.lower())
    tokens = word_tokenize(remove_non_char)
    return ' '.join(
        eng_stemmer.stem(word) for word in tokens if word not in _english_stopwords
    )

def translate(value, src = 'id', target = 'en'):
    """Translate ``value`` with GoogleTranslator, from ``src`` to ``target``.

    The defaults translate Indonesian ('id') into English ('en').
    """
    translator = GoogleTranslator(source=src, target=target)
    return translator.translate(value)

# Created on the first vader_sentiment() call and reused afterwards.
_vader_analyzer = None

def vader_sentiment(sentence):
    """Label ``sentence`` with VADER's compound score.

    Args:
        sentence: English text to score.

    Returns:
        1 when the compound score is >= 0.05, -1 when it is <= -0.05, else 0.
    """
    global _vader_analyzer
    if _vader_analyzer is None:
        # Build the analyzer once; constructing it per call repeats its setup work
        # for every row being labelled.
        _vader_analyzer = SentimentIntensityAnalyzer()
    compound = _vader_analyzer.polarity_scores(sentence)['compound']
    if compound >= 0.05:
        return 1
    if compound <= -0.05:
        return -1
    return 0

In [32]:
# Smoke test of preprocessing_data() on a single headline.
# NOTE(review): the saved output below has the digit "7" removed, but the current
# definition keeps 0-9 (the digit-stripping line is commented out). The output looks
# stale from an earlier definition -- re-run after defining the functions to confirm.
test = preprocessing_data("kembangkan etle, 7 polda dapat penghargaan di rakernis fungsi gakkum")
test

'kembangkan etle , polda dapat penghargaan di rakernis fungsi gakkum'

In [None]:
# Language-detection experiment on one cleaned text.
# NOTE(review): `df` is not defined in any visible cell, so this cell fails on a fresh
# kernel unless `df` is loaded first -- confirm where it comes from. The import would
# also be easier to find in the top import cell.
from langdetect import detect
text = df[0][0]
cleaned = preprocessing_data(text)
test = detect(cleaned)  # reuses the name `test` for the detect() result
print(test)
print(cleaned)


In [36]:
import csv

# Read the whole ';'-separated export first so the file is closed before the slow
# translation calls start. newline='' is what the csv module expects, so that
# quoted fields containing line breaks are parsed correctly.
with open('data oktober-des 2022 etilang tanpa lang dan etle.csv', encoding='utf-8', newline='') as csv_data:
    load_data = list(csv.reader(csv_data, delimiter=';'))

list_of_dictionary = []

# Row 0 is the header; only rows 1..19 are processed (widen the slice for a full run).
for item in load_data[1:20]:
    # csv.reader yields [] for blank lines; skip those and rows with empty text
    # instead of failing on item[0] or sending an empty string to the translator.
    if not item or not item[0].strip():
        continue

    cleaned = preprocessing_data(item[0])  # computed once, reused below
    try:
        translate_en = translate(cleaned)
        # The return value is not used; the call is kept so that texts TextBlob
        # reports as NotTranslated are still skipped, exactly as before.
        polarity_decider(cleaned)
    except textblob.exceptions.NotTranslated:
        continue

    dictionary = {
        'origin_text' : item[0],
        'text_id': final_remove(cleaned),
        'origin_text_en': translate_en,
        'text_en': eng_final_remove(translate_en),
        'label' : vader_sentiment(translate_en)
    }
    list_of_dictionary.append(dictionary)

len(list_of_dictionary)



19

In [37]:
# Display every processed record (this prints the full list, which can be long).
list_of_dictionary

[{'origin_text': 'Kembangkan ETLE, 7 Polda Dapat Penghargaan di Rakernis Fungsi Gakkum https://t.co/FmqJ5SAFLE',
  'text_id': 'kembang etle polda harga rakernis fungsi gakkum',
  'origin_text_en': 'develop ethle, polda can be awarded at the rakernis function gakkum',
  'text_en': 'develop ethl , polda award rakerni function gakkum',
  'label': 0},
 {'origin_text': 'Sosialisasi etle dan app smart city\n\n#polsekrambangtengah https://t.co/d8BUmEZZJ5',
  'text_id': 'sosialisasi etle app smart city',
  'origin_text_en': 'etle socialization and app smart city',
  'text_en': 'etl social app smart citi',
  'label': 1},
 {'origin_text': 'Berita populer kumparanOTO, Selasa (13/12) detail Vespa Batik seharga Rp 77 juta, cara kerja dan jenis pelanggaran tilang ETLE mobile. #kumparanOTO https://t.co/YtXCinS9NK',
  'text_id': 'berita populer kumparanoto selasa detail vespa batik harga rp juta cara kerja jenis langgar tilang etle mobile',
  'origin_text_en': 'popular news kumparanoto, tuesday vespa 

In [None]:
# Turn the list of per-text dicts into a DataFrame (one row per text, one column per key)
# and preview the first rows.
df_clean = pd.DataFrame(list_of_dictionary)
df_clean.head()

In [None]:
# Export the cleaned dataset as a ';'-separated CSV without the index column.
df_clean.to_csv('clear_dataset_stemming_stopword.csv', index=False, sep=';')

In [None]:
# Export only the model-ready columns of each record to CSV.
fields_names = ['text_id', 'text_en','label']
# newline='' stops the csv module's own line endings from being translated again,
# which otherwise yields an empty line after every row on Windows.
with open('clear_dataset_3.csv', 'w', encoding='utf-8', newline='') as csvfile:
    # The records also carry 'origin_text' and 'origin_text_en'; without
    # extrasaction='ignore' DictWriter raises ValueError on those extra keys.
    write = csv.DictWriter(csvfile, fieldnames=fields_names, extrasaction='ignore')
    write.writeheader()
    write.writerows(list_of_dictionary)

In [None]:
# Reload a previously exported dataset and preview it.
# NOTE(review): no visible cell writes 'clear_dataset.csv', and read_csv is called with
# its default separator while the to_csv export earlier in this notebook uses ';' --
# confirm which file and delimiter are intended.
df_dataset_1 = pd.read_csv('clear_dataset.csv')
# df_dataset_2 = pd.read_csv('clear_dataset_2.csv')
# df_dataset_3 = pd.read_csv('clear_dataset_3_fix.csv')
df_dataset_1.head()

In [30]:
# Smoke test of eng_final_remove(): stopwords are dropped and the remaining words are
# stemmed (see the output below).
text = "just because i can flying does not mean i am a birds roamer fox packing"
testis = eng_final_remove(text)
testis

'fli mean bird roamer fox pack'