In [1]:
import numpy as np
import pandas as pd

## Load Data

In [6]:
# Directory with the auxiliary lookup files used below (emoticon and slang-word tables).
EXTERNAL_DATA_PATH = '../data/external'
# Directory with the raw train/test CSV files.
RAW_DATA_PATH = '../data/raw'

In [3]:
# Read the raw train and test splits into DataFrames.
# NOTE(review): the preview of train_data shows an 'Unnamed: 0' column, which
# usually means the CSV carries a saved index -- confirm whether index_col=0 is wanted.
train_data = pd.read_csv('{}/train.csv'.format(RAW_DATA_PATH))
test_data = pd.read_csv('{}/test.csv'.format(RAW_DATA_PATH))

## Preprocess Data

### Create text columns for storing processed text

In [4]:
# Seed a 'text' column with the original text; per the section heading it is the
# column meant to hold the processed text, leaving 'original_text' untouched.
# `.values` assigns the underlying array positionally (no index alignment).
train_data['text'] = train_data['original_text'].values
test_data['text'] = test_data['original_text'].values

In [5]:
# Preview the first rows, including the newly added 'text' column.
train_data.head()

Unnamed: 0,original_text,source,pornografi,sara,radikalisme,pencemaran_nama_baik,text
0,[QUOTE=jessepinkman16;5a50ac34d89b093f368b456e...,kaskus,0,0,0,1,[QUOTE=jessepinkman16;5a50ac34d89b093f368b456e...
1,"@verosvante kita2 aja nitizen yang pada kepo,t...",instagram,0,0,0,0,"@verosvante kita2 aja nitizen yang pada kepo,t..."
2,"""#SidangAhok smg sipenista agama n ateknya mat...",twitter,0,1,1,1,"""#SidangAhok smg sipenista agama n ateknya mat..."
3,@bolususulembang.jkt barusan baca undang2 ini....,instagram,0,0,0,0,@bolususulembang.jkt barusan baca undang2 ini....
4,bikin anak mulu lu nof \nkaga mikir apa kasian...,kaskus,0,0,0,0,bikin anak mulu lu nof \nkaga mikir apa kasian...


### Define Preprocessing Method

#### Translate text-based emojis

In [10]:
import re

# Emoticon lookup table: a tab-separated file read without a header row.
# Column 0 holds the emoticon and column 1 the text that replaces it.
emoticon_data_path = '{}/emoticon.txt'.format(EXTERNAL_DATA_PATH)
emoticon_df = pd.read_csv(emoticon_data_path, sep='\t', header=None)
emoticon_dict = dict(zip(emoticon_df[0], emoticon_df[1]))

def translate_emoticon(t, emoticon_map=None):
    """Replace every known text emoticon in ``t`` with its translation.

    Parameters
    ----------
    t : str
        Text to process.
    emoticon_map : dict, optional
        Mapping of emoticon -> replacement text. When omitted, the
        notebook-level ``emoticon_dict`` is used.

    Returns
    -------
    str
        ``t`` with each emoticon replaced by its mapped text. Entries are
        applied one after another in the mapping's iteration order.
    """
    if emoticon_map is None:
        emoticon_map = emoticon_dict
    for emoticon, meaning in emoticon_map.items():
        # Plain substring replacement: both the emoticon and its translation
        # are taken literally, so backslashes or group-like sequences in the
        # translation cannot be misread as regex replacement syntax.
        t = t.replace(emoticon, meaning)
    return t

In [12]:
# Quick check: the trailing ':-)' should be replaced by its mapped text.
sample_text = 'seru deh acaranya :-)'

print('Before :', sample_text)
print('After  :', translate_emoticon(sample_text))

Before : seru deh acaranya :-)
After  : seru deh acaranya Bahagia


#### Remove excessive newline

In [20]:
def remove_newline(text):
    """Replace each line break in ``text`` with a single space.

    Unix (LF), Windows (CRLF) and bare carriage-return line endings are all
    treated as one line break, so no stray carriage return is left behind.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        ``text`` with every line break turned into one space. Consecutive
        line breaks yield consecutive spaces.
    """
    # CRLF is listed first so a Windows line ending becomes one space, not two.
    return re.sub(r'\r\n|\r|\n', ' ', text)

In [22]:
# Quick check: a three-line string should come back on a single line.
sample_text = 'hari ini\nhari\nsabtu'

print('Before :', sample_text)
print('After  :', remove_newline(sample_text))

Before : hari ini
hari
sabtu
After  : hari ini hari sabtu


#### Remove kaskus formatting

In [23]:
def remove_kaskus_formatting(text):
    """Strip Kaskus-style bracket markup and ``&quot;`` entities from ``text``.

    Steps, in order:
    1. Pad every '[' and ']' with a space so tags are separated from
       neighbouring words.
    2. Replace a lowercase ``[quote...]...[/quote]`` block, including the
       text inside it, with one space.
    3. Replace any remaining bracketed tag that contains no space with one space.
    4. Replace the HTML entity ``&quot;`` with one space.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        The cleaned text. Extra spaces introduced here are not collapsed.
    """
    # Raw strings keep the regex escapes intact without relying on Python
    # passing unknown string escapes through (a SyntaxWarning on 3.12+).
    text = re.sub(r'\[', ' [', text)
    text = re.sub(r'\]', '] ', text)
    # NOTE(review): this match is case-sensitive, so an upper-case
    # [QUOTE]...[/QUOTE] block keeps its inner text and only loses the tags
    # in the next step -- confirm whether input is lowercased beforehand.
    text = re.sub(r'\[quote[^ ]*\].*?\[\/quote\]', ' ', text)
    text = re.sub(r'\[[^ ]*\]', ' ', text)
    text = re.sub('&quot;', ' ', text)
    return text

In [26]:
# Quick check on a Kaskus-style quote block: the bracketed tags are replaced by
# spaces while the text between them is kept.
sample_text = '[QUOTE=jessepinkman16;5a50ac34d89b093f368b456e]yoiii cuy halo halo bandung[/QUOTE]'

print('Before :', sample_text)
print('After  :', remove_kaskus_formatting(sample_text))

Before : [QUOTE=jessepinkman16;5a50ac34d89b093f368b456e]yoiii cuy halo halo bandung[/QUOTE]
After  :    yoiii cuy halo halo bandung   


#### Remove url

In [27]:
def remove_url(text):
    """Delete URLs from ``text``.

    A URL is anything that starts with ``www.``, ``http://`` or ``https://``
    (in any letter case) and runs up to the next whitespace character.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        ``text`` with every URL removed. Surrounding whitespace is kept.
    """
    # `https?://` already covers plain http, so no separate alternative is
    # needed. IGNORECASE catches auto-capitalised forms such as 'Https://'.
    return re.sub(r'(?:www\.|https?://)\S+', '', text, flags=re.IGNORECASE)

In [30]:
# Quick check: the trailing 'www.' link should disappear; the space before it stays.
sample_text = 'kemaren kacao bet de www.instagram.com/lele/lili'

print('Before :', sample_text)
print('After  :', remove_url(sample_text))

Before : kemaren kacao bet de www.instagram.com/lele/lili
After  : kemaren kacao bet de 


#### Remove excessive whitespace

In [31]:
def remove_excessive_whitespace(text):
    """Collapse every run of two or more space characters into one space.

    Only the space character is affected; tabs and other whitespace are
    left as they are.
    """
    repeated_spaces = re.compile('  +')
    return repeated_spaces.sub(' ', text)

In [32]:
# Quick check: long runs of spaces between words should shrink to single spaces.
sample_text = 'budi      pergi ke           pasar'

print('Before :', sample_text)
print('After  :', remove_excessive_whitespace(sample_text))

Before : budi      pergi ke           pasar
After  : budi pergi ke pasar


#### Tokenize text

In [34]:
from nltk.tokenize import WordPunctTokenizer

def tokenize_text(text, punct=False):
    """Tokenize ``text`` and return the tokens joined by single spaces.

    Parameters
    ----------
    text : str
        Text to tokenize with NLTK's ``WordPunctTokenizer``.
    punct : bool, optional
        When true, every token is kept. When false (the default), only
        tokens for which ``str.isalnum()`` is true are kept.

    Returns
    -------
    str
        The kept tokens joined with one space, stripped of surrounding
        whitespace.
    """
    tokens = WordPunctTokenizer().tokenize(text)
    if not punct:
        tokens = [token for token in tokens if token.isalnum()]
    return ' '.join(tokens).strip()

In [36]:
# Quick check: punctuation is dropped and words glued together by ',' or '.' are split apart.
sample_text = 'kemarin,aku pergi ke dagas.terus ketemu sama Ilham.'

print('Before :', sample_text)
print('After  :', tokenize_text(sample_text))

Before : kemarin,aku pergi ke dagas.terus ketemu sama Ilham.
After  : kemarin aku pergi ke dagas terus ketemu sama Ilham


#### Transform slang words

In [54]:
# Slang lookup table: maps each entry of the CSV's 'original' column to the
# matching entry of its 'translated' column.
slang_words = pd.read_csv('{}/slangword.csv'.format(EXTERNAL_DATA_PATH))
slang_dict = dict(zip(slang_words['original'],slang_words['translated']))

def transform_slang_words(text, slang_map=None):
    """Replace slang words and two-word slang phrases in ``text``.

    The text is split on whitespace and scanned left to right. At each
    position a two-word phrase is looked up first; if it is a known slang
    entry, both words are replaced together. Otherwise the single word is
    replaced when it is a known entry, or kept unchanged.

    Parameters
    ----------
    text : str
        Text to process.
    slang_map : dict, optional
        Mapping of slang word or two-word phrase -> replacement text. When
        omitted, the notebook-level ``slang_dict`` is used.

    Returns
    -------
    str
        The transformed words joined with single spaces.
    """
    if slang_map is None:
        slang_map = slang_dict
    words = text.split()
    total = len(words)
    translated = []
    pos = 0
    while pos < total:
        # Prefer the longer (two-word) match so phrases win over their first word.
        if pos + 1 < total:
            phrase = ' '.join(words[pos:pos + 2])
            if phrase in slang_map:
                translated.append(slang_map[phrase])
                pos += 2
                continue
        current = words[pos]
        translated.append(slang_map.get(current, current))
        pos += 1
    return ' '.join(translated)

In [56]:
# Quick check: the slang token '7an' should be replaced by its dictionary translation.
sample_text = 'siap mas sebentar lagi saya sampai 7an'

print('Before :', sample_text)
print('After  :', transform_slang_words(sample_text))

Before : siap mas sebentar lagi saya sampai 7an
After  : siap mas sebentar lagi saya sampai tujuan


#### Remove non-alphabet characters

In [57]:
def remove_non_alphabet(text):
    """Delete every character that is not an ASCII letter or a space.

    Digits, punctuation and any other symbols are removed without a
    replacement, so the characters around them are joined together.
    """
    non_alphabet = re.compile('[^a-zA-Z ]+')
    return non_alphabet.sub('', text)

In [58]:
# Quick check: the digits in 'tu123' should be stripped, leaving 'tu'.
sample_text = 'kemaren tu123 ada kelinci di kebun'

print('Before :', sample_text)
print('After  :', remove_non_alphabet(sample_text))

Before : kemaren tu123 ada kelinci di kebun
After  : kemaren tu ada kelinci di kebun
