In [1]:
import numpy as np
import pandas as pd

## Load Data

In [2]:
# Relative directories used for every read and write in this notebook.
EXTERNAL_DATA_PATH = '../data/external'  # lookup tables: emoticon.txt, slangword.csv
RAW_DATA_PATH = '../data/raw'  # input files: train.csv, test.csv
PROCESSED_DATA_PATH = '../data/processed'  # output files: processed_train.csv, processed_test.csv

In [3]:
# Read the raw train and test splits from the raw-data directory.
train_data = pd.read_csv(RAW_DATA_PATH + '/train.csv')
test_data = pd.read_csv(RAW_DATA_PATH + '/test.csv')

In [4]:
# Show the first rows of the training frame to check the loaded columns.
train_data.head()

Unnamed: 0,original_text,source,pornografi,sara,radikalisme,pencemaran_nama_baik
0,[QUOTE=jessepinkman16;5a50ac34d89b093f368b456e...,kaskus,0,0,0,1
1,"@verosvante kita2 aja nitizen yang pada kepo,t...",instagram,0,0,0,0
2,"""#SidangAhok smg sipenista agama n ateknya mat...",twitter,0,1,1,1
3,@bolususulembang.jkt barusan baca undang2 ini....,instagram,0,0,0,0
4,bikin anak mulu lu nof \nkaga mikir apa kasian...,kaskus,0,0,0,0


## Preprocess Data

### Define Preprocessing Method

#### Translate text-based emojis

In [5]:
import re

# Emoticon lookup: a headerless, tab-separated file whose first column is the
# emoticon and whose second column is the word it is translated to.
emoticon_data_path = EXTERNAL_DATA_PATH + '/emoticon.txt'
emoticon_df = pd.read_csv(emoticon_data_path, header=None, sep='\t')
emoticon_dict = {
    emoticon: meaning
    for emoticon, meaning in zip(emoticon_df[0], emoticon_df[1])
}

def translate_emoticon(t, emoticons=None):
    """Replace every known text emoticon in ``t`` with its translation.

    Matching is literal and case-sensitive. Emoticons are applied in the
    iteration order of the lookup table, and each one is replaced everywhere
    it occurs before the next one is tried.

    Args:
        t: The text to translate.
        emoticons: Optional mapping of emoticon -> replacement text. When
            omitted, the module-level ``emoticon_dict`` is used.

    Returns:
        The text with all matching emoticons replaced.
    """
    if emoticons is None:
        emoticons = emoticon_dict
    for emoticon, meaning in emoticons.items():
        # str.replace inserts the translation verbatim. re.sub would treat it
        # as a replacement template, so a backslash or group reference in the
        # translation would be rewritten or raise an error.
        t = t.replace(emoticon, meaning)
    return t

In [6]:
# Quick check of emoticon translation on a short sentence.
sample_text = 'seru deh acaranya :-)'

print('Before : {}'.format(sample_text))
print('After  : {}'.format(translate_emoticon(sample_text)))

Before : seru deh acaranya :-)
After  : seru deh acaranya Bahagia


#### Remove excessive newline

In [7]:
def remove_newline(text):
    """Return ``text`` with every newline character replaced by one space."""
    flattened = text.replace('\n', ' ')
    return flattened

In [8]:
# Quick check of newline replacement on a three-line string.
sample_text = 'hari ini\nhari\nsabtu'

print('Before : {}'.format(sample_text))
print('After  : {}'.format(remove_newline(sample_text)))

Before : hari ini
hari
sabtu
After  : hari ini hari sabtu


#### Remove kaskus formatting

In [9]:
def remove_kaskus_formatting(text):
    """Strip Kaskus forum markup (square-bracket tags and ``&quot;``) from text.

    Every removed piece is replaced by a space, so the result can contain
    runs of spaces; collapse them afterwards if needed.

    Args:
        text: The post text to clean.

    Returns:
        The text with bracket tags and ``&quot;`` entities replaced by spaces.
    """
    # Regex literals are raw strings: sequences such as '\[' in a normal string
    # are invalid escapes and trigger a SyntaxWarning on recent Python versions.

    # Pad brackets with spaces so a tag glued to a word becomes its own chunk.
    text = re.sub(r'\[', ' [', text)
    text = re.sub(r'\]', '] ', text)
    # Drop a [quote...]...[/quote] pair together with the quoted text.
    # NOTE(review): this only matches lowercase "quote"; an upper-case
    # [QUOTE=...] pair keeps its content and only loses the tags in the next
    # step. Confirm whether quoted text in upper-case tags should be removed.
    text = re.sub(r'\[quote[^ ]*\].*?\[\/quote\]', ' ', text)
    # Remove any remaining tag that contains no space.
    text = re.sub(r'\[[^ ]*\]', ' ', text)
    text = re.sub(r'&quot;', ' ', text)
    return text

In [10]:
# Quick check of Kaskus tag removal on a quoted post.
sample_text = '[QUOTE=jessepinkman16;5a50ac34d89b093f368b456e]yoiii cuy halo halo bandung[/QUOTE]'

print('Before : {}'.format(sample_text))
print('After  : {}'.format(remove_kaskus_formatting(sample_text)))

Before : [QUOTE=jessepinkman16;5a50ac34d89b093f368b456e]yoiii cuy halo halo bandung[/QUOTE]
After  :    yoiii cuy halo halo bandung   


#### Remove url

In [11]:
def remove_url(text):
    """Delete URLs from ``text``.

    A URL is anything that starts with ``www.``, ``http://`` or ``https://``
    (in any letter case) and runs up to the next whitespace character.

    Args:
        text: The text to clean.

    Returns:
        The text with each URL removed; surrounding whitespace is kept.
    """
    # Raw string avoids invalid-escape warnings for '\.' and '\S'. Matching is
    # case-insensitive so links are caught before the text is lower-cased.
    return re.sub(r'(?:www\.|https?://)\S+', '', text, flags=re.IGNORECASE)

In [12]:
# Quick check of URL removal on a sentence ending with a link.
sample_text = 'kemaren kacao bet de www.instagram.com/lele/lili'

print('Before : {}'.format(sample_text))
print('After  : {}'.format(remove_url(sample_text)))

Before : kemaren kacao bet de www.instagram.com/lele/lili
After  : kemaren kacao bet de 


#### Remove excessive whitespace

In [13]:
def remove_excessive_whitespace(text):
    """Collapse each run of two or more space characters into one space.

    Only the space character is affected; tabs and newlines are left alone.
    """
    space_run = re.compile('  +')
    collapsed = space_run.sub(' ', text)
    return collapsed

In [14]:
# Quick check of space collapsing on a sentence with long gaps.
sample_text = 'budi      pergi ke           pasar'

print('Before : {}'.format(sample_text))
print('After  : {}'.format(remove_excessive_whitespace(sample_text)))

Before : budi      pergi ke           pasar
After  : budi pergi ke pasar


#### Tokenize text

In [15]:
from nltk.tokenize import WordPunctTokenizer

def tokenize_text(text, punct=False):
    """Tokenize ``text`` and join the tokens back with single spaces.

    Tokens are produced by NLTK's ``WordPunctTokenizer``.

    Args:
        text: The text to tokenize.
        punct: When falsy (the default), only tokens for which
            ``str.isalnum()`` is true are kept. When truthy, all tokens are
            kept.

    Returns:
        The kept tokens joined by single spaces, stripped at both ends.
    """
    tokens = WordPunctTokenizer().tokenize(text)
    if not punct:
        tokens = [token for token in tokens if token.isalnum()]
    return ' '.join(tokens).strip()

In [16]:
# Quick check of tokenization on a sentence with punctuation glued to words.
sample_text = 'kemarin,aku pergi ke dagas.terus ketemu sama Ilham.'

print('Before : {}'.format(sample_text))
print('After  : {}'.format(tokenize_text(sample_text)))

Before : kemarin,aku pergi ke dagas.terus ketemu sama Ilham.
After  : kemarin aku pergi ke dagas terus ketemu sama Ilham


#### Transform slang words

In [17]:
# Slang lookup: maps each entry in the 'original' column to its 'translated' form.
slang_words = pd.read_csv(EXTERNAL_DATA_PATH + '/slangword.csv')
slang_dict = {
    slang: translation
    for slang, translation in zip(slang_words['original'], slang_words['translated'])
}

def transform_slang_words(text):
    """Replace slang in ``text`` using the module-level ``slang_dict``.

    The text is split on whitespace and scanned left to right. At each
    position a two-word phrase is looked up first; if it is a key in
    ``slang_dict`` both words are replaced together. Otherwise the single
    word is replaced when it is a key, and kept unchanged when it is not.

    Returns:
        The resulting words joined by single spaces.
    """
    tokens = text.split()
    total = len(tokens)
    result = []
    pos = 0
    while pos < total:
        phrase = ' '.join(tokens[pos:pos + 2])
        # The bounds check comes first so a lone last word is never tried as a phrase.
        if pos + 1 < total and phrase in slang_dict:
            result.append(slang_dict[phrase])
            pos += 2
        else:
            word = tokens[pos]
            result.append(slang_dict.get(word, word))
            pos += 1
    return ' '.join(result)

In [18]:
# Quick check of slang replacement on a sentence ending with a slang token.
sample_text = 'siap mas sebentar lagi saya sampai 7an'

print('Before : {}'.format(sample_text))
print('After  : {}'.format(transform_slang_words(sample_text)))

Before : siap mas sebentar lagi saya sampai 7an
After  : siap mas sebentar lagi saya sampai tujuan


#### Remove non-alphabet characters

In [19]:
def remove_non_alphabet(text):
    """Delete every character that is not an ASCII letter or a space."""
    non_letters = re.compile('[^a-zA-Z ]+')
    return non_letters.sub('', text)

In [20]:
# Quick check of non-letter removal on a sentence containing digits.
sample_text = 'kemaren tu123 ada kelinci di kebun'

print('Before : {}'.format(sample_text))
print('After  : {}'.format(remove_non_alphabet(sample_text)))

Before : kemaren tu123 ada kelinci di kebun
After  : kemaren tu ada kelinci di kebun


#### Remove twitter & instagram formatting

In [21]:
def remove_twitter_ig_formatting(text):
    """Remove @mentions and the retweet marker "rt" from ``text``.

    A mention is ``@`` followed by letters, digits or underscores, optionally
    continued by dot-separated parts (for example ``@some_user`` or
    ``@name.jkt``). A dot that is not followed by a handle character, such as
    sentence-ending punctuation, is left in place. The marker "rt" is removed
    in any letter case, but only when it stands alone as a word.

    Args:
        text: The text to clean.

    Returns:
        The text with mentions and "rt" markers deleted; surrounding
        whitespace is kept.
    """
    # Underscores and interior dots are part of a handle; without them a
    # mention like "@name.jkt" would leave ".jkt" behind.
    text = re.sub(r'@[A-Za-z0-9_]+(?:\.[A-Za-z0-9_]+)*', '', text)
    # Case-insensitive because callers may pass text that is not lower-cased yet.
    text = re.sub(r'\brt\b', '', text, flags=re.IGNORECASE)
    return text

In [22]:
# Quick check of mention removal on a sentence that starts with a handle.
sample_text = '@jonijon menurut saya hal tersebut masih kurang baik dilakukan sih kak'

print('Before : {}'.format(sample_text))
print('After  : {}'.format(remove_twitter_ig_formatting(sample_text)))

Before : @jonijon menurut saya hal tersebut masih kurang baik dilakukan sih kak
After  :  menurut saya hal tersebut masih kurang baik dilakukan sih kak


#### Remove repeating characters

In [23]:
import itertools

def remove_repeating_characters(text):
    """Collapse every run of identical consecutive characters to one character.

    This applies to all characters, so legitimate double letters and repeated
    spaces are collapsed as well.
    """
    # groupby yields one (character, run) pair per run; the key is the character.
    return ''.join(char for char, _run in itertools.groupby(text))

In [24]:
# Quick check of repeated-character collapsing on an elongated word.
sample_text = 'heyyyyyyyyyyyyyyyyyyyy kenapa tadi?'

print('Before : {}'.format(sample_text))
print('After  : {}'.format(remove_repeating_characters(sample_text)))

Before : heyyyyyyyyyyyyyyyyyyyy kenapa tadi?
After  : hey kenapa tadi?


#### Final Preprocessing Method

In [25]:
def preprocess_text(text):
    """Run the full cleaning pipeline on one raw text and return the result.

    Steps, in order: replace newlines, remove URLs, remove @mentions and "rt",
    remove Kaskus tags, translate emoticons, lower-case, tokenize (dropping
    non-alphanumeric tokens), replace slang, collapse repeated characters,
    replace slang again, drop non-letter characters, collapse spaces, and
    finally lower-case and strip.

    The previous version began with ``text.lower()`` but immediately
    overwrote that result by calling ``remove_newline(text)`` on the original
    input, so the early lower-casing never took effect. That dead statement
    is removed here; the output is unchanged.

    NOTE(review): because the text is still mixed-case during the URL, mention,
    Kaskus and emoticon steps, any lowercase-only pattern in those helpers
    will not match upper-case input. Emoticon matching is case-sensitive, so
    lower-casing before that step would change which emoticons match. Confirm
    the intended order before moving the lower-casing earlier.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    transformed_text = remove_newline(text)
    # URLs are removed before emoticon translation runs on the text.
    transformed_text = remove_url(transformed_text)
    transformed_text = remove_twitter_ig_formatting(transformed_text)
    transformed_text = remove_kaskus_formatting(transformed_text)
    transformed_text = translate_emoticon(transformed_text)
    transformed_text = transformed_text.lower()
    transformed_text = tokenize_text(transformed_text)
    # Slang is replaced twice: once on the raw tokens, and once more after
    # repeated characters have been collapsed.
    transformed_text = transform_slang_words(transformed_text)
    transformed_text = remove_repeating_characters(transformed_text)
    transformed_text = transform_slang_words(transformed_text)
    transformed_text = remove_non_alphabet(transformed_text)
    transformed_text = remove_excessive_whitespace(transformed_text)
    # Lower-case again after the second slang replacement.
    transformed_text = transformed_text.lower().strip()
    return transformed_text

### Preprocess Data

In [26]:
# Add a 'processed_text' column to both splits by cleaning 'original_text' row by row.
for frame in (train_data, test_data):
    frame['processed_text'] = frame['original_text'].apply(preprocess_text)

### Save Preprocessed Data

In [27]:
# Write both splits to the processed-data directory without the index column.
train_data.to_csv(PROCESSED_DATA_PATH + '/processed_train.csv', index=False)
test_data.to_csv(PROCESSED_DATA_PATH + '/processed_test.csv', index=False)