In [1]:
import pandas as pd
import re
import sqlite3
import string
import pickle

In [2]:
# Read every row of the `data` table from ./raw/data_train.db into a DataFrame.
# NOTE(review): ORDER BY RANDOM() returns the rows in a different order on every
# run, so anything derived from row order is not reproducible across restarts.
conn_train = sqlite3.connect("./raw/data_train.db")
crs_train = conn_train.cursor()  # not used by any cell visible in this notebook
data_train = pd.read_sql_query("SELECT * FROM data ORDER BY RANDOM()", conn_train)

# Same query against ./raw/data_test.db.
conn_test = sqlite3.connect("./raw/data_test.db")
crs_test = conn_test.cursor()  # not used by any cell visible in this notebook
data_test = pd.read_sql_query("SELECT * FROM data ORDER BY RANDOM()", conn_test)

In [3]:
# Keep only the post text and its emotion label, renamed to the 'title' and
# 'label' columns that the preprocessing cell reads.
train_data = pd.DataFrame({'title': data_train.Posts, 'label': data_train.Emotion})

test_data = pd.DataFrame({'title': data_test.Posts, 'label': data_test.Emotion})

In [4]:
from vncorenlp import VnCoreNLP
# Start VnCoreNLP from the local jar with only the "wseg" annotator enabled
# (presumably word segmentation — confirm against the VnCoreNLP docs) and the
# JVM heap option -Xmx500m. The preprocessing cell calls
# rdrsegmenter.tokenize() on every post.
rdrsegmenter = VnCoreNLP("./vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')

In [5]:
def removeStopWord(path="../StopWords/vietnamese-stopwords-dash.txt"):
   """Load the stop-word list from a text file with one entry per line.

   Args:
      path: Location of the stop-word file. Defaults to the file this
         notebook has always read.

   Returns:
      A list of stop words in file order, each stripped of surrounding
      whitespace, with blank lines left out.
   """
   # The file holds Vietnamese text, so decode it as UTF-8 explicitly instead
   # of relying on the platform default; `with` closes the handle.
   with open(path, "r", encoding="utf-8") as f:
      stripped = (line.strip() for line in f)
      # Build a new list instead of popping while iterating: popping shifts the
      # remaining items left, so the entry after each blank line was skipped.
      return [word for word in stripped if word]

# Load the stop words once; the preprocessing cell drops every token found in this list.
stopWords = removeStopWord()

In [6]:
def removeOutOfDict(path="../Viet74K.txt"):
   """Load the dictionary word list from a text file with one entry per line.

   Each entry is stripped of surrounding whitespace and its inner spaces are
   replaced with underscores, so multi-syllable words look like the
   underscore-joined tokens the preprocessing cell compares them against.

   Args:
      path: Location of the dictionary file. Defaults to the file this
         notebook has always read.

   Returns:
      A list of dictionary entries in file order, with blank lines left out.
   """
   # Decode explicitly as UTF-8 (the entries are Vietnamese) and let `with`
   # close the handle.
   with open(path, "r", encoding="utf-8") as f:
      normalized = (line.strip().replace(" ", "_") for line in f)
      # Build a new list instead of popping while iterating: popping shifts the
      # remaining items left, so the entry after each blank line was skipped.
      return [word for word in normalized if word]

# Load the dictionary once; the preprocessing cell keeps only tokens found in it.
dictWords = removeOutOfDict()
# Sanity check: number of entries and the first few of them.
print("Dict size: ", len(dictWords))
print(dictWords[:5])

Dict size:  73901
['a', 'A', 'a-ba-giua', 'a-ba-toa', 'a_bàng']


In [7]:
def remove_emoji(string):
    """Return `string` with every run of emoji / pictographic characters deleted.

    The character class is deliberately broad: besides the emoji blocks it
    covers every code point from U+24C2 to U+1F251 and every code point at or
    above U+10000, so CJK and other scripts in those ranges are removed too.

    Note: the parameter name shadows the stdlib `string` module inside this
    function only; it is kept because callers may pass it by keyword.
    """
    char_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        u"\U00002500-\U00002BEF"  # box drawing through misc symbols and arrows
        u"\U00002702-\U000027B0"  # dingbats
        u"\U00002702-\U000027B0"  # same range repeated (redundant but harmless)
        u"\U000024C2-\U0001F251"  # circled M up to the enclosed ideographic supplement
        u"\U0001f926-\U0001f937"  # gesture emoji
        u"\U00010000-\U0010ffff"  # every supplementary-plane code point
        u"\u2640-\u2642"          # gender signs
        u"\u2600-\u2B55"          # misc symbols through U+2B55
        u"\u200d"                 # zero-width joiner
        u"\u23cf"                 # eject symbol
        u"\u23e9"                 # fast-forward symbol
        u"\u231a"                 # watch
        u"\ufe0f"                 # variation selector-16
        u"\u3030"                 # wavy dash
    )
    return re.sub("[" + char_ranges + "]+", r'', string, flags=re.UNICODE)


In [8]:
# Load the emoticon table whose keys remove_emoticons turns into a regex.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load ../Emoticon_Dict.p when it comes from a trusted source.
with open('../Emoticon_Dict.p', 'rb') as fp:
    Emoticon_Dict = pickle.load(fp)

def remove_emoticons(text):
    """Return `text` with every match of an Emoticon_Dict key deleted.

    The keys are joined with `|` without escaping, so each key is interpreted
    as a regular-expression fragment rather than as literal text.
    """
    alternation = u'|'.join(Emoticon_Dict)
    return re.sub(u'(' + alternation + u')', r'', text)


In [9]:
# Build the teencode lookup table from a tab-separated file: column 0 is the
# token to replace and column 1 its replacement (that is how change_teencode
# reads each row).
with open('../teencode.txt', 'rb') as f:
    # splitlines() copes with both LF and CRLF endings, so no stray '\r' ends
    # up in the replacement column and no empty trailing row is produced.
    teencode = f.read().decode('utf-8').splitlines()

# Rows without a tab (blank or malformed lines) have no replacement column and
# would make change_teencode fail with IndexError, so leave them out.
list_teencode = [
    fields for fields in (line.split('\t') for line in teencode)
    if len(fields) >= 2
]

def change_teencode(text, mapping=None):
    """Replace teencode tokens in `text` with their mapped form.

    `text` is split on single spaces and every token is compared against the
    table rows in order; a token equal to a row's first field is replaced by
    the row's second field. Later rows are checked against the replaced value,
    so replacements can chain forward through the table.

    Args:
        text: The string to normalise.
        mapping: Optional iterable of rows (sequences of fields). Defaults to
            the module-level `list_teencode`.

    Returns:
        The tokens re-joined with single spaces.
    """
    if mapping is None:
        mapping = list_teencode
    tokens = text.split(" ")
    for pos in range(len(tokens)):
        for row in mapping:
            # Rows with fewer than two fields (e.g. a blank line in the source
            # file) have no replacement; indexing row[1] would raise IndexError
            # as soon as a token equals their only field.
            if len(row) >= 2 and tokens[pos] == row[0]:
                tokens[pos] = row[1]
    return " ".join(tokens)
    

In [10]:
# Characters treated as punctuation: ASCII punctuation plus typographic quotes,
# dashes, the ellipsis and five combining marks. The underscore is excluded
# (the preprocessing cell handles it separately) and so are the spaces that
# only separate the combining marks in the literal.
puncs = "".join(
    ch for ch in string.punctuation + '“”’—–… ̣ ̀ ́ ̃ ̉'
    if ch != "_" and ch != " "
)
print(puncs)

!"#$%&'()*+,-./:;<=>?@[\]^`{|}~“”’—–…̣̀́̃̉


In [11]:
def _normalize_post(raw, punct_re):
    """Apply the character-level cleanup to one raw post and return the result.

    Steps run in a fixed order: trim and collapse whitespace, drop digits,
    URLs and dots, strip emoji / emoticons / punctuation, turn underscores
    into spaces, lowercase, cut the boilerplate markers, then expand teencode.
    """
    text = re.sub(r'^\s+|\s+$', '', str(raw))
    text = text.strip()
    text = re.sub(r'\s\s+', ' ', text)
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\.+', '', text)
    text = remove_emoji(text)
    text = remove_emoticons(text)
    text = punct_re.sub('', text)
    text = text.replace("_", " ").strip().lower()
    # Plain substring removal: "cre" is also cut out of the middle of longer words.
    for marker in ("cre", "post hộ mem", "posthộ", "post hộ"):
        text = text.replace(marker, "")
    return change_teencode(text)


def _segment_and_filter(text, stop_set, dict_set):
    """Segment one cleaned post and keep only non-stop-word dictionary tokens."""
    # tokenize() yields a list of sentences, each a list of tokens.
    sentences = rdrsegmenter.tokenize(text)
    # `tok not in puncs` is a substring test against the punctuation string, so
    # it drops punctuation tokens and empty tokens alike.
    flattened = " ".join(
        " ".join(tok for tok in sentence if tok not in puncs).strip()
        for sentence in sentences
    ).strip()
    # Re-attach underscores that ended up surrounded by whitespace.
    flattened = re.sub(r'\s+_\s+', '_', flattened)
    kept = [word for word in flattened.split(" ")
            if word not in stop_set and word in dict_set]
    return " ".join(kept).strip()


def preprocess_posts(posts, labels):
    """Clean, segment and filter `posts`, dropping posts that end up empty.

    Args:
        posts: Raw post values (converted with str()).
        labels: Labels aligned one-to-one with `posts`.

    Returns:
        (texts, labels): two equal-length lists containing only the posts
        whose cleaned text is non-empty, with their labels.
    """
    # Escape the punctuation before embedding it in a character class:
    # unescaped, the `\]` pair inside it is read as an escaped bracket and the
    # backslash itself is never removed.
    punct_re = re.compile('[{}]'.format(re.escape(puncs)))
    # Sets make the per-token membership tests constant-time.
    stop_set = set(stopWords)
    dict_set = set(dictWords)

    cleaned = [
        _segment_and_filter(_normalize_post(raw, punct_re), stop_set, dict_set)
        for raw in posts
    ]
    # Filter into new lists; popping from a list while enumerating it skips the
    # element after each removal, so consecutive empty posts used to survive.
    survivors = [(text, label) for text, label in zip(cleaned, labels) if text]
    return [text for text, _ in survivors], [label for _, label in survivors]


train_text, train_labels = preprocess_posts(
    train_data['title'].to_list(), train_data['label'].to_list())
print(train_text[:10])

test_text, test_labels = preprocess_posts(
    test_data['title'].to_list(), test_data['label'].to_list())


['tao xong cầm bút tay_phải', 'đi_ngoài đường trái sống giỏi đấy', 'ma tơi ném cốc mặt đừng trách', 'ảnh tỉ đăng đăng cặc đéo nhàm bất_hạnh vãi', 'kết có_hậu đầu khấc vợ tương_lai kia kìa', 'xàm lồn giáo_dục đéo', 'bệnh bé quyển sổ bảo mày viết xong cầm bút run_run bảo tao viết dòng chả vẽ bậy vào_sổ vứt tiu tờ giấy xé kỉ_niệm dọn giấy_tờ lỗi ông_bà trân_trọng chia xuân nhi', 'thế_giới tồn_tại hai chữ công_bằng sai ta thông_thường vạn phạm sai_lầm hít thở soi_xét để_tâm kẻ sống ngày_mai tươi_đẹp hổ_thẹn hôm_nay', 'đéo mẹ bệnh_viện chỗ gửi xe niêm_yết rõ_ràng kêu_ca', 'tiếc đẹp_trai mờ chim']


In [12]:
# Open (sqlite3 creates the file if it is missing) the databases that receive
# the "dirty" export of the preprocessed splits.
conn_train_dirty = sqlite3.connect("./raw/data_train_dirty.db")
crs_train_dirty = conn_train_dirty.cursor()  # not used by any cell visible in this notebook

conn_test_dirty = sqlite3.connect("./raw/data_test_dirty.db")
crs_test_dirty = conn_test_dirty.cursor()  # not used by any cell visible in this notebook


In [13]:
# Write the preprocessed splits to the *_dirty databases, using the same
# Posts / Emotion columns and `data` table name as the raw inputs; an existing
# table is replaced.
data_train_dirty = pd.DataFrame({'Posts': train_text, 'Emotion': train_labels})
data_test_dirty = pd.DataFrame({'Posts': test_text, 'Emotion': test_labels})
data_train_dirty.to_sql('data', conn_train_dirty, if_exists='replace', index=False)
data_test_dirty.to_sql('data', conn_test_dirty, if_exists='replace', index=False)


537

In [12]:
# Open (sqlite3 creates the file if it is missing) the databases that receive
# the "clean" export of the preprocessed splits.
conn_train_clean = sqlite3.connect("./raw/data_train_clean.db")
crs_train_clean = conn_train_clean.cursor()  # not used by any cell visible in this notebook

conn_test_clean = sqlite3.connect("./raw/data_test_clean.db")
crs_test_clean = conn_test_clean.cursor()  # not used by any cell visible in this notebook


In [13]:
# Write the preprocessed splits to the *_clean databases (Posts / Emotion
# columns, `data` table, replacing any existing table).
# NOTE(review): this reads the same train_text / test_text variables as the
# *_dirty export; presumably the preprocessing cell was changed and re-run
# between the two exports — confirm, and make that step explicit.
data_train_clean = pd.DataFrame({'Posts': train_text, 'Emotion': train_labels})
data_test_clean = pd.DataFrame({'Posts': test_text, 'Emotion': test_labels})
data_train_clean.to_sql('data', conn_train_clean, if_exists='replace', index=False)
data_test_clean.to_sql('data', conn_test_clean, if_exists='replace', index=False)


533