## Install requirements

In [None]:
!pip3 install fairseq nlpaug fastbpe tensorboardX torch transformers sentencepiece simpletransformers nltk gensim librosa matplotlib sacremoses


In [None]:
!wget https://public.vinai.io/PhoBERT_base_fairseq.tar.gz
!tar -xzvf PhoBERT_base_fairseq.tar.gz
!rm -rf PhoBERT_base_fairseq.tar.gz


In [None]:
!pip3 install vncorenlp
!mkdir - p vncorenlp/models/wordsegmenter
!wget https: // raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https: // raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https: // raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/


## Connect to SQLite

In [1]:
import sqlite3
import pandas as pd
from fairseq.models.roberta import RobertaModel
from vncorenlp import VnCoreNLP
from fairseq import options
from fairseq.data.encoders.fastbpe import fastBPE
import numpy as np
import nlpaug.augmenter.word as naw
import re


In [2]:
conn = sqlite3.connect("./raw/data_train.db")
crs = conn.cursor()


In [3]:
crs.execute("SELECT * FROM data")
rows = crs.fetchall()
df = pd.DataFrame(
    rows, columns=['index', 'STT' , 'emotion', 'title', 'content'])


In [4]:
# 0 Chán ghét -> 3 first similiar
# 1 Thích thú -> 1 first similiar
# 2 Buồn bã -> 3 first similiar
# 3 Sợ hãi -> 3 first similiar
# 4 Giận dữ -> 3 first similiar
# 5 Khac -> 3 first similiar


In [5]:
df_gb = df.groupby('emotion')
df_disgust = df_gb.get_group(0)
df_happy = df_gb.get_group(1)
df_sad = df_gb.get_group(2)
df_fear = df_gb.get_group(3)
df_angry = df_gb.get_group(4)
print(df_disgust.shape)
print(df_happy.shape)
print(df_sad.shape)
print(df_fear.shape)
print(df_angry.shape)


(1060, 5)
(1610, 5)
(1510, 5)
(330, 5)
(410, 5)


In [6]:
disgust_contents = list(df_disgust.sample(650)['content'].apply(str))
happy_contents = list(df_happy.sample(100)['content'].apply(str))
sad_contents = list(df_sad.sample(200)['content'].apply(str))
fear_contents = list(df_fear.sample(150)['content'].apply(str))
angry_contents = list(df_angry['content'].apply(str))


## Load model to fairseq

In [7]:
# Load the model in fairseq
phoBERT = RobertaModel.from_pretrained(
    'PhoBERT_base_fairseq/', checkpoint_file='model.pt')
phoBERT.eval()  # disable dropout (or leave in train mode to finetune


2022-11-15 01:24:11 | INFO | fairseq.file_utils | loading archive file PhoBERT_base_fairseq/
2022-11-15 01:24:13 | INFO | fairseq.tasks.masked_lm | dictionary: 64000 types
2022-11-15 01:24:18 | INFO | fairseq.models.roberta.model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 1, 'log_format': 'simple', 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': '', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'qua

RobertaHubInterface(
  (model): RobertaModel(
    (encoder): RobertaEncoder(
      (sentence_encoder): TransformerEncoder(
        (dropout_module): FairseqDropout()
        (embed_tokens): Embedding(64001, 768, padding_idx=1)
        (embed_positions): LearnedPositionalEmbedding(258, 768, padding_idx=1)
        (layernorm_embedding): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (layers): ModuleList(
          (0): TransformerEncoderLayerBase(
            (self_attn): MultiheadAttention(
              (dropout_module): FairseqDropout()
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout_module

## Tokenization

In [8]:
# See more details at: https://github.com/vncorenlp/VnCoreNLP

# Load rdrsegmenter from VnCoreNLP
rdrsegmenter = VnCoreNLP(
    "./vncorenlp/VnCoreNLP-1.1.1.jar", max_heap_size='-Xmx500m', annotators="wseg")


2022-11-15 01:24:21 | INFO | vncorenlp.vncorenlp | Starting server on: http://127.0.0.1:57172
2022-11-15 01:24:21 | INFO | vncorenlp.vncorenlp | Server ID: 51759
2022-11-15 01:24:21 | INFO | vncorenlp.vncorenlp | Waiting until the server is available...
2022-11-15 01:24:26 | INFO | vncorenlp.vncorenlp | The server is now available on: http://127.0.0.1:57172


## Back translation initialization

In [9]:
# back_translation_aug = naw.BackTranslationAug(
#     from_model_name='Helsinki-NLP/opus-mt-vi-en',
#     to_model_name='Helsinki-NLP/opus-mt-en-vi')


In [10]:
# back_translation_aug = naw.BackTranslationAug(
#     from_model_name='Helsinki-NLP/opus-mt-vi-fr',
#     to_model_name='Helsinki-NLP/opus-mt-fr-vi')


In [11]:
back_translation_aug = naw.BackTranslationAug(
    from_model_name='Helsinki-NLP/opus-mt-vi-de',
    to_model_name='Helsinki-NLP/opus-mt-de-vi')


Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/298M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/298M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/836k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/836k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

In [12]:
# back_translation_aug = naw.BackTranslationAug(
#     from_model_name='Helsinki-NLP/opus-mt-vi-ru',
#     to_model_name='Helsinki-NLP/opus-mt-ru-vi')


## Slot-filling

In [13]:
# Khởi tạo Byte Pair Encoding cho PhoBERT

class BPE():
  bpe_codes = 'PhoBERT_base_fairseq/bpe.codes'

args = BPE()
phoBERT.bpe = fastBPE(args)  # Incorporate the BPE encoder into PhoBERT


Loading codes from PhoBERT_base_fairseq/bpe.codes ...
Read 64000 codes from the codes file.


In [None]:
new_disgust_contents = []
for idx, content in enumerate(disgust_contents):
    print("Processing: ", idx,"/",len(disgust_contents))
    try:
        sentences = rdrsegmenter.tokenize(content)
        tmp_paragraph1 = ""
        tmp_paragraph2 = ""
        for sentence in sentences:
            tmp_sentence_arr = []
            for i in range(len(sentence)):
                tmp = sentence[i]
                sentence[i] = " <mask>"
                mask = ' '.join(sentence)
                sentence[i] = tmp
                topk_filled_outputs = phoBERT.fill_mask(mask, topk=10)
                topk_filled_outputs.sort(key=lambda x: x[1], reverse=True)
                for k in range(10):
                    if topk_filled_outputs[k][2] != tmp:
                        tmp_sentence_arr.append(topk_filled_outputs[k])
                        break
            tmp_sentence_arr.sort(key=lambda x: x[1], reverse=True)
            tmp_paragraph1 += tmp_sentence_arr[0][0] + " "
            tmp_paragraph2 += tmp_sentence_arr[1][0] + " "
        if (tmp_paragraph1 != ""):
            new_disgust_contents.append(tmp_paragraph1)
        if (tmp_paragraph2 != ""):
            new_disgust_contents.append(tmp_paragraph2)
    except:
        pass
    
print(len(new_disgust_contents))


In [14]:
new_disgust_contents = []
for idx, content in enumerate(disgust_contents):
    print("Processing: ", idx, "/", len(disgust_contents))
    try:
        backtrans = back_translation_aug.augment(content)
        new_disgust_contents.append(backtrans)
    except:
        pass

print(len(new_disgust_contents))


Processing:  0 / 650
Processing:  1 / 650
Processing:  2 / 650
Processing:  3 / 650
Processing:  4 / 650
Processing:  5 / 650
Processing:  6 / 650
Processing:  7 / 650
Processing:  8 / 650
Processing:  9 / 650
Processing:  10 / 650
Processing:  11 / 650
Processing:  12 / 650
Processing:  13 / 650
Processing:  14 / 650
Processing:  15 / 650
Processing:  16 / 650
Processing:  17 / 650
Processing:  18 / 650
Processing:  19 / 650
Processing:  20 / 650
Processing:  21 / 650
Processing:  22 / 650
Processing:  23 / 650
Processing:  24 / 650
Processing:  25 / 650
Processing:  26 / 650
Processing:  27 / 650
Processing:  28 / 650
Processing:  29 / 650
Processing:  30 / 650
Processing:  31 / 650
Processing:  32 / 650
Processing:  33 / 650
Processing:  34 / 650
Processing:  35 / 650
Processing:  36 / 650
Processing:  37 / 650
Processing:  38 / 650
Processing:  39 / 650
Processing:  40 / 650
Processing:  41 / 650
Processing:  42 / 650
Processing:  43 / 650
Processing:  44 / 650
Processing:  45 / 65

In [None]:
new_happy_contents = []
for idx, content in enumerate(happy_contents):
    print("Processing: ", idx, "/", len(happy_contents))
    try:
        sentences = rdrsegmenter.tokenize(content)
        tmp_paragraph1 = ""
        for sentence in sentences:
            tmp_sentence_arr = []
            for i in range(len(sentence)):
                tmp = sentence[i]
                sentence[i] = " <mask>"
                mask = ' '.join(sentence)
                sentence[i] = tmp
                topk_filled_outputs = phoBERT.fill_mask(mask, topk=10)
                topk_filled_outputs.sort(key=lambda x: x[1], reverse=True)
                for k in range(2):
                    if topk_filled_outputs[k][2] != tmp:
                        tmp_sentence_arr.append(topk_filled_outputs[k])
                        break
            tmp_sentence_arr.sort(key=lambda x: x[1], reverse=True)
            tmp_paragraph1 += tmp_sentence_arr[0][0] + " "
        if (tmp_paragraph1 != ""):
            new_happy_contents.append(tmp_paragraph1)
    except:
        pass

print(len(new_happy_contents))


In [14]:
new_happy_contents = []
for idx, content in enumerate(happy_contents):
    print("Processing: ", idx+300, "/", len(happy_contents))
    try:
        backtrans = back_translation_aug.augment(content)
        new_happy_contents.append(backtrans)
    except:
        pass
    
print(len(new_happy_contents))


Processing:  300 / 100
Processing:  301 / 100
Processing:  302 / 100
Processing:  303 / 100
Processing:  304 / 100
Processing:  305 / 100
Processing:  306 / 100
Processing:  307 / 100
Processing:  308 / 100
Processing:  309 / 100
Processing:  310 / 100
Processing:  311 / 100
Processing:  312 / 100
Processing:  313 / 100
Processing:  314 / 100
Processing:  315 / 100
Processing:  316 / 100
Processing:  317 / 100
Processing:  318 / 100
Processing:  319 / 100
Processing:  320 / 100
Processing:  321 / 100
Processing:  322 / 100
Processing:  323 / 100
Processing:  324 / 100
Processing:  325 / 100
Processing:  326 / 100
Processing:  327 / 100
Processing:  328 / 100
Processing:  329 / 100
Processing:  330 / 100
Processing:  331 / 100
Processing:  332 / 100
Processing:  333 / 100
Processing:  334 / 100
Processing:  335 / 100
Processing:  336 / 100
Processing:  337 / 100
Processing:  338 / 100
Processing:  339 / 100
Processing:  340 / 100
Processing:  341 / 100
Processing:  342 / 100
Processing:

In [None]:
new_sad_contents = []
for idx, content in enumerate(sad_contents):
      print("Processing: ", idx, "/", len(sad_contents))
      try:
         sentences = rdrsegmenter.tokenize(content)
         tmp_paragraph1 = ""
         tmp_paragraph2 = ""
         for sentence in sentences:
               tmp_sentence_arr = []
               for i in range(len(sentence)):
                  tmp = sentence[i]
                  sentence[i] = " <mask>"
                  mask = ' '.join(sentence)
                  sentence[i] = tmp
                  topk_filled_outputs = phoBERT.fill_mask(mask, topk=10)
                  topk_filled_outputs.sort(key=lambda x: x[1], reverse=True)
                  for k in range(2):
                     if topk_filled_outputs[k][2] != tmp:
                           tmp_sentence_arr.append(topk_filled_outputs[k])
                           break
               tmp_sentence_arr.sort(key=lambda x: x[1], reverse=True)
               tmp_paragraph1 += tmp_sentence_arr[0][0] + " "
               tmp_paragraph2 += tmp_sentence_arr[1][0] + " "
         if (tmp_paragraph1 != ""):
               new_sad_contents.append(tmp_paragraph1)
         if (tmp_paragraph2 != ""):
               new_sad_contents.append(tmp_paragraph2)
      except:
         pass
print(len(new_sad_contents))

In [16]:
new_sad_contents = []
for idx, content in enumerate(sad_contents):
   print("Processing: ", idx, "/", len(sad_contents))
   try:
      backtrans = back_translation_aug.augment(content)
      new_sad_contents.append(backtrans)
   except:
      pass

print(len(new_sad_contents))


Processing:  0 / 200
Processing:  1 / 200
Processing:  2 / 200
Processing:  3 / 200
Processing:  4 / 200
Processing:  5 / 200
Processing:  6 / 200
Processing:  7 / 200
Processing:  8 / 200
Processing:  9 / 200
Processing:  10 / 200
Processing:  11 / 200
Processing:  12 / 200
Processing:  13 / 200
Processing:  14 / 200
Processing:  15 / 200
Processing:  16 / 200
Processing:  17 / 200
Processing:  18 / 200
Processing:  19 / 200
Processing:  20 / 200
Processing:  21 / 200
Processing:  22 / 200
Processing:  23 / 200
Processing:  24 / 200
Processing:  25 / 200
Processing:  26 / 200
Processing:  27 / 200
Processing:  28 / 200
Processing:  29 / 200
Processing:  30 / 200
Processing:  31 / 200
Processing:  32 / 200
Processing:  33 / 200
Processing:  34 / 200
Processing:  35 / 200
Processing:  36 / 200
Processing:  37 / 200
Processing:  38 / 200
Processing:  39 / 200
Processing:  40 / 200
Processing:  41 / 200
Processing:  42 / 200
Processing:  43 / 200
Processing:  44 / 200
Processing:  45 / 20

In [14]:
new_fear_contents = []
for idx, content in enumerate(fear_contents):
      print("Processing: ", idx, "/", len(fear_contents))
      try:
         sentences = rdrsegmenter.tokenize(content)
         tmp_paragraph1 = ""
         tmp_paragraph2 = ""
         for sentence in sentences:
               tmp_sentence_arr = []
               for i in range(len(sentence)):
                  tmp = sentence[i]
                  sentence[i] = " <mask>"
                  mask = ' '.join(sentence)
                  sentence[i] = tmp
                  topk_filled_outputs = phoBERT.fill_mask(mask, topk=10)
                  topk_filled_outputs.sort(key=lambda x: x[1], reverse=True)
                  for k in range(2):
                     if topk_filled_outputs[k][2] != tmp:
                           tmp_sentence_arr.append(topk_filled_outputs[k])
                           break
               tmp_sentence_arr.sort(key=lambda x: x[1], reverse=True)
               tmp_paragraph1 += tmp_sentence_arr[0][0] + " "
               tmp_paragraph2 += tmp_sentence_arr[1][0] + " "
         if (tmp_paragraph1 != ""):
               new_fear_contents.append(tmp_paragraph1)
         if (tmp_paragraph2 != ""):
               new_fear_contents.append(tmp_paragraph2)
      except:
         pass
print(len(new_fear_contents))

Processing:  0 / 330
Processing:  1 / 330
Processing:  2 / 330
Processing:  3 / 330
Processing:  4 / 330
Processing:  5 / 330
Processing:  6 / 330
Processing:  7 / 330
Processing:  8 / 330
Processing:  9 / 330
Processing:  10 / 330
Processing:  11 / 330
Processing:  12 / 330
Processing:  13 / 330
Processing:  14 / 330
Processing:  15 / 330
Processing:  16 / 330
Processing:  17 / 330
Processing:  18 / 330
Processing:  19 / 330
Processing:  20 / 330
Processing:  21 / 330
Processing:  22 / 330
Processing:  23 / 330
Processing:  24 / 330
Processing:  25 / 330
Processing:  26 / 330
Processing:  27 / 330
Processing:  28 / 330
Processing:  29 / 330
Processing:  30 / 330
Processing:  31 / 330
Processing:  32 / 330
Processing:  33 / 330
Processing:  34 / 330
Processing:  35 / 330
Processing:  36 / 330
Processing:  37 / 330
Processing:  38 / 330
Processing:  39 / 330
Processing:  40 / 330
Processing:  41 / 330
Processing:  42 / 330
Processing:  43 / 330
Processing:  44 / 330
Processing:  45 / 33

In [14]:
new_fear_contents = []
for idx, content in enumerate(fear_contents):
   print("Processing: ", idx, "/", len(fear_contents))
   try:
      backtrans = back_translation_aug.augment(content)
      new_fear_contents.append(backtrans)
   except:
      pass
print(len(new_fear_contents))


Processing:  0 / 150
Processing:  1 / 150
Processing:  2 / 150
Processing:  3 / 150
Processing:  4 / 150
Processing:  5 / 150
Processing:  6 / 150
Processing:  7 / 150
Processing:  8 / 150
Processing:  9 / 150
Processing:  10 / 150
Processing:  11 / 150
Processing:  12 / 150
Processing:  13 / 150
Processing:  14 / 150
Processing:  15 / 150
Processing:  16 / 150
Processing:  17 / 150
Processing:  18 / 150
Processing:  19 / 150
Processing:  20 / 150
Processing:  21 / 150
Processing:  22 / 150
Processing:  23 / 150
Processing:  24 / 150
Processing:  25 / 150
Processing:  26 / 150
Processing:  27 / 150
Processing:  28 / 150
Processing:  29 / 150
Processing:  30 / 150
Processing:  31 / 150
Processing:  32 / 150
Processing:  33 / 150
Processing:  34 / 150
Processing:  35 / 150
Processing:  36 / 150
Processing:  37 / 150
Processing:  38 / 150
Processing:  39 / 150
Processing:  40 / 150
Processing:  41 / 150
Processing:  42 / 150
Processing:  43 / 150
Processing:  44 / 150
Processing:  45 / 15

In [16]:
new_angry_contents = []
for idx, content in enumerate(angry_contents):
      print("Processing: ", idx, "/", len(angry_contents))
      try:
         sentences = rdrsegmenter.tokenize(content)
         tmp_paragraph1 = ""
         tmp_paragraph2 = ""
         for sentence in sentences:
               tmp_sentence_arr = []
               for i in range(len(sentence)):
                  tmp = sentence[i]
                  sentence[i] = " <mask>"
                  mask = ' '.join(sentence)
                  sentence[i] = tmp
                  topk_filled_outputs = phoBERT.fill_mask(mask, topk=10)
                  topk_filled_outputs.sort(key=lambda x: x[1], reverse=True)
                  for k in range(2):
                     if topk_filled_outputs[k][2] != tmp:
                           tmp_sentence_arr.append(topk_filled_outputs[k])
                           break
               tmp_sentence_arr.sort(key=lambda x: x[1], reverse=True)
               tmp_paragraph1 += tmp_sentence_arr[0][0] + " "
               tmp_paragraph2 += tmp_sentence_arr[1][0] + " "
         if (tmp_paragraph1 != ""):
               new_angry_contents.append(tmp_paragraph1)
         if (tmp_paragraph2 != ""):
               new_angry_contents.append(tmp_paragraph2)
      except:
         pass
print(len(new_angry_contents))

Processing:  0 / 410
Processing:  1 / 410
Processing:  2 / 410
Processing:  3 / 410
Processing:  4 / 410
Processing:  5 / 410
Processing:  6 / 410
Processing:  7 / 410
Processing:  8 / 410
Processing:  9 / 410
Processing:  10 / 410
Processing:  11 / 410
Processing:  12 / 410
Processing:  13 / 410
Processing:  14 / 410
Processing:  15 / 410
Processing:  16 / 410
Processing:  17 / 410
Processing:  18 / 410
Processing:  19 / 410
Processing:  20 / 410
Processing:  21 / 410
Processing:  22 / 410
Processing:  23 / 410
Processing:  24 / 410
Processing:  25 / 410
Processing:  26 / 410
Processing:  27 / 410
Processing:  28 / 410
Processing:  29 / 410
Processing:  30 / 410
Processing:  31 / 410
Processing:  32 / 410
Processing:  33 / 410
Processing:  34 / 410
Processing:  35 / 410
Processing:  36 / 410
Processing:  37 / 410
Processing:  38 / 410
Processing:  39 / 410
Processing:  40 / 410
Processing:  41 / 410
Processing:  42 / 410
Processing:  43 / 410
Processing:  44 / 410
Processing:  45 / 41

In [16]:
new_angry_contents = []
for idx, content in enumerate(angry_contents):
   print("Processing: ", idx, "/", len(angry_contents))
   try:
      backtrans = back_translation_aug.augment(content)
      new_angry_contents.append(backtrans)
   except:
      pass
print(len(new_angry_contents))


Processing:  0 / 410
Processing:  1 / 410
Processing:  2 / 410
Processing:  3 / 410
Processing:  4 / 410
Processing:  5 / 410
Processing:  6 / 410
Processing:  7 / 410
Processing:  8 / 410
Processing:  9 / 410
Processing:  10 / 410
Processing:  11 / 410
Processing:  12 / 410
Processing:  13 / 410
Processing:  14 / 410
Processing:  15 / 410
Processing:  16 / 410
Processing:  17 / 410
Processing:  18 / 410
Processing:  19 / 410
Processing:  20 / 410
Processing:  21 / 410
Processing:  22 / 410
Processing:  23 / 410
Processing:  24 / 410
Processing:  25 / 410
Processing:  26 / 410
Processing:  27 / 410
Processing:  28 / 410
Processing:  29 / 410
Processing:  30 / 410
Processing:  31 / 410
Processing:  32 / 410
Processing:  33 / 410
Processing:  34 / 410
Processing:  35 / 410
Processing:  36 / 410
Processing:  37 / 410
Processing:  38 / 410
Processing:  39 / 410
Processing:  40 / 410
Processing:  41 / 410
Processing:  42 / 410
Processing:  43 / 410
Processing:  44 / 410
Processing:  45 / 41

## Insert data

In [21]:
conn1 = sqlite3.connect("./generate_data/data_train.db")
crs1 = conn1.cursor()
crs.execute("SELECT * FROM data ORDER BY `index` DESC LIMIT 1")
last_index = crs.fetchone()[0]
for idx, content in enumerate(new_disgust_contents):
    content = str(content)
    content = re.sub(' +', ' ', content)
    content = re.sub(' ([.,!?()])', r'\1', content)
    content = content.replace("_", " ")
    content = content.replace("[", " ")
    content = content.replace("]", " ")
    crs1.execute("INSERT INTO data(`index`, `STT`, `Emotion`, `Title`, `Posts`) VALUES(?, ?, ?, ?, ?)",
                 (int(last_index + idx + 1),
                  int(last_index + idx + 1), 0, "", str(content)))
conn1.commit()
crs1.fetchall()


[]

In [15]:
conn1 = sqlite3.connect("./generate_data/data_train.db")
crs1 = conn1.cursor()
crs.execute("SELECT * FROM data ORDER BY `index` DESC LIMIT 1")
last_index = crs.fetchone()[0]
for idx, content in enumerate(new_happy_contents):
   content = str(content)
   content = re.sub(' +', ' ', content)
   content = re.sub(' ([.,!?()])', r'\1', content)
   content = content.replace("_", " ")
   content = content.replace("[", " ")
   content = content.replace("]", " ")
   crs1.execute("INSERT INTO data(`index`, `STT`, `Emotion`, `Title`, `Posts`) VALUES(?, ?, ?, ?, ?)",
                (int(last_index + idx + 1),
                 int(last_index + idx + 1), 1, "", str(content)))
conn1.commit()
crs1.fetchall()


[]

In [17]:
conn1 = sqlite3.connect("./generate_data/data_train.db")
crs1 = conn1.cursor()
crs.execute("SELECT * FROM data ORDER BY `index` DESC LIMIT 1")
last_index = crs.fetchone()[0]
for idx, content in enumerate(new_sad_contents):
   content = str(content)
   content = re.sub(' +', ' ', content)
   content = re.sub(' ([.,!?()])', r'\1', content)
   content = content.replace("_", " ")
   content = content.replace("[", " ")
   content = content.replace("]", " ")
   crs1.execute("INSERT INTO data(`index`, `STT`, `Emotion`, `Title`, `Posts`) VALUES(?, ?, ?, ?, ?)",
                (int(last_index + idx + 1),
                 int(last_index + idx + 1), 2, "", str(content)))
conn1.commit()
crs1.fetchall()


[]

In [15]:
conn1 = sqlite3.connect("./generate_data/data_train.db")
crs1 = conn1.cursor()
crs.execute("SELECT * FROM data ORDER BY `index` DESC LIMIT 1")
last_index = crs.fetchone()[0]
for idx, content in enumerate(new_fear_contents):
   content = str(content)
   content = re.sub(' +', ' ', content)
   content = re.sub(' ([.,!?()])', r'\1', content)
   content = content.replace("_", " ")
   content = content.replace("[", " ")
   content = content.replace("]", " ")
   crs1.execute("INSERT INTO data(`index`, `STT`, `Emotion`, `Title`, `Posts`) VALUES(?, ?, ?, ?, ?)",
                (int(last_index + idx + 1),
                 int(last_index + idx + 1), 3, "", str(content)))
conn1.commit()
crs1.fetchall()


[]

In [17]:
conn1 = sqlite3.connect("./generate_data/data_train.db")
crs1 = conn1.cursor()
crs.execute("SELECT * FROM data ORDER BY `index` DESC LIMIT 1")
last_index = crs.fetchone()[0]
for idx, content in enumerate(new_angry_contents):
   content = str(content)
   content = re.sub(' +', ' ', content)
   content = re.sub(' ([.,!?()])', r'\1', content)
   content = content.replace("_", " ")
   content = content.replace("[", " ")
   content = content.replace("]", " ")
   crs1.execute("INSERT INTO data(`index`, `STT`, `Emotion`, `Title`, `Posts`) VALUES(?, ?, ?, ?, ?)",
                (int(last_index + idx + 1),
                 int(last_index + idx + 1), 4, "", str(content)))
conn1.commit()
crs1.fetchall()


[]

## Ref:

- https://viblo.asia/p/paraphrase-text-tieng-viet-phan-1-LzD5dMBYKjY
- https://github.com/vncorenlp/VnCoreNLP