20k clean - 1k hate - 1k offensive
20k clean - 10k hate - 10k offensive

# Text augmentation using EDA techniques


In [None]:
# Change these arguments to fit your own data / project

class Argument:
    """Settings for the wordnet-based augmentation run in this notebook."""

    # Source file that gen_eda() reads.
    input: str = "/content/data_label_2.csv"
    # File that gen_eda() writes the augmented records to.
    output: str = "/content/drive/MyDrive/new_dataset15.5/augmented_dataset_label_2_num8_0.15_wordnet.txt"
    # Number of augmented sentences requested per source sentence.
    num_aug: int = 8
    # Strength forwarded to every EDA operation (alpha_sr / alpha_ri / alpha_rs / p_rd).
    alpha: float = 0.15


args = Argument()

In [None]:
# Quick sanity check of the synonym lookup; get_synonyms() is defined in a
# later cell, so that cell has to be executed before this one.
words = get_synonyms('mặt trời')
print(words)

['Mặt Trời', 'thái dương', 'Thái dương']


In [None]:
# CODE augmentation: https://github.com/jasonwei20/eda_nlp

import random
from random import shuffle

random.seed(1)
import json


# stop words list
# The file is read as UTF-8 explicitly because the entries are Vietnamese.
# Each line is stripped of all surrounding whitespace (not only "\n") so that
# "\r\n" line endings or trailing blanks cannot produce entries that never
# match a token, and blank lines are dropped so '' is never a stop word.
with open("/content/vietnamese-stopwords.txt", "r", encoding="utf-8") as f:
    stop_words = [entry for entry in (line.strip() for line in f) if entry]

# cleaning up text
import re


def get_only_chars(line):
    """Return ``line`` exactly as received.

    This cleaning hook is a pass-through: no character is removed, replaced
    or lower-cased, so accented letters, punctuation and emoji reach the
    augmentation operations unchanged.
    """
    untouched = line
    return untouched


########################################################################
# Synonym replacement
# Replace n words in the sentence with synonyms from wordnet
########################################################################

# for the first time you use wordnet
# import nltk
# nltk.download('wordnet')
from nltk.corpus import wordnet


def synonym_replacement(words, n):
    """Swap up to ``n`` distinct non-stop-words of ``words`` for a synonym.

    Candidate words are the unique tokens not listed in ``stop_words``, tried
    in shuffled order.  For a candidate that has at least one synonym, every
    occurrence is replaced by one randomly chosen synonym and the replacement
    is echoed with ``print``.  ``words`` itself is left untouched.

    Returns the new token list, re-split on single spaces so that multi-word
    synonyms become separate tokens.
    """
    result = words.copy()

    eligible = [token for token in words if token not in stop_words]
    candidates = list(set(eligible))
    random.shuffle(candidates)

    replaced = 0
    for candidate in candidates:
        options = get_synonyms(candidate)
        if len(options) > 0:
            chosen = random.choice(list(options))
            result = [chosen if token == candidate else token for token in result]
            print("replaced", candidate, "with", chosen)
            replaced += 1
        # the limit is checked after every candidate, replaced or not
        if replaced >= n:
            break

    # a synonym may contain spaces: join and re-split to get one token per word
    return ' '.join(result).split(' ')


# def get_synonyms(word):
# 	synonyms = set()
# 	for syn in wordnet.synsets(word):
# 		for l in syn.lemmas():
# 			synonym = l.name().replace("_", " ").replace("-", " ").lower()
# 			synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
# 			synonyms.add(synonym)
# 	if word in synonyms:
# 		synonyms.remove(word)
# 	return list(synonyms)

# Parsed wordnet files, keyed by path, so each file is read and indexed once.
_WORDNET_INDEX_CACHE = {}


def _load_wordnet_index(wordnet_path):
    """Return ``{stripped headword: set of stripped synonyms}`` for a JSON file.

    The file is parsed on the first request for a given path and the index is
    kept in ``_WORDNET_INDEX_CACHE``; later edits to the file are therefore
    not picked up until the cache entry is removed.
    """
    index = _WORDNET_INDEX_CACHE.get(wordnet_path)
    if index is None:
        with open(wordnet_path, "r", encoding="utf-8") as f:
            raw = json.load(f)
        index = {}
        for key, values in raw.items():
            # several raw keys can collapse to the same stripped headword
            index.setdefault(key.strip(), set()).update(v.strip() for v in values)
        _WORDNET_INDEX_CACHE[wordnet_path] = index
    return index


def get_synonyms(word, wordnet_path="/content/word_net_vi.json"):
    """Return the synonyms of ``word`` listed in the wordnet JSON file.

    Args:
        word: Headword to look up; compared against the whitespace-stripped
            keys of the JSON object.
        wordnet_path: Path of the JSON file mapping each headword to an
            iterable of synonym strings.

    Returns:
        A list (in arbitrary order) of the stripped synonyms, never containing
        ``word`` itself; empty when the word has no entry.

    The file used to be re-read and scanned linearly on every call; it is now
    loaded and indexed once per path.
    """
    synonyms = set(_load_wordnet_index(wordnet_path).get(word, ()))
    synonyms.discard(word)
    return list(synonyms)


########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################

def random_deletion(words, p):
    """Drop each token of ``words`` independently with probability ``p``.

    A one-token input is returned as is.  If every token ends up dropped, a
    single randomly picked token of the input is returned instead, so the
    result is never empty.
    """
    # a lone word is never deleted
    if len(words) == 1:
        return words

    # one uniform draw per token, in input order
    survivors = [token for token in words if random.uniform(0, 1) > p]
    if survivors:
        return survivors

    # everything was deleted: fall back to one random token
    fallback = random.randint(0, len(words) - 1)
    return [words[fallback]]


########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################

def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` calls to ``swap_word``.

    An empty token list is returned unchanged because there is nothing to
    swap.
    """
    shuffled = words.copy()
    for _ in range(n):
        if not shuffled:
            continue
        shuffled = swap_word(shuffled)
    return shuffled


def swap_word(new_words):
    """Swap two randomly chosen, distinct positions of ``new_words`` in place.

    The second position is redrawn while it equals the first.  The fourth
    draw always ends the search without swapping, so the list can come back
    unchanged.  The same list object is returned in every case.
    """
    last = len(new_words) - 1
    first = random.randint(0, last)
    for attempt in range(1, 5):
        second = random.randint(0, last)
        if attempt > 3:
            # give up after the fourth draw, whatever it produced
            return new_words
        if second != first:
            new_words[first], new_words[second] = new_words[second], new_words[first]
            return new_words
    return new_words


########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################

def random_insertion(words, n):
    """Return a copy of ``words`` on which ``add_word`` was applied ``n`` times."""
    grown = words.copy()
    remaining = n
    for _ in range(remaining):
        # add_word mutates the list it is given
        add_word(grown)
    return grown


def add_word(new_words):
    """Insert a synonym of a random token of ``new_words`` into the list.

    Up to 10 random tokens are tried until one has at least one synonym; the
    first synonym returned by ``get_synonyms`` is then inserted in place at a
    random index in ``[0, len(new_words) - 1]``.  Nothing happens when the
    list is empty or no attempt finds a synonym.

    Previously the attempt counter was checked before the insert, so a
    synonym found on the 10th attempt was thrown away; every successful
    attempt is now used.

    Returns:
        None.  ``new_words`` is modified in place.
    """
    if not new_words:
        return

    synonyms = []
    for _ in range(10):
        random_word = new_words[random.randint(0, len(new_words) - 1)]
        synonyms = get_synonyms(random_word)
        if synonyms:
            break
    else:
        # all 10 attempts came back empty
        return

    random_idx = random.randint(0, len(new_words) - 1)
    new_words.insert(random_idx, synonyms[0])


########################################################################
# main data augmentation function
########################################################################

def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    """Build augmented variants of ``sentence`` with the four EDA operations.

    The sentence is split on single spaces (empty tokens dropped).  Synonym
    replacement, random insertion, random swap and random deletion are each
    applied ``int(num_aug / 4) + 1`` times, in that order.  The three
    ``alpha_*`` values are multiplied by the word count (minimum 1) to get the
    number of edits per variant; ``p_rd`` is the per-word deletion
    probability.

    The variants are de-duplicated, shuffled, and then either cut to
    ``num_aug`` items (``num_aug >= 1``) or kept with probability
    ``num_aug / len(variants)`` each.  The input sentence itself is not added
    to the result.  An input without tokens yields an empty list.
    """
    sentence = get_only_chars(sentence)
    words = [token for token in sentence.split(' ') if token != '']
    if len(words) <= 0:
        return []

    num_words = len(words)
    per_technique = int(num_aug / 4) + 1

    # (operation, second argument) in the order sr, ri, rs, rd
    operations = (
        (synonym_replacement, max(1, int(alpha_sr * num_words))),
        (random_insertion, max(1, int(alpha_ri * num_words))),
        (random_swap, max(1, int(alpha_rs * num_words))),
        (random_deletion, p_rd),
    )

    augmented_sentences = []
    for operation, amount in operations:
        for _ in range(per_technique):
            augmented_sentences.append(' '.join(operation(words, amount)))

    augmented_sentences = [get_only_chars(variant) for variant in set(augmented_sentences)]
    shuffle(augmented_sentences)

    # trim so that we have the desired number of augmented sentences
    if num_aug >= 1:
        return augmented_sentences[:num_aug]

    keep_prob = num_aug / len(augmented_sentences)
    return [variant for variant in augmented_sentences if random.uniform(0, 1) < keep_prob]


# the output file: an explicit args.output wins, otherwise "eda_<input name>"
# is created next to the input file
if not args.output:
    from os.path import dirname, basename, join

    output = join(dirname(args.input), 'eda_' + basename(args.input))
else:
    output = args.output

# number of augmented sentences to generate per original sentence (9 when unset)
num_aug = args.num_aug if args.num_aug else 9

# how much to change each sentence (0.1 when unset)
alpha = args.alpha if args.alpha else 0.1


# generate more data with standard augmentation
def gen_eda(train_orig, output_file, alpha, num_aug=9):
    """Augment every record of ``train_orig`` and write the result as CSV.

    Args:
        train_orig: Path of the '|'-delimited input; the first field of each
            record is the sentence and the second its label.  It is parsed
            with the ``csv`` module, so quoted fields (as produced by pandas
            ``to_csv(sep="|")``) are handled.
        output_file: Path of the CSV file to create.  It starts with the
            header ``free_text,label_id`` followed by one
            ``augmented sentence,label`` row per variant.
        alpha: Value forwarded to ``eda`` as ``alpha_sr``, ``alpha_ri``,
            ``alpha_rs`` and ``p_rd``.
        num_aug: Forwarded to ``eda`` as ``num_aug``.

    Rows are written with ``csv.writer`` so that sentences containing commas,
    quotes or line breaks are quoted; concatenating ``sentence + "," + label``
    produced rows with a varying number of fields that CSV readers skip.
    Records are no longer truncated by one character when the last line has
    no trailing newline, and both files are closed even on errors.

    A record that cannot be processed is reported with ``print`` and skipped.
    """
    import csv  # local import: the enclosing cell does not import csv

    with open(train_orig, 'r', encoding='utf-8', newline='') as source:
        records = list(csv.reader(source, delimiter='|'))

    with open(output_file, 'w', encoding='utf-8', newline='') as target:
        writer = csv.writer(target, lineterminator='\n')
        writer.writerow(["free_text", "label_id"])

        for parts in records:
            try:
                sentence = parts[0]
                label = parts[1]
                aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha,
                                    num_aug=num_aug)
            except Exception as e:
                # deliberate best effort: one bad record must not stop the run
                print(e)
                print(parts)
                continue
            writer.writerows((aug_sentence, label) for aug_sentence in aug_sentences)

    print(
        "generated augmented sentences with eda for " + train_orig + " to " + output_file + " with num_aug=" + str(
            num_aug))

## Main code. Run the cell below to generate new texts

In [None]:
# main function. Run this cell to generate new data
if __name__ == "__main__":
    # generate augmented sentences and output into a new file.
    # Pass the resolved `output` path (args.output when set, otherwise the
    # "eda_<input name>" fallback computed earlier) instead of args.output,
    # so an empty args.output no longer reaches open().
    gen_eda(args.input, output, alpha=alpha, num_aug=num_aug)


[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
replaced lề with nghi tiết
replaced thằng with gã
replaced cộng with gộp
replaced lề with nghi thức
replaced cộng with lấy tổng
replaced lề with lề thói
replaced cai with cai rượu
replaced cai with cai rượu
replaced bi with bi thuốc
replaced chua with lên men
replaced quan with quan lại
replaced tham with dã tâm
replaced vay with vay mượn
replaced am with thất
replaced phai with loang ra
replaced y with y khoa
replaced bao with quây quanh
replaced hai with II
replaced da with da động vật
replaced chua with lên men
replaced vay with vay mượn
replaced ong with đàn ong
replaced cai with cai nghiện
replaced hai with đôi
replaced am with thất
replaced tham with hoài bão
replaced quan with quan lại
replaced bao with bao quanh
replaced xoa with láng
replaced bao with bao quanh
replaced da with da động vật
replaced am with thất
replaced moi with cố lấy được
replaced co with rút
replaced tham with dã tâm
replaced chua with b

In [None]:
# Filter data that need to augment

# Filter label 1,2 - train on ViHSD dataset

# Dataset analysis
import pandas as pd

DATA = '/content/train.csv'
DATA_HATE = '/content/data_label_1.csv'

data = pd.read_csv(DATA, index_col=False)

# Keep only the rows whose label_id is 1.  To export another label, change the
# value compared against (and add more frames to the concat to combine labels).
label1 = data[data['label_id'].eq(1)]

data_new = pd.concat([label1])

print(data_new)

# '|'-separated, without header or index column
data_new.to_csv(DATA_HATE, sep="|", index=False, header=False)

                                               free_text  label_id
7                                      Lúp lúp như chó .         1
18     Dạy bơi cho cá. Bơi thì đương nhiên nó bơi đượ...         1
38              Ý thức còn ít hơn cả số tiền trong túi t         1
47                                           xxx video 🔞         1
58                                 Đấu khẩu - Chim lợn 👍         1
...                                                  ...       ...
23990                                              Vcl ạ         1
23993                                     Cái đb gì vậy?         1
23994                                Nhìn gớm và tởm vậy         1
24027  Đéo thấy thằng nào bị tử hình... vì các nghìn ...         1
24029                                  Mất hình tượng vl         1

[1606 rows x 2 columns]


HOW TO GET CC.VI.300

In [None]:
import gdown
import gzip

# Download the compressed fastText vectors (about 1.24 GB)
gdown.download('https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.vec.gz', '/content/cc.vi.300.vec.gz')

# NOTE: nothing is decompressed in this cell.  The bare gzip.open() call that
# used to follow only created a file handle and left it open; the extraction
# to /content/cc.vi.300.vec is done by the gzip -> file copy in the next cell.


Downloading...
From: https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.vec.gz
To: /content/cc.vi.300.vec.gz
100%|██████████| 1.24G/1.24G [00:47<00:00, 26.2MB/s]


<gzip _io.BufferedReader name='/content/cc.vi.300.vec.gz' 0x7b338dff5930>

In [None]:
import gzip

import shutil

# Decompress in chunks: reading the whole archive with f_in.read() would hold
# the entire decompressed vector file in memory before writing it out.
with gzip.open('/content/cc.vi.300.vec.gz', 'rb') as f_in:
    with open('/content/cc.vi.300.vec', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

**Fasttext**

In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.12.0-py3-none-any.whl (234 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4227140 sha256=425bf8eff936b54ca6048291eb2a59ea82f3291a6775d8a0dc717900997cf02b
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.12.0


In [None]:
import gensim
# Load the text-format (.vec) vectors as gensim KeyedVectors.  `model` is the
# object whose most_similar() is queried by get_synonyms() in the next cell.
model = gensim.models.KeyedVectors.load_word2vec_format('/content/cc.vi.300.vec',binary=False)

In [None]:
# TRAIN
# CODE augmentation: https://github.com/jasonwei20/eda_nlp

import random
from random import shuffle

random.seed(1)
import json

# import nltk

# nltk.download('wordnet')

class Argument:
    """Settings for the fastText-based augmentation run in this notebook."""

    # Source file that gen_eda() reads.
    input: str = "/content/data_label_2.csv"
    # File that gen_eda() writes the augmented records to.
    output: str = "/content/drive/MyDrive/new_dataset15.5/augmented_dataset_label_2_num8_0.05_textfast.txt"
    # Number of augmented sentences requested per source sentence.
    num_aug: int = 8
    # Strength forwarded to every EDA operation (alpha_sr / alpha_ri / alpha_rs / p_rd).
    alpha: float = 0.05


args = Argument()

# stop words list
# Left empty in this cell: the loader below is commented out, so
# synonym_replacement() does not exclude any token as a stop word.
stop_words = []
# with open("drive/My Drive/CODE/HSD/vietnamese-stopwords-dash.txt", "r") as f:
#     stop_words = []
#     for line in f:
#         dd = line.strip('\n')
#         stop_words.append(dd)

# cleaning up text
import re


def get_only_chars(line):
    """Return ``line`` exactly as received.

    This cleaning hook is a pass-through: no character is removed, replaced
    or lower-cased, so accented letters, punctuation and emoji reach the
    augmentation operations unchanged.
    """
    untouched = line
    return untouched


########################################################################
# Synonym replacement
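# Replace n words in the sentence with nearest neighbours from the fastText vectors
# (get_synonyms below queries model.most_similar; the wordnet lookups are commented out)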
########################################################################

# for the first time you use wordnet
# import nltk
# nltk.download('wordnet')
from nltk.corpus import wordnet


def synonym_replacement(words, n):
    """Swap up to ``n`` distinct non-stop-words of ``words`` for a synonym.

    Candidate words are the unique tokens not listed in ``stop_words``, tried
    in shuffled order.  For a candidate that has at least one synonym, every
    occurrence is replaced by one randomly chosen synonym and the replacement
    is echoed with ``print``.  ``words`` itself is left untouched.

    Returns the new token list, re-split on single spaces so that multi-word
    synonyms become separate tokens.
    """
    result = words.copy()

    eligible = [token for token in words if token not in stop_words]
    candidates = list(set(eligible))
    random.shuffle(candidates)

    replaced = 0
    for candidate in candidates:
        options = get_synonyms(candidate)
        if len(options) > 0:
            chosen = random.choice(list(options))
            result = [chosen if token == candidate else token for token in result]
            print("replaced", candidate, "with", chosen)
            replaced += 1
        # the limit is checked after every candidate, replaced or not
        if replaced >= n:
            break

    # a synonym may contain spaces: join and re-split to get one token per word
    return ' '.join(result).split(' ')


# def get_synonyms(word):
# 	synonyms = set()
# 	for syn in wordnet.synsets(word):
# 		for l in syn.lemmas():
# 			synonym = l.name().replace("_", " ").replace("-", " ").lower()
# 			synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
# 			synonyms.add(synonym)
# 	if word in synonyms:
# 		synonyms.remove(word)
# 	return list(synonyms)

# def get_synonyms(word):
#     synonyms = set()
#     with open("drive/My Drive/CODE/HSD/word_net_vi.json", "r") as f:
#         wordnet = json.load(f)

#     for key, value in wordnet.items():
#         if key.strip() == word:
#             for v in value:
#                 synonyms.add(v.strip())

#         if word in synonyms:
#             synonyms.remove(word)
#     return list(synonyms)

def get_synonyms(word, vectors=None):
    """Return the tokens most similar to ``word`` according to the vectors.

    Args:
        word: Token to look up.
        vectors: Object exposing ``most_similar(word)`` that returns
            ``(token, score)`` pairs; defaults to the module-level ``model``.

    Returns:
        The neighbour tokens in the order ``most_similar`` returned them, or
        an empty list when ``word`` is not in the vocabulary.

    ``most_similar`` raises ``KeyError`` for out-of-vocabulary tokens (emoji,
    emoticons, words glued to punctuation).  Letting it propagate aborted the
    augmentation of the whole sentence; treating such tokens as having no
    synonyms lets callers move on to another word.
    """
    if vectors is None:
        vectors = model
    try:
        neighbours = vectors.most_similar(word)
    except KeyError:
        return []
    return [token for token, _score in neighbours]


########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################

def random_deletion(words, p):
    """Drop each token of ``words`` independently with probability ``p``.

    A one-token input is returned as is.  If every token ends up dropped, a
    single randomly picked token of the input is returned instead, so the
    result is never empty.
    """
    # a lone word is never deleted
    if len(words) == 1:
        return words

    # one uniform draw per token, in input order
    survivors = [token for token in words if random.uniform(0, 1) > p]
    if survivors:
        return survivors

    # everything was deleted: fall back to one random token
    fallback = random.randint(0, len(words) - 1)
    return [words[fallback]]


########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################

def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` calls to ``swap_word``.

    An empty token list is returned unchanged because there is nothing to
    swap.
    """
    shuffled = words.copy()
    for _ in range(n):
        if not shuffled:
            continue
        shuffled = swap_word(shuffled)
    return shuffled


def swap_word(new_words):
    """Swap two randomly chosen, distinct positions of ``new_words`` in place.

    The second position is redrawn while it equals the first.  The fourth
    draw always ends the search without swapping, so the list can come back
    unchanged.  The same list object is returned in every case.
    """
    last = len(new_words) - 1
    first = random.randint(0, last)
    for attempt in range(1, 5):
        second = random.randint(0, last)
        if attempt > 3:
            # give up after the fourth draw, whatever it produced
            return new_words
        if second != first:
            new_words[first], new_words[second] = new_words[second], new_words[first]
            return new_words
    return new_words


########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################

def random_insertion(words, n):
    """Return a copy of ``words`` on which ``add_word`` was applied ``n`` times."""
    grown = words.copy()
    remaining = n
    for _ in range(remaining):
        # add_word mutates the list it is given
        add_word(grown)
    return grown


def add_word(new_words):
    """Insert a synonym of a random token of ``new_words`` into the list.

    Up to 10 random tokens are tried until one has at least one synonym; the
    first synonym returned by ``get_synonyms`` is then inserted in place at a
    random index in ``[0, len(new_words) - 1]``.  Nothing happens when the
    list is empty or no attempt finds a synonym.

    Previously the attempt counter was checked before the insert, so a
    synonym found on the 10th attempt was thrown away; every successful
    attempt is now used.

    Returns:
        None.  ``new_words`` is modified in place.
    """
    if not new_words:
        return

    synonyms = []
    for _ in range(10):
        random_word = new_words[random.randint(0, len(new_words) - 1)]
        synonyms = get_synonyms(random_word)
        if synonyms:
            break
    else:
        # all 10 attempts came back empty
        return

    random_idx = random.randint(0, len(new_words) - 1)
    new_words.insert(random_idx, synonyms[0])


########################################################################
# main data augmentation function
########################################################################

def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    """Build augmented variants of ``sentence`` with the four EDA operations.

    Args:
        sentence: Text to augment; it is split on single spaces and empty
            tokens are dropped.
        alpha_sr, alpha_ri, alpha_rs: Multiplied by the word count (minimum
            result 1) to get the number of synonym replacements, insertions
            and swaps per variant.
        p_rd: Per-word deletion probability for random deletion.
        num_aug: With ``num_aug >= 1`` the shuffled, de-duplicated variants
            are cut to this many items; otherwise each variant is kept with
            probability ``num_aug / len(variants)``.

    Returns:
        The selected variants followed by the input sentence itself, or an
        empty list when the sentence has no tokens.

    Each operation is applied ``int(num_aug / 4) + 1`` times, in the order
    synonym replacement, random insertion, random swap, random deletion.
    """
    sentence = get_only_chars(sentence)
    # Compare by value: `word is not ''` tested object identity, which only
    # worked through string interning and triggers a SyntaxWarning.
    words = [word for word in sentence.split(' ') if word != '']
    num_words = len(words)

    augmented_sentences = []

    if num_words == 0:
        return augmented_sentences
    num_new_per_technique = int(num_aug / 4) + 1
    n_sr = max(1, int(alpha_sr * num_words))
    n_ri = max(1, int(alpha_ri * num_words))
    n_rs = max(1, int(alpha_rs * num_words))

    # sr
    for _ in range(num_new_per_technique):
        a_words = synonym_replacement(words, n_sr)
        augmented_sentences.append(' '.join(a_words))

    # ri
    for _ in range(num_new_per_technique):
        a_words = random_insertion(words, n_ri)
        augmented_sentences.append(' '.join(a_words))

    # rs
    for _ in range(num_new_per_technique):
        a_words = random_swap(words, n_rs)
        augmented_sentences.append(' '.join(a_words))

    # rd
    for _ in range(num_new_per_technique):
        a_words = random_deletion(words, p_rd)
        augmented_sentences.append(' '.join(a_words))

    augmented_sentences = list(set(augmented_sentences))
    augmented_sentences = [get_only_chars(variant) for variant in augmented_sentences]
    shuffle(augmented_sentences)

    # trim so that we have the desired number of augmented sentences
    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

    # append the original sentence
    augmented_sentences.append(sentence)

    return augmented_sentences


# the output file: an explicit args.output wins, otherwise "eda_<input name>"
# is created next to the input file
if not args.output:
    from os.path import dirname, basename, join

    output = join(dirname(args.input), 'eda_' + basename(args.input))
else:
    output = args.output

# number of augmented sentences to generate per original sentence (9 when unset)
num_aug = args.num_aug if args.num_aug else 9

# how much to change each sentence (0.1 when unset)
alpha = args.alpha if args.alpha else 0.1


# generate more data with standard augmentation
def gen_eda(train_orig, output_file, alpha, num_aug=9):
    """Augment every record of ``train_orig`` and write the result as CSV.

    Args:
        train_orig: Path of the '|'-delimited input; the first field of each
            record is the sentence and the second its label.  It is parsed
            with the ``csv`` module, so quoted fields (as produced by pandas
            ``to_csv(sep="|")``) are handled.
        output_file: Path of the CSV file to create.  It starts with the
            header ``free_text,label_id`` followed by one
            ``augmented sentence,label`` row per variant.
        alpha: Value forwarded to ``eda`` as ``alpha_sr``, ``alpha_ri``,
            ``alpha_rs`` and ``p_rd``.
        num_aug: Forwarded to ``eda`` as ``num_aug``.

    Rows are written with ``csv.writer`` so that sentences containing commas,
    quotes or line breaks are quoted; concatenating ``sentence + "," + label``
    produced rows with a varying number of fields that CSV readers skip.
    Records are no longer truncated by one character when the last line has
    no trailing newline, and both files are closed even on errors.

    A record that cannot be processed is reported with ``print`` and skipped.
    """
    import csv  # local import: the enclosing cell does not import csv

    with open(train_orig, 'r', encoding='utf-8', newline='') as source:
        records = list(csv.reader(source, delimiter='|'))

    with open(output_file, 'w', encoding='utf-8', newline='') as target:
        writer = csv.writer(target, lineterminator='\n')
        writer.writerow(["free_text", "label_id"])

        for parts in records:
            try:
                sentence = parts[0]
                label = parts[1]
                aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha,
                                    num_aug=num_aug)
            except Exception as e:
                # deliberate best effort: one bad record must not stop the run
                print(e)
                print(parts)
                continue
            writer.writerows((aug_sentence, label) for aug_sentence in aug_sentences)

    print(
        "generated augmented sentences with eda for " + train_orig + " to " + output_file + " with num_aug=" + str(
            num_aug))


# main function
if __name__ == "__main__":
    # generate augmented sentences and output into a new file.
    # Pass the resolved `output` path (args.output when set, otherwise the
    # "eda_<input name>" fallback computed earlier) instead of args.output,
    # so an empty args.output no longer reaches open().
    gen_eda(args.input, output, alpha=alpha, num_aug=num_aug)

  words = [word for word in words if word is not '']


[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
replaced già with già
"Key ':))' not present in vocabulary"
['ông già hay xaolon phết Tùng sói said :))', '2']
"Key ':))))' not present in vocabulary"
['Hải Yến Đ care cmt làm gì??? :))) Ô ai hỏi m à? :))))', '2']
replaced vs with Vs
replaced về with vè
replaced Đất with SétLễ
replaced này with nào
replaced đang with muốn
replaced mẹ with bố
replaced bạn with Hãy
replaced ad with Dajla
replaced 400 with 350
"Key 'bài?' not present in vocabulary"
['Dlv bữa nay hoạt động về mặt kinh tế nữa à. Hay đi nhầm bài?', '2']
replaced ông with Ông
replaced ở with cả
replaced coi with như
replaced dao with rựa
replaced ma with qủi
replaced do with ,
replaced thuyết with thuyê
replaced phục with phuc
replaced xl with lq
replaced ủa with ừa
"Key '😂' not present in vocabulary"
['My Trần ủa ụ á 😂', '2']
replaced chủ with Chủ
replaced nhà with toà
replaced chửi with Chửi
replaced xạo with Bậy
replaced Thái with Lan•
replaced Trọng

KeyboardInterrupt: 

# Results and concat with original

In [None]:
 !pip install pandas==1.4

Collecting pandas==1.4
  Downloading pandas-1.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.0.3
    Uninstalling pandas-2.0.3:
      Successfully uninstalled pandas-2.0.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 1.6.0 requires pandas>=1.5.0, but you have pandas 1.4.0 which is incompatible.
cudf-cu12 24.4.1 requires pandas<2.2.2dev0,>=2.0, but you have pandas 1.4.0 which is incompatible.
google-colab 1.0.0 requires pandas==2.0.3, but you have pandas 1.4.0 which is incompatible.
plotnine 0.12.4 requires pandas>=1.5.0, but you have pandas 1.4.0 which is incompatible.[0m[31m
[0mSuccessf

In [None]:
# TRAIN
# concat the original train set with the augmented data

import pandas as pd

DATA = '/content/train.csv'
DATA_AUG = '/content/augmented_d3232ataset(6).txt'

DATA_AUG_FINAL = '/content/train_augme3232nted_dataset(1).csv'

data = pd.read_csv(DATA, index_col=False)
# `error_bad_lines` is deprecated since pandas 1.3 and removed in 2.0.
# on_bad_lines='warn' keeps the same behaviour (malformed rows are skipped
# and reported) without requiring an old pandas release.
data_hate = pd.read_csv(DATA_AUG, index_col=False, on_bad_lines='warn')
#data_hate = data_hate.iloc[: , 1:]
# NOTE(review): keep=False removes every row whose text occurs more than once,
# including the first occurrence - confirm this is intended rather than
# keep='first'.
data_hate.drop_duplicates(subset ="free_text", keep = False, inplace = True)

data_aug = pd.concat([data, data_hate])

data_aug.to_csv(DATA_AUG_FINAL, index=False)

In [None]:
# Extra augmentation
import pandas as pd

DATA = '/content/train.csv'

DATA_AUG = '/content/drive/MyDrive/new_dataset15.5/augmented_dataset_label_1_num8_0.05_wordnet.txt'
DATA_AUG_2 = '/content/drive/MyDrive/new_dataset15.5/augmented_dataset_label_2_num8_0.15_wordnet.txt'

DATA_AUG_FINAL = '/content/train_augmented_dataset_label12_num8_0.15_wordnet.csv'

# `error_bad_lines` is deprecated since pandas 1.3 and removed in 2.0.
# on_bad_lines='warn' keeps the same behaviour (malformed rows are skipped
# and reported) without requiring an old pandas release.
data_hate = pd.read_csv(DATA_AUG, index_col=False, on_bad_lines='warn')
data_hate_2 = pd.read_csv(DATA_AUG_2, index_col=False, on_bad_lines='warn')
data_hate_final = pd.concat([data_hate, data_hate_2])
# NOTE(review): keep=False removes every row whose text occurs more than once,
# including the first occurrence - confirm this is intended rather than
# keep='first'.
data_hate_final.drop_duplicates(subset ="free_text", keep = False, inplace = True)

data = pd.read_csv(DATA, index_col=False)
data_aug = pd.concat([data, data_hate_final])

data_aug.to_csv(DATA_AUG_FINAL, index=False)



  data_hate = pd.read_csv(DATA_AUG, index_col=False, error_bad_lines=False)
b'Skipping line 93: expected 2 fields, saw 6\nSkipping line 94: expected 2 fields, saw 6\nSkipping line 95: expected 2 fields, saw 6\nSkipping line 96: expected 2 fields, saw 6\nSkipping line 97: expected 2 fields, saw 6\nSkipping line 98: expected 2 fields, saw 6\nSkipping line 99: expected 2 fields, saw 5\nSkipping line 100: expected 2 fields, saw 6\nSkipping line 127: expected 2 fields, saw 7\nSkipping line 128: expected 2 fields, saw 7\nSkipping line 129: expected 2 fields, saw 4\nSkipping line 130: expected 2 fields, saw 7\nSkipping line 131: expected 2 fields, saw 7\nSkipping line 132: expected 2 fields, saw 7\nSkipping line 133: expected 2 fields, saw 7\nSkipping line 134: expected 2 fields, saw 7\nSkipping line 187: expected 2 fields, saw 3\nSkipping line 188: expected 2 fields, saw 3\nSkipping line 189: expected 2 fields, saw 3\nSkipping line 190: expected 2 fields, saw 3\nSkipping line 191: expected

In [None]:
import pandas as pd
# Replace the underscores in the text column with spaces and save the result.
test = '/content/merged_file.csv'
DATA = pd.read_csv(test, index_col=False)
# '_' is matched literally; regex=False states that explicitly
DATA['free_text'] = DATA['free_text'].str.replace('_', ' ', regex=False)
DATA.to_csv("/content/final.csv", index=False)