# NLP Augmentation

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer
from tqdm.auto import tqdm
from gensim.test.utils import common_texts
import gensim
import re
from tqdm import tqdm

sys.path.insert(0, os.path.abspath(os.path.join('..')))
import nlp.sources.bigquery as nlpbq
import nlp.sources.data_sources as nlpcsv
import nlp.integration as nlpint
import nlp.augmentation as nlpaugmentation
from nlp.pt.dataset import TextDataset, DataCollator

In [2]:
# Three example utterances; the augmenter cells in this notebook call
# `.augment(sentences)` on this list.
# NOTE(review): the misspelling 'expalin' is part of the sample text --
# presumably deliberate noisy input; do not correct it without checking.
sentences = [
    "Please could you explain my bill to me",
    "hi can you expalin what these extra charges are on my bill please £241.73 ???",
    "Hi I was just chatting with Saif but got cut off is he available?",
]

# Back-Translation

In [None]:
# Build a BackTranslation augmenter configured with language='de' and run it
# on the sample sentences; the result of augment() is the cell's output.
# `back_translator` is also passed to the OneOf and Sequence cells.
# NOTE(review): what 'de' selects (presumably German as the pivot language)
# and what augment() returns are defined in nlp.augmentation, which is not
# visible here -- confirm against that module.
back_translator = nlpaugmentation.BackTranslation(language='de')
back_translator.augment(sentences)

# Synonym Replacement

In [None]:
%%time
data = np.concatenate([pd.read_csv('input/data_triage.csv')['text'].values, pd.read_csv('input/data_upgrade.csv')['text'].values])
data = [ [x for x in re.sub( r'[\d]*','',re.sub(r'[,!?;-]+', '', str(s).lower().replace('.',''))).split(' ') if x!=''] for s in data]
model = gensim.models.Word2Vec(sentences=data, vector_size=300, window=10, min_count=1, workers=4)
model.wv.save_word2vec_format("input/word2vec.bin")

In [None]:
# Build a SynonymReplacement augmenter from the embeddings stored at
# 'input/word2vec.bin' -- the same path the Word2Vec training cell writes via
# save_word2vec_format -- and run it on the sample sentences.
# `synonyms` is also passed to the OneOf and Sequence cells.
# NOTE(review): how the file is parsed for model_type='word2vec' is decided
# inside nlp.augmentation (not visible here) -- confirm it matches the format
# the training cell saves.
synonyms = nlpaugmentation.SynonymReplacement(embeddings_path='input/word2vec.bin', model_type='word2vec')
synonyms.augment(sentences)

# Synonym Replacement Fast

In [3]:
%%time
synonyms_fast = nlpaugmentation.SynonymReplacementFast(data= np.concatenate([pd.read_csv('input/data_triage.csv')['text'].values, pd.read_csv('input/data_upgrade.csv')['text'].values]),
                                                       device='cpu', 
                                                       top_k=4, 
                                                       size=300,
                                                       aug_p=0.5)

Training a W2V model on 496279 texts
Wall time: 57.5 s


In [5]:
%%time
# Timing check: call synonyms_fast.augment() 100 times; the return value is
# discarded, only the elapsed time reported by %%time / tqdm matters.
for i in tqdm(range(100)):
    # Assuming `sentences` is still the 3-item list defined near the top of the
    # notebook, `* 10` is list repetition and yields a 30-item batch.
    # NOTE(review): the batch is rebuilt on every iteration; whether augment()
    # mutates its argument is not visible here, so do not hoist this line
    # without checking.
    batch = sentences * 10
    synonyms_fast.augment(batch)

100%|████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:23<00:00,  4.34it/s]

Wall time: 23.1 s





# One Of

In [None]:
# Combine the back-translation and synonym augmenters with OneOf and run the
# combination on the sample sentences. Requires `back_translator` and
# `synonyms` from earlier cells.
# NOTE(review): two augmenters are passed but the second argument holds a
# single value (0.75). How OneOf interprets it (e.g. whether the remaining
# weight is implied) is defined in nlp.augmentation -- confirm it is intended.
oneof = nlpaugmentation.OneOf([back_translator, synonyms], [0.75])
oneof.augment(sentences)

# Sequence

In [None]:
# Wrap the back-translation and synonym augmenters in a Sequence (listed in
# that order) and run it on the sample sentences. Requires `back_translator`
# and `synonyms` from earlier cells.
# NOTE(review): presumably the augmenters are applied one after another in
# list order -- confirm in nlp.augmentation.
seq = nlpaugmentation.Sequence([back_translator, synonyms])
seq.augment(sentences)

# Augmenter within DataLoader

In [None]:
# NOTE(review): fetch_20newsgroups is not referenced in any visible cell, and
# this import sits outside the notebook's top import cell -- confirm it is
# still needed.
from sklearn.datasets import fetch_20newsgroups
# Rebinds `sentences`: it was a list of three hand-written strings and is now
# a NumPy array (`.values`) holding at most the first four entries of the
# triage CSV's 'text' column. Cells that use `sentences` behave differently
# depending on whether they run before or after this one.
sentences = pd.read_csv('input/data_triage.csv')['text'].values[:4]
# Displayed as the cell output: number of texts loaded.
len(sentences)

In [None]:
# Tokenizer handed to the collator below.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# Dataset over the current `sentences`; the labels are an all-zero array of
# the same length.
dataset = TextDataset(
    data=np.array(sentences),
    labels=np.zeros(len(sentences)),
    class_weights='auto',
    device='cpu',
    only_labelled=True,
)

# Collator that receives the tokenizer and the `synonyms_fast` augmenter.
# NOTE(review): presumably the augmenter is applied to each batch at collation
# time -- confirm in nlp.pt.dataset.DataCollator.
collate_fn = DataCollator(
    tokenizer,
    nlp=None,
    tag2id=None,
    ner=False,
    max_length=40,
    augmenter=synonyms_fast,
)

# Unshuffled loader: batches of up to 16 items, collated in 4 worker processes.
train_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=16,
    collate_fn=collate_fn,
    num_workers=4,
    shuffle=False,
)

In [None]:
# Pull every batch through the loader (and therefore through collate_fn).
# The loop body is intentionally empty: after the loop, `x` remains bound to
# the last batch, which the next cell decodes. If the loader yields nothing,
# `x` is never assigned.
for x in tqdm(train_loader):
    pass

In [None]:
# Decode the token ids stored under x['x']['input_ids'] back into text,
# dropping special tokens. `x` is the batch left over from the loader loop.
tokenizer.batch_decode(x['x']['input_ids'], skip_special_tokens=True)

In [None]:
# Display the current value of `sentences` -- presumably for side-by-side
# comparison with the decoded batch shown by the previous cell.
sentences