## data augmentation using gensim and word embedding

In [1]:
import os
import glob
import pickle
from collections import Counter
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import stanfordnlp

from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

from sklearn.model_selection import StratifiedKFold

In [2]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [3]:
nlp = stanfordnlp.Pipeline(lang="id",use_gpu=False)

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': 'C:\\Users\\Andreas Chandra\\stanfordnlp_resources\\id_gsd_models\\id_gsd_tokenizer.pt', 'lang': 'id', 'shorthand': 'id_gsd', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': 'C:\\Users\\Andreas Chandra\\stanfordnlp_resources\\id_gsd_models\\id_gsd_tagger.pt', 'pretrain_path': 'C:\\Users\\Andreas Chandra\\stanfordnlp_resources\\id_gsd_models\\id_gsd.pretrain.pt', 'lang': 'id', 'shorthand': 'id_gsd', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': 'C:\\Users\\Andreas Chandra\\stanfordnlp_resources\\id_gsd_models\\id_gsd_lemmatizer.pt', 'lang': 'id', 'shorthand': 'id_gsd', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': 'C:\\Users\\Andreas Chandra\\stanfordnlp_resources\\id_gsd_models\\id_g

In [4]:
data_path = '../data'

In [5]:
d_cleaned = pickle.load(open(os.path.join(data_path, 'cleaned', 'all_data_cleaned.pkl'), 'rb'))

In [6]:
model = Word2Vec.load(os.path.join(data_path, "support", "idwiki_word2vec_200.model"))

In [7]:
d_train_a = d_cleaned['train_a']
d_train_b = d_cleaned['train_b']

d_dev_a = d_cleaned['dev_a']
d_dev_b = d_cleaned['dev_b']

In [8]:
d_train_master = d_train_a.append(d_train_b)
d_dev_master = d_dev_a.append(d_dev_b)

In [9]:
d_train_master.reset_index(drop=True, inplace=True)
d_dev_master.reset_index(drop=True, inplace=True)

In [10]:
def word_unique(df, label):
    text = " ".join(df[df['LABEL'] == label]['response_2'])
    
    text_stemmed = stemmer.stem(text)

    word_list = set(word_tokenize(text_stemmed))
    
    return word_list

In [11]:
def get_sentences(query, label):
    response_2 = d_train[(d_train.word_list_2.apply(lambda x: query in x)) & (d_train['LABEL'] == label)]["response_2"]
    
    return response_2

In [12]:
def generate_augmented(d_train):
    
    d_train_aug = d_train.copy()

    d_train['word_list_2'] = d_train.response_2.apply(word_tokenize)
    
    d_true = d_train[d_train["LABEL"] == 1]

    d_false = d_train[d_train["LABEL"] == 0]

    true_words = " ".join(d_true.response_2)
    false_words = " ".join(d_false.response_2)

    true_word_freq = Counter(word_tokenize(true_words))

    false_word_freq = Counter(word_tokenize(false_words))
    
    d_word_freq_true = pd.DataFrame(data = {'word': list(true_word_freq.keys()),
                                  'freq': list(true_word_freq.values())})

    d_word_freq_false = pd.DataFrame(data = {'word': list(false_word_freq.keys()),
                                  'freq': list(false_word_freq.values())})
    
    d_word_freq = pd.merge(d_word_freq_true, d_word_freq_false, how = 'outer', on = 'word', suffixes=('_true', '_false'))
    
    d_word_freq_nona = d_word_freq[d_word_freq.isna().sum(axis = 1) == 0]

    d_word_freq_nona['total'] = d_word_freq_nona.freq_true + d_word_freq_nona.freq_false

    d_word_freq_nona['selisih'] = d_word_freq_nona.freq_true - d_word_freq_nona.freq_false

    d_word_freq_nona.sort_values(by=['selisih', 'total'], ascending = False)

    d_word_more_true = d_word_freq_nona[d_word_freq_nona.selisih > 10].sort_values('selisih')
    d_word_more_false = d_word_freq_nona[d_word_freq_nona.selisih < 0].sort_values('selisih')
    
    for index in d_word_more_false.index:
        doc = nlp(d_word_more_false.loc[index, 'word'])
        for sent in doc.sentences:
            for word in sent.words:
                d_word_more_false.loc[index, 'upos'] = word.upos

    for index in d_word_more_true.index:
        doc = nlp(d_word_more_true.loc[index, 'word'])
        for sent in doc.sentences:
            for word in sent.words:
                d_word_more_true.loc[index, 'upos'] = word.upos

    d_word_true_selected = d_word_more_true[d_word_more_true.upos.isin(["NOUN", "VERB", "ADJ"])]

    d_word_false_selected = d_word_more_false[d_word_more_false.upos.isin(["NOUN", "VERB", "ADJ"])]

    d_word_true_selected.shape

    d_word_false_selected.shape
    
    label = 0
    for index in tqdm(d_word_false_selected.index):
        word_ = d_word_false_selected.loc[index, "word"];
        sentences = get_sentences(word_, label)
        if len(sentences):
            for response in sentences:
                try:
                    word_similar_list = model.wv.most_similar(word_)
                except:
                    word_similar_list = []

                if len(word_similar_list):
                    for word, prob in word_similar_list[:5]:
                        d_train_aug = d_train_aug.append({'response_2': response, 'LABEL': label}, ignore_index=True)
        else:
            print("query", word_)
    
    label = 1
    for index in tqdm(d_word_true_selected.index):
        word_ = d_word_true_selected.loc[index, "word"];
        sentences = get_sentences(word_, label)
        if len(sentences):
            for response in sentences:
                try:
                    word_similar_list = model.wv.most_similar(word_)
                except:
                    word_similar_list = []

                if len(word_similar_list):
                    for word, prob in word_similar_list[:5]:
        #                 print("word", word, "word_", word_)
        #                 print(response.replace(word_, word))
                        d_train_aug = d_train_aug.append({'response_2': response, 'LABEL': label}, ignore_index=True)
        else:
            print("query", word_)
    
    return d_train_aug

In [13]:
kf = StratifiedKFold(n_splits=5)

In [17]:
index = 0
for train, test in kf.split(d_train_master.response_2, d_train_master.LABEL):
    print("kfold", index)
    d_train = d_train_master.loc[train, ['RES_ID', 'RESPONSE', 'LABEL', 'word_list', 'response_2']]
    d_test = d_train_master.loc[test, ['RES_ID', 'RESPONSE', 'LABEL', 'word_list', 'response_2']]
    
    d_train.reset_index(drop=True, inplace=True)
    d_test.reset_index(drop=True, inplace=True)
    
    d_train_aug = generate_augmented(d_train)
    
    pickle.dump({'train': d_train_aug, 'test': d_test, 'dev': d_dev_master}, open('../data/cleaned/kfold/all_data_cleaned_augmented_false_kfold_{}.pkl'.format(index), 'wb'))
    
    index += 1

kfold 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████████████████████████████████████████████████████████████████████████████| 53/53 [00:07<00:00,  6.79it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:24<00:00,  1.81s/it]


kfold 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████████████████████████████████████████████████████████████████████████████| 54/54 [00:08<00:00, 10.62it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:23<00:00,  1.89s/it]


kfold 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████████████████████████████████████████████████████████████████████████████| 49/49 [00:07<00:00,  9.36it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:26<00:00,  1.81s/it]


kfold 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████████████████████████████████████████████████████████████████████████████| 44/44 [00:08<00:00,  8.58it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:22<00:00,  1.81s/it]


kfold 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████████████████████████████████████████████████████████████████████████████| 33/33 [00:05<00:00,  6.23it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:22<00:00,  1.85s/it]
