# Imports

In [1]:
!pip install accelerate -U



In [2]:
import numpy as np
import pandas as pd
import pickle
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/amirghavam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/amirghavam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/amirghavam/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
from sklearn.metrics import accuracy_score
from keywords_extractor import KeywordsExtractor
from augmenter import TextAugmenter

In [4]:
KE = KeywordsExtractor(lang='en')

Language: English
Loading word vectors......


In [5]:
data = pd.read_csv('data/sst2/sst2_1000.csv')
data = data.dropna()
contents = list(data['text'])
labels = [str(label) for label in data['label']]

In [6]:
# extract keywords
kws_dict = KE.global_role_kws_extraction_one_line(contents, labels, output_dir='saved_keywords',name='sst2')
kws_dict.keys()

100%|██████████| 1000/1000 [00:00<00:00, 13560.24it/s]
100%|██████████| 1799/1799 [00:09<00:00, 192.01it/s]
100%|██████████| 1666/1666 [00:09<00:00, 179.26it/s]
100%|██████████| 1000/1000 [00:00<00:00, 14754.08it/s]

First level keys:  ['1', '0']
Second level keys:  ['lr', 'ls', 'ccw', 'scw', 'fcw', 'iw']
already saved at saved_keywords/global_kws_dict_sst2.pkl





dict_keys(['global_ls', 'global_lr', 'global_roles'])

In [7]:
for key in kws_dict['global_roles']:
    print(f"keywords for \"{key}\":")
    for each in ['ccw','scw','fcw','iw']:
        print(f"{each}: {kws_dict['global_roles'][key][each][:10]}")

keywords for "1":
ccw: ['nine', 'stuart', 'solid', 'each', 'him', 'twentyth', 'spielberg', 'selfglorification', 'scoobydoo', 'raphael']
scw: ['three', 'third', 'second', 'to', 'nearly', 'midnight', 'the', 'of', 'half', 'than']
fcw: ['wonderful', 'style', 'culture', 'ways', 'ingenious', 'whose', 'occasionally', 'sly', 'manner', 'amusing']
iw: ['clever', 'tastelessness', 'elegantly', 'shrewd', 'ennui', 'director', 'callow', 'audience', 'wit', 'moments']
keywords for "0":
ccw: ['forgettable', 'series', 'lack', 'flat', 'before', 'combined', 'run', 'four', 'despite', 'worst']
scw: ['score', 'half', 'minutes', 'three', 'sandra', 'third', 'i', 'joel', 'pace', 'just']
fcw: ['familiar', 'loud', 'becomes', 'slimed', 'to', 'gags', 'opera', 'sit', 'merit', 'ego']
iw: ['and', 'one', 'ennui', 'humor', 'work', 'most', 'world', 'independent', 'act', 'might']


In [8]:
# Tokenize each sentence and compile into a single list
all_words = []
for sentence in contents:
    tokens = word_tokenize(sentence)
    all_words.extend(tokens)

# Optionally, you can convert the list to a set to remove duplicates
unique_words = set(all_words)

In [9]:
similar_words_dict = {}
for word in unique_words:
    synonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    similar_words_dict[word] = list(synonyms)

In [10]:
similar_words_dict

{'big': ['braggy',
  'big',
  'handsome',
  'crowing',
  'bragging',
  'self-aggrandising',
  'giving',
  'braggart',
  'bighearted',
  'swelled',
  'adult',
  'grown',
  'large',
  'heavy',
  'fully_grown',
  'full-grown',
  'boastfully',
  'self-aggrandizing',
  'vauntingly',
  'bounteous',
  'gravid',
  'great',
  'cock-a-hoop',
  'bountiful',
  'with_child',
  'bad',
  'expectant',
  'vainglorious',
  'boastful',
  'magnanimous',
  'grownup',
  'openhanded',
  'prominent',
  'freehanded',
  'liberal',
  'enceinte'],
 'learn': ['find_out',
  'acquire',
  'learn',
  'get_wind',
  'get_a_line',
  'see',
  'read',
  'larn',
  'memorize',
  'instruct',
  'ascertain',
  'teach',
  'watch',
  'determine',
  'hear',
  'discover',
  'memorise',
  'con',
  'study',
  'check',
  'take',
  'pick_up',
  'get_word'],
 'shared': ['divided_up',
  'share',
  'divvy_up',
  'shared',
  'apportion',
  'partake_in',
  'shared_out',
  'partake',
  'portion_out',
  'divided',
  'deal'],
 'sugarman': [],


In [11]:
with open('weights/en_similars_dict.pkl', 'wb') as f:
    pickle.dump(similar_words_dict, f)

In [12]:
TA = TextAugmenter(lang='en')

Language: English


# Random

In [13]:
contents[0]

'klein  charming in comedies like american pie and deadon in election  '

In [14]:
sentence = contents[0]
p = 0.1
print(' '.join(TA.aug_by_deletion(text=sentence,p=p,mode='random')))
print(' '.join(TA.aug_by_replacement(text=sentence,p=p,mode='random')))
print(' '.join(TA.aug_by_insertion(text=sentence,p=p,mode='random')))
print(' '.join(TA.aug_by_swap(text=sentence,p=p,mode='random')))

klein charming in comedies like american pie and election
Felix_Klein charming in comedies like american pie and deadon in election
klein American charming in comedies like american pie and deadon in election
election charming in comedies like american pie and deadon in klein


# Selective

In [21]:
print(' '.join(TA.aug_by_deletion(text=sentence,p=p,mode='selective',selected_words=['comedy','election'])))
print(' '.join(TA.aug_by_replacement(text=sentence,p=p,mode='selective',selected_words=['comedy','election'])))
print(' '.join(TA.aug_by_insertion(text=sentence,p=p,mode='selective',selected_words=['comedy','election'])))
print(' '.join(TA.aug_by_swap(text=sentence,p=p,mode='selective',selected_words=['comedy','election'])))
print(' '.join(TA.aug_by_selection(text=sentence, selected_words=['comedy','election'])))

klein charming in comedies like american pie and deadon in
klein charming in comedies like american pie and deadon in election
klein charming in comedies like american pie and election deadon in election
klein charming in comedies election american pie and deadon in like
election


In [16]:
# read saved keywords
name = 'sst2'
global_kws_dict_path = f'saved_keywords/global_kws_dict_{name}.pkl'
with open(global_kws_dict_path, 'rb') as f:
    global_kws_dict = pickle.load(f)

In [17]:
global_kws_dict

{'1': {'lr': ['love',
   'wonderful',
   'style',
   'powerful',
   'wise',
   'whose',
   'ways',
   'touching',
   'solid',
   'power',
   'lot',
   'sweet',
   'summer',
   'satisfying',
   'pleasure',
   'nine',
   'man',
   'imaginative',
   'history',
   'him',
   'fare',
   'culture',
   'considerable',
   'cinema',
   'good',
   'worthy',
   'watchable',
   'warmth',
   'unexpected',
   'surprisingly',
   'stuart',
   'saving',
   'proves',
   'occasionally',
   'occasional',
   'modern',
   'manages',
   'lovely',
   'looks',
   'laughs',
   'inventive',
   'ingenious',
   'horrifying',
   'honest',
   'giving',
   'gentle',
   'find',
   'fantastic',
   'especially',
   'enjoyable',
   'cinematic',
   'chance',
   'capable',
   'brings',
   'atmosphere',
   'amusing',
   'fun',
   'yet',
   'yesterday',
   'yarn',
   'wow',
   'works',
   'wispy',
   'whether',
   'weighty',
   'vivid',
   'vibrant',
   'version',
   'verbal',
   'unique',
   'undeniable',
   'twentyth',
   '

In [18]:
punc_list = [w for w in ',.，。!?！？;；、']

In [19]:
for key in global_kws_dict:
    print(key)

1
0


In [20]:
for key in global_kws_dict:
    print(key)
    kws = global_kws_dict[key]
    print(' '.join(TA.aug_by_deletion(sentence, p, 'selective', print_info=True,
                    selected_words=kws['scw']+kws['fcw']+kws['iw'])))  # except ccw
    print(' '.join(TA.aug_by_replacement(sentence, p, 'selective', print_info=True,
                    selected_words=kws['scw']+kws['fcw']+kws['iw'])))  # except ccw
    print(' '.join(TA.aug_by_insertion(sentence, p, 'selective', print_info=True,
                    selected_words=kws['ccw']+kws['scw']+kws['iw'])))  # except ccw
    print(' '.join(TA.aug_by_selection(sentence, print_info=True,
                        selected_words=kws['ccw']+punc_list)))

1
deletion info: ['charming']
klein in comedies like american pie and deadon in election
replacement info: [('charming', 'wizardly')]
klein wizardly in comedies like american pie and deadon in election
insertion info: [('pie', 'PIE')]
klein charming in comedies like american pie and deadon in PIE election
selection info: pie
selection info: and
selection info: deadon
pie and deadon
0
deletion info: ['in']
klein charming comedies like american pie and deadon in election
replacement info: [('american', 'American')]
klein charming in comedies like American pie and deadon in election
insertion info: [('in', 'inch')]
klein inch charming in comedies like american pie and deadon in election

