# Imports

In [1]:
!pip install accelerate -U



In [2]:
import numpy as np
import pandas as pd
import pickle
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/amirghavam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/amirghavam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/amirghavam/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
from sklearn.metrics import accuracy_score
from keywords_extractor import KeywordsExtractor
from augmenter import TextAugmenter
import math

In [4]:
KE = KeywordsExtractor(lang='en')

Language: English
Loading word vectors......


In [5]:
data = pd.read_csv('data/newsgroups/newsgroups_1000.csv')
data = data.dropna()
contents = list(data['text'])
labels = [str(label) for label in data['label']]

In [6]:
# extract keywords
kws_dict = KE.global_role_kws_extraction_one_line(contents, labels, output_dir='saved_keywords',name='newsgroups')
kws_dict.keys()

100%|██████████| 1000/1000 [00:00<00:00, 1368.84it/s]
100%|██████████| 2688/2688 [03:57<00:00, 11.33it/s]
100%|██████████| 2839/2839 [03:53<00:00, 12.18it/s]
100%|██████████| 4206/4206 [05:44<00:00, 12.21it/s]
100%|██████████| 2517/2517 [03:32<00:00, 11.84it/s]
100%|██████████| 2637/2637 [03:49<00:00, 11.50it/s]
100%|██████████| 3906/3906 [05:10<00:00, 12.56it/s]
100%|██████████| 3082/3082 [04:19<00:00, 11.85it/s]
100%|██████████| 2384/2384 [03:18<00:00, 12.02it/s]
100%|██████████| 3899/3899 [05:01<00:00, 12.95it/s]
100%|██████████| 2475/2475 [03:25<00:00, 12.04it/s]
100%|██████████| 3363/3363 [04:39<00:00, 12.02it/s]
100%|██████████| 3175/3175 [04:11<00:00, 12.64it/s]
100%|██████████| 2598/2598 [03:22<00:00, 12.81it/s]
100%|██████████| 2779/2779 [03:33<00:00, 13.00it/s]
100%|██████████| 3215/3215 [04:55<00:00, 10.86it/s]
100%|██████████| 3745/3745 [05:18<00:00, 11.78it/s]
100%|██████████| 5172/5172 [06:37<00:00, 13.01it/s]
100%|██████████| 3513/3513 [05:23<00:00, 10.84it/s]
100%|█████

First level keys:  ['rec.sport.hockey', 'rec.sport.baseball', 'talk.politics.mideast', 'comp.sys.ibm.pc.hardware', 'misc.forsale', 'sci.med', 'sci.electronics', 'comp.sys.mac.hardware', 'talk.politics.guns', 'rec.motorcycles', 'rec.autos', 'sci.crypt', 'talk.religion.misc', 'alt.atheism', 'comp.os.ms-windows.misc', 'sci.space', 'talk.politics.misc', 'comp.windows.x', 'comp.graphics', 'soc.religion.christian']
Second level keys:  ['lr', 'ls', 'ccw', 'scw', 'fcw', 'iw']
already saved at saved_keywords/global_kws_dict_newsgroups.pkl


dict_keys(['global_ls', 'global_lr', 'global_roles'])

In [7]:
for key in kws_dict['global_roles']:
    print(f"keywords for \"{key}\":")
    for each in ['ccw','scw','fcw','iw']:
        print(f"{each}: {kws_dict['global_roles'][key][each][:10]}")

keywords for "rec.sport.hockey":
ccw: ['olchowy', 'golchowyalchemychemutorontoca', 'terresterial', 'stplistsca', 'smythe', 'recsporthockey', 'lindros', 'dchhabrastplistsca', 'wfan', 'weedville']
scw: ['a', 'and', 'nntppostinghost', 'twentytwo', 'oneone', 'xnewsreader', 'thirtythree', 'twentyseven', 'thirtyfour', 'twentyeight']
fcw: ['nhl', 'pens', 'playoff', 'flyers', 'leafs', 'penguins', 'islanders', 'norris', 'canadiens', 'maple']
iw: ['i', 'thanx', 'ltd', 'distribution', 'ken', 'w', 'lee', 'na', 'bob', 'jim']
keywords for "rec.sport.baseball":
ccw: ['alomar', 'tedwardcscornelledu', 'threeb', 'slg', 'sandberg', 'carew', 'yount', 'wetteland', 'vbthirtylafibmlafayetteedu', 'vbthirty']
scw: ['a', 'twentytwo', 'thirtythree', 'of', 'fiftysix', 'eightyfour', 'thirtyone', 'thirtysix', 'seventythree', 'eightythree']
fcw: ['pitching', 'cubs', 'phillies', 'rbis', 'dl', 'rbi', 'obp', 'hitters', 'dodgers', 'clemens']
iw: ['k', 'apr', 'email', 'w', 'lee', 'stephen', 'ma', 'x', 'anderson', 'keywor

In [8]:
# Tokenize each sentence and compile into a single list
all_words = []
for sentence in contents:
    tokens = word_tokenize(sentence)
    all_words.extend(tokens)

# Optionally, you can convert the list to a set to remove duplicates
unique_words = set(all_words)

In [9]:
similar_words_dict = {}
for word in unique_words:
    synonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    similar_words_dict[word] = list(synonyms)

In [10]:
similar_words_dict

{'nonjewish': [],
 'gdfive': [],
 'belligerency': ['belligerence', 'hostilities', 'belligerency'],
 'crown': ['crownwork',
  'pate',
  'pennant',
  'peak',
  'crown',
  'poll',
  'top',
  'crest',
  'summit',
  'Crown',
  'jacket',
  'jacket_crown',
  'cap',
  'coronate',
  'tip',
  'diadem',
  'treetop'],
 'wolfson': [],
 'vbthirty': [],
 'monitor': ['admonisher',
  'reminder',
  'Monitor',
  'proctor',
  'supervise',
  'monitor',
  'varan',
  'monitor_lizard',
  'monitoring_device'],
 'interview': ['consultation', 'question', 'audience', 'interview'],
 'schrievers': [],
 'sating': ['fill', 'sate', 'satiate', 'replete'],
 'smkuslcom': [],
 'mercifully': ['mercifully'],
 'fixingpredating': [],
 'implementation': ['carrying_out',
  'effectuation',
  'execution',
  'implementation'],
 'diary': ['journal', 'diary'],
 'transgression': ['evildoing', 'transgression'],
 'feature': ['feature_film',
  'sport',
  'feature_of_speech',
  'feature',
  'feature_article',
  'boast',
  'characteristic

In [11]:
with open('weights/en_similars_dict.pkl', 'wb') as f:
    pickle.dump(similar_words_dict, f)

In [12]:
TA = TextAugmenter(lang='en')

Language: English


# Random

In [13]:
contents[0]

'from rrnpocwruedu robert r novitskey\nsubject cyclone and tempest\narticleid usenetonepskavqtu\nreplyto rrnpocwruedu robert r novitskey\norganization case western reserve university cleveland oh usa\nlines ten\nnntppostinghost thorinscwruedu\n\n\ncould someone please post any info on these systems\n\nthanks\nbob\n \n \nrobert novitskey  pursuing women is similar to banging ones head\nrrnpocwruedu    against a wallwith less opportunity for reward \n \n'

In [14]:
sentence = contents[0]
p = 0.1
print(' '.join(TA.aug_by_deletion(text=sentence,p=p,mode='random')))
print(' '.join(TA.aug_by_replacement(text=sentence,p=p,mode='random')))
print(' '.join(TA.aug_by_insertion(text=sentence,p=p,mode='random')))
print(' '.join(TA.aug_by_swap(text=sentence,p=p,mode='random')))

from rrnpocwruedu robert r novitskey subject cyclone and usenetonepskavqtu replyto rrnpocwruedu r novitskey organization case western cleveland oh usa lines ten thorinscwruedu could someone please post any info on systems thanks bob robert novitskey pursuing women is similar to banging ones head rrnpocwruedu against a wallwith less opportunity for
from rrnpocwruedu robert roentgen novitskey subject cyclone and storm articleid usenetonepskavqtu replyto rrnpocwruedu robert roentgen novitskey organization incase western reserve university President_Cleveland oh usa lines ten nntppostinghost thorinscwruedu could someone please post any info on these systems thanks bob robert novitskey act_on women is similar to banging ones head rrnpocwruedu against a wallwith less opportunity for reward
from rrnpocwruedu robert please r novitskey subject cyclone and tempest articleid usenetonepskavqtu replyto rrnpocwruedu robert r novitskey organization case western reserve university cleveland oh usa lin

# Selective

In [15]:
print(' '.join(TA.aug_by_deletion(text=sentence,p=p,mode='selective',selected_words=['comedy','election'])))
print(' '.join(TA.aug_by_replacement(text=sentence,p=p,mode='selective',selected_words=['comedy','election'])))
print(' '.join(TA.aug_by_insertion(text=sentence,p=p,mode='selective',selected_words=['comedy','election'])))
print(' '.join(TA.aug_by_swap(text=sentence,p=p,mode='selective',selected_words=['comedy','election'])))
print(' '.join(TA.aug_by_selection(text=sentence, selected_words=['comedy','election'])))

from rrnpocwruedu robert r novitskey subject cyclone and tempest articleid usenetonepskavqtu replyto rrnpocwruedu robert r novitskey organization case western reserve university cleveland oh usa lines ten nntppostinghost thorinscwruedu could someone please post any info on these systems thanks bob robert novitskey pursuing women is similar to banging ones head rrnpocwruedu against a wallwith less opportunity for reward
from rrnpocwruedu Robert r novitskey subject cyclone and tempest articleid usenetonepskavqtu replyto rrnpocwruedu Robert r novitskey organization case western reserve university cleveland oh usa lines ten nntppostinghost thorinscwruedu could someone please post any info on these systems thanks bob Robert novitskey pursuing char is similar to banging one head rrnpocwruedu against a wallwith less opportunity for reward
from rrnpocwruedu robert r novitskey subject cyclone and tempest articleid situation usenetonepskavqtu replyto rrnpocwruedu robert r lawsuit novitskey organ

In [16]:
# read saved keywords
name = 'sst2'
global_kws_dict_path = f'saved_keywords/global_kws_dict_{name}.pkl'
with open(global_kws_dict_path, 'rb') as f:
    global_kws_dict = pickle.load(f)

In [17]:
global_kws_dict

{'1': {'lr': ['love',
   'wonderful',
   'style',
   'powerful',
   'wise',
   'whose',
   'ways',
   'touching',
   'solid',
   'power',
   'lot',
   'sweet',
   'summer',
   'satisfying',
   'pleasure',
   'nine',
   'man',
   'imaginative',
   'history',
   'him',
   'fare',
   'culture',
   'considerable',
   'cinema',
   'good',
   'worthy',
   'watchable',
   'warmth',
   'unexpected',
   'surprisingly',
   'stuart',
   'saving',
   'proves',
   'occasionally',
   'occasional',
   'modern',
   'manages',
   'lovely',
   'looks',
   'laughs',
   'inventive',
   'ingenious',
   'horrifying',
   'honest',
   'giving',
   'gentle',
   'find',
   'fantastic',
   'especially',
   'enjoyable',
   'cinematic',
   'chance',
   'capable',
   'brings',
   'atmosphere',
   'amusing',
   'fun',
   'yet',
   'yesterday',
   'yarn',
   'wow',
   'works',
   'wispy',
   'whether',
   'weighty',
   'vivid',
   'vibrant',
   'version',
   'verbal',
   'unique',
   'undeniable',
   'twentyth',
   '

In [18]:
punc_list = [w for w in ',.，。!?！？;；、']

In [19]:
for key in global_kws_dict:
    print(key)

1
0


In [20]:
for key in global_kws_dict:
    print(key)
    kws = global_kws_dict[key]
    print(' '.join(TA.aug_by_deletion(sentence, p, 'selective', print_info=True,
                    selected_words=kws['scw']+kws['fcw']+kws['iw'])))  # except ccw
    print(' '.join(TA.aug_by_replacement(sentence, p, 'selective', print_info=True,
                    selected_words=kws['scw']+kws['fcw']+kws['iw'])))  # except ccw
    print(' '.join(TA.aug_by_insertion(sentence, p, 'selective', print_info=True,
                    selected_words=kws['ccw']+kws['scw']+kws['iw'])))  # except ccw
    print(' '.join(TA.aug_by_selection(sentence, print_info=True,
                        selected_words=kws['ccw']+punc_list)))

1
deletion info: ['from', 'robert', 'case', 'could', 'any']
rrnpocwruedu r novitskey subject cyclone and tempest articleid usenetonepskavqtu replyto rrnpocwruedu robert r novitskey organization western reserve university cleveland oh usa lines ten nntppostinghost thorinscwruedu someone please post info on these systems thanks bob robert novitskey pursuing women is similar to banging ones head rrnpocwruedu against a wallwith less opportunity for reward
replacement info: [('head', 'forefront'), ('a', 'deoxyadenosine_monophosphate'), ('on', 'on'), ('less', 'LE'), ('robert', 'Henry_M._Robert')]
from rrnpocwruedu Henry_M._Robert r novitskey subject cyclone and tempest articleid usenetonepskavqtu replyto rrnpocwruedu Henry_M._Robert r novitskey organization case western reserve university cleveland oh usa lines ten nntppostinghost thorinscwruedu could someone please post any info on these systems thanks bob Henry_M._Robert novitskey pursuing women is similar to banging ones forefront rrnpocw