In [1]:
#Mounting the drive content that contains the necessary files
#(the maxims file read further below lives under /content/drive/MyDrive)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#Retrieving the pretrained Glove word embeddings which are pretrained on:
#Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, 100d)
#Source: https://nlp.stanford.edu/projects/glove/

!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2022-12-01 21:14:48--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-12-01 21:14:48--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-12-01 21:17:27 (5.19 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [3]:
#importing the necessary modules
import nltk
#NLTK resources needed by the calls used further below:
#stopwords -> stopwords.words('english')
#punkt -> nltk.word_tokenize
#averaged_perceptron_tagger -> nltk.pos_tag
#wordnet (+ omw-1.4) -> wordnet.synsets
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords,wordnet
import string
import numpy as np
import re
import random
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [4]:
#obtaining the list of stop words in english
stop_words = set(stopwords.words('english'))

#storing the word embeddings as a dictionary that maps each word to its float32 vector
embeddings={}
#the context manager closes the file even if a malformed line raises while parsing
with open('/content/glove.6B.100d.txt', 'r',encoding="utf8") as f_glove:
    for line in f_glove:
        entries=line.split()
        #a blank line carries no word and would otherwise fail on entries[0]
        if not entries:
            continue
        #first field is the word, the remaining fields are the vector components
        embeddings[entries[0]]=np.asarray(entries[1:], "float32")

In [5]:
#function that determines the most similar word among the ones collected from the WordNet interface via synonyms() function using the GloVe word embeddings
def mostsimilar(embeddings,word,candidates):
    """Return the candidate whose embedding has the highest cosine similarity to ``word``.

    Parameters
    ----------
    embeddings : dict
        Maps a lower-cased token to its embedding vector (numpy array).
    word : str
        The word that is going to be replaced.
    candidates : list of str
        Candidate replacements. A candidate that contains "_" or "-" is treated
        as a multi-word expression and is scored with the average of the
        embeddings of those of its parts that have an embedding.

    Returns
    -------
    str
        The best scoring candidate with every "_" replaced by a space. When
        ``word`` has no embedding, or none of the candidates can be scored,
        the first element of ``candidates`` is returned unchanged.

    Raises
    ------
    IndexError
        If nothing can be scored and ``candidates`` is empty.
    """
    target=embeddings.get(word)
    best=None
    best_score=None
    #without an embedding for the word itself no candidate can be scored
    if target is not None:
        target_norm=np.linalg.norm(target)
        for element in candidates:
            cand=element.lower()
            if ('_' in cand) or ('-' in cand):
                #multi-word candidate: "_" and "-" are replaced by a space and
                #the embeddings of the parts that are known are averaged
                total=0
                count=0
                for part in cand.replace('_',' ').replace('-',' ').split(' '):
                    vector=embeddings.get(part)
                    if vector is not None:
                        total=total+vector
                        count+=1
                #none of the parts has an embedding: the candidate is skipped
                if count==0:
                    continue
                embedding_cand=total/count
            else:
                embedding_cand=embeddings.get(cand)
                if embedding_cand is None:
                    continue
            score=(np.dot(embedding_cand,target)/np.linalg.norm(embedding_cand))/target_norm
            #a zero vector yields NaN which cannot be ranked
            if np.isnan(score):
                continue
            #">=" keeps the later candidate on ties, like the former
            #"sort ascending, then reverse" selection did for these short lists
            if (best is None) or (score>=best_score):
                best=element
                best_score=score
    #if no candidate could be scored the first candidate given as input is returned
    #(this used to read an undefined/global name "wn" instead of the parameter)
    if best is None:
        return candidates[0]
    return best.replace('_',' ')

In [6]:
#function that collects the candidates that are similar to the input word
def synonyms(wordgiven, ndesired):
    """Collect up to ``ndesired`` WordNet lemma names that may replace ``wordgiven``.

    Parameters
    ----------
    wordgiven : str
        The word for which replacement candidates are looked up.
    ndesired : int
        Maximum number of candidates to return.

    Returns
    -------
    tuple (list of str, int)
        The lower-cased, de-duplicated lemma names (the word itself excluded)
        and a flag that is 1 when at least one candidate was found and 0
        otherwise. Lemmas that were first met in a synset whose part of speech
        is compatible with the POS tag of ``wordgiven`` come first; inside each
        group the WordNet order is kept.
    """
    #WordNet POS codes compatible with the first letter of the tag given by nltk.pos_tag
    #(nouns, adjectives incl. satellite adjectives, adverbs); other tags get no priority
    compatible={'N':('n',),'J':('a','s'),'R':('r',)}
    #obtaning the POS tag of the word in process
    wordtag=nltk.pos_tag([wordgiven])[0][1][0]
    matching=compatible.get(wordtag,())
    target=wordgiven.lower()
    seen=set()
    prioritized=[]
    others=[]
    #collecting the synsets from the WordNet interface
    for synset in wordnet.synsets(wordgiven):
        #the POS is a property of the synset; it is read with pos() instead of being
        #parsed from str(lemma), which breaks for synset names that contain a "."
        bucket=prioritized if synset.pos() in matching else others
        for entry in synset.lemmas():
            name=entry.name().lower()
            if name==target or name in seen:
                continue
            seen.add(name)
            bucket.append(name)
    #prioritized candidates first, the remaining ones afterwards
    wordsfinal=prioritized+others
    check=1 if wordsfinal else 0
    return wordsfinal[0:ndesired],check

In [7]:
#function that cleans the lines from undesired characters as tags,punctuation signs etc.
def cleanline(line):
    """Return a lower-cased copy of ``line`` without markup tags, digits and punctuation.

    Tags of the form <...>, line breaks, typographic quotes and the ellipsis
    sign are dropped. Long dashes, slashes, backslashes, every run of digits
    and every character of string.punctuation are turned into a space.
    """
    #characters that are dropped (None) or turned into a space before the digits are handled
    firstpass=str.maketrans({'\n':None,'“':None,'”':None,'…':None,'‘':None,'’':None,'—':' ','\\':' ','/':' '})
    #every ASCII punctuation sign becomes a single space
    punctuationpass=str.maketrans(dict.fromkeys(string.punctuation,' '))

    withouttags=re.sub(r'(<)(.+?)(>)','',line.lower())
    withoutnumbers=re.sub(r'([0-9]+)',' ',withouttags.translate(firstpass))
    return withoutnumbers.translate(punctuationpass)

In [8]:
#path of the file that contains the maxims to be augmented (one maxim per line)
TRAINSETPATHX='/content/drive/MyDrive/Maximsamples.txt'

#the context manager closes the file even if reading fails
with open(TRAINSETPATHX,'r',encoding='utf8') as f:
    maximbits=f.readlines()
#removing only the endline characters: slicing with [:-1] would cut off the last real
#character of the final maxim whenever the file does not end with a newline
maximbits=[maxim.rstrip('\n') for maxim in maximbits]

print("The number of maxims: {}".format(len(maximbits)))

The number of maxims: 10


In [9]:
#displaying the loaded maxims for a visual check
maximbits

['Lastly the rules which must make up the reference system should be identified according to objective criteria in particular to enable judicial review of the assessments on which that identification is based. It is for the Commission to take into account any factors put forward by the Member State concerned and more generally to carry out its examination in a rigorous and sufficiently reasoned manner in order to enable full judicial review.',
 'Second it is for the Commission when it approves a general aid scheme to take the necessary measures to ensure that the sectoral rules other than the general competition rules will be complied with by the Member State concerned.',
 'Having regard to the foregoing considerations the answer to be given to the first question must be that national measures which provide for a rebate of energy taxes on natural gas and electricity only in the case of undertakings whose activity is shown to consist primarily in the manufacture of goods must be regarde

In [10]:
#breaking the maxim lines into sentences
sentences=[]
nsentence=[] #number of sentences kept for each maxim, in the order of the maxims
for line in maximbits:
    #the character sequence ". " serves as the sentence boundary
    pieces=line.split('. ')
    lastposition=len(pieces)-1
    kept=0
    for position,piece in enumerate(pieces):
        #pieces shorter than 5 characters are not considered as sentences
        if len(piece)<5:
            continue
        #the split consumed the "." of every piece except the last one,
        #therefore it is restored unless the piece still ends with one
        if position!=lastposition and piece[-1]!='.':
            piece=piece+'.'
        sentences.append(piece)
        kept+=1
    nsentence.append(kept)

In [11]:
#total number of sentences kept, followed by the number of maxims they were taken from
print(len(sentences))
print(len(nsentence))

14
10


In [12]:
perfix=[0.6] #percentage desired (only the first entry is used by this cell)
noisesent6=[] #receives exactly one (possibly unchanged) sentence per input sentence
#enumerate provides the progress counter: sentences.index(sent) rescans the list
#on every iteration and reports the wrong position for repeated sentences
for position,sent in enumerate(sentences):
    print(position)
    #tokinizing and capitalizing the sentences
    tokens = nltk.word_tokenize(cleanline(sent).capitalize())
    #lower-casing the tokens
    tokens=[token.lower() for token in tokens]
    #eliminating the stop words
    nonstoptoken = [word for word in tokens if not word in stop_words]
    tags = nltk.pos_tag(nonstoptoken)
    #collecting the tokens with POS tag in categories nouns, adjectives and adverbs
    tochange=[tag for tag in tags if tag[1][0] in ['J','N','R']]
    nchange=[round(per*len(tochange)) for per in perfix] #number of required replacements due to the percentage given
    linemodtotal=sent.lower()
    #indicator serves as a sign of the capitalized sentence
    indicator=sent[0].isupper()
    changed=0
    #the replacement order of the tokens could also be randomized by making the following line uncommented
    #random.shuffle(tochange)
    for word,postag in tochange:
        #stopping as soon as the desired number of words are replaced
        if changed>=nchange[0]:
            break
        #Obtaining the candidates for the replacement
        #(the module-level name "wn" is kept on purpose since other cells may read it)
        [wn,check2]=synonyms(word,10)
        #words without any candidate are skipped
        if not check2:
            continue
        toreplace=mostsimilar(embeddings,word,wn)
        #word is replaced with the candidate accompanying "#" characters as boundaries
        #this is done to prevent the replacement of the in-word and multiple occurences of the words in process
        #the word is escaped and the replacement is given as a function so that
        #neither of them is interpreted as regular expression syntax
        pattern='(?<![a-zA-Z#])'+re.escape(word)+'(?![a-zA-Z#])'
        marked='#'+toreplace+'#'
        linemodtotal,nfound=re.subn(pattern,lambda match: marked,linemodtotal,count=1)
        #only replacements that really took place are counted; a word that cannot be
        #located in the sentence no longer uses up one of the desired replacements
        changed+=nfound
    #the "#" characters are removed at the end; a sentence without any replacement is kept as it is
    toadd=linemodtotal.replace('#','') if changed else linemodtotal
    if (indicator):
        toadd=toadd.capitalize()
    #the sentence is added to the list defined at the beginning, also when fewer
    #replacements than desired (or none at all) were possible
    noisesent6.append(toadd)

0
1
2
3
4
5
6
7
8
9
10
11
12
13


In [13]:
#the path where the document containing the augmented samples will be saved to
FINALPATH="/content/"

noiselist=[noisesent6]
for element in noiselist:
    folder="0.6" #Could also be iterated over the percentages. Here we use only one percentage value therefore it is strictly defined
    #every augmented sentence has to be consumed exactly once when the maxims are rebuilt
    assert sum(nsentence)==len(element), "number of augmented sentences does not match the sentence counts of the maxims"
    final=[]
    start=0
    for n in nsentence:
        #merging the sentences that are contained in the same maxim
        final.append(' '.join(element[start:start+n]))
        start=start+n
    #saving the augmented samples into a text document in the given final path
    #(folder[-1]+'0' turns "0.6" into the "60" suffix of the file name);
    #the context manager closes the file even if a write fails
    with open(FINALPATH+'/maximsamples_wordnetglove'+folder[-1]+'0.txt', 'w',encoding="utf8") as f:
        for line in final:
            f.write(line+'\n')