# Sampling and testing the augmentation module

This notebook is designed to verify the data generated for specific augmentation techniques, such as the synonym dictionary used for synonym replacement. 

Additionally, the notebook contains code to test actual usage of functions that will be implemented in the data augmentation module.

## Imports and initializations

In [7]:
# Importing the required libraries and packages
import nltk
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import translators as ts
import translators.server as tss
from IPython.display import clear_output

In [8]:
# Downloading the required NLTK resources and Spacy models
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Loading the synonyms into a pandas dataframe
synonyms = pq.read_table('../data_augmentation_GASPLN/data/synonyms_pt_BR.parquet').to_pandas()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\artur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\artur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\artur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\artur\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Visualizing the data

In [9]:
print('Number of words: ' + str(len(synonyms)))
print('Number of words with each letter of the alphabet:')

alphabet = 'abcdefghijklmnopqrstuvwxyzáàâãéèêíìóòôõúùç'

# Count the number of words that starts with each letter of the alphabet, store the result in a dictionary
letter_count = {}

for letter in alphabet:
    letter_count[letter] = 0
    
for word in synonyms['word']:
    # Set the word to lowercase
    word = word.lower()
    letter_count[word[0]] += 1

# Display the number of words that starts with each letter of the alphabet
for letter in 'abcdefghijklmnopqrstuvwxyz':
    print(letter + ': ' + str(letter_count[letter]))

Number of words: 210977
Number of words with each letter of the alphabet:
a: 31490
b: 5462
c: 21891
d: 23639
e: 27680
f: 7252
g: 3422
h: 1757
i: 11299
j: 830
k: 8
l: 4082
m: 7933
n: 1986
o: 3740
p: 16470
q: 621
r: 16751
s: 10846
t: 7240
u: 1268
v: 4369
w: 3
x: 166
y: 0
z: 539


In [10]:
# Displaying the dataframe
pd.set_option('display.max_colwidth', None)
synonyms

Unnamed: 0,word,synonyms
0,Abade,"[clérigo, confessor, cura, padre, prelado, pároco, sacerdote]"
1,Abadia,"[convento, mosteiro, presbitério, sé, basílica, catedral, igreja, santuário, templo, ádito]"
2,Abalo,"[trepar, concussão, mossa, efervescência, agitação, terremoto, emoção, comoção, choque, estremeção, trepidação, tremor, impulso, balanço, alvoroço, secussão, perturbação]"
3,Abarracamento,"[acampamento, aquartelamento, bivaque]"
4,Abrigada,"[resguardo, refúgio, abrigo, asilo, cobertura, reduto, valhacouto]"
...,...,...
210972,únguis,[úngue]
210973,única,"[uma, inédita]"
210974,único,"[sempar, singular, um, uno, incomparável, ímpar, só, inédito, inconfundível, sui generis, individual]"
210975,únicos,"[sós, uns, individuais, incomparáveis, incomparávéis, inconfundíveis, inéditos, ímpares, singulares]"


In [11]:
# Display the words and synonyms that starts with a specific letter
letter = 'k'

synonyms[synonyms['word'].str.lower().str.startswith(letter)].reset_index(drop=True)

Unnamed: 0,word,synonyms
0,kafkiano,"[absurdo, confuso, surreal]"
1,kaiser,"[soberano, rei, majestade]"
2,kamikaze,"[camicase, suicida]"
3,kardecismo,[espiritismo]
4,kit,"[conjunto, coleção, estojo]"
5,kitsch,"[ridículo, brega, cafona]"
6,kiwi,"[quivi, quiuí]"
7,know-how,"[inaptidão, inexperiência]"


In [12]:
# Displaying the synonyms of a specific word
word = 'regiões'

list(synonyms[synonyms['word'] == word]['synonyms'].values[0])

['nações', 'distritos', 'países', 'pátrias', 'províncias']

## Testing the augmentation techniques

### Synonym replacement

Possíveis melhorias:
- Identificar o tipo de palavra (substantivo, adjetivo, verbo, etc) e aplicar a substituição apenas para palavras do mesmo tipo
- Verificar o gênero da palavra e aplicar a substituição apenas para palavras do mesmo gênero

In [19]:
# Function to perform the data augmentation replacing the words with their synonyms with the following conditions:
# - Do not replace the words that are in the stop words list
# - Replace a specific number of words (percentage of the total number of words in the text), if the number is not specified, replace 50% of the words)

def synonyms_replacement(text, df, percentage=0.5):
    tokens = nltk.word_tokenize(text)
    stop_words = nltk.corpus.stopwords.words('portuguese')
    
    number_of_words = int(len(tokens) * percentage)
    indexes = np.random.choice(len(tokens), number_of_words, replace=False)
    
    for index in indexes:
        word = tokens[index]
        
        if word not in df['word'].values:
            continue
        
        if word in stop_words:
            continue
        
        synonyms = list(df[df['word'] == word]['synonyms'].values[0])
        
        if len(synonyms) == 0:
            continue
        
        synonym_index = np.random.randint(0, len(synonyms))
        tokens[index] = synonyms[synonym_index]
        
    return ' '.join(tokens)

In [22]:
# Testing the synonyms replacement function
sentence = 'Engenharia de dados é uma área que trata da transformação dos dados brutos de uma empresa' #Verificar problema de vírgula em frases

augmented_sentence = synonyms_replacement(sentence, synonyms)

# print the original text and the augmented text
print(sentence)
print(augmented_sentence)

Engenharia de dados é uma área que trata da transformação dos dados brutos de uma empresa
Engenharia de dados é uma área que cuida da mudança dos saberes bravios de uma empresa


### Back translation

In [28]:
# Translate the sentence to another language (English in this example) and then back to Portuguese
first_translation = ts.translate_text(sentence, translator='google', to_language='es')
second_translation = ts.translate_text(first_translation, translator='google', to_language='en')
back_translation = ts.translate_text(second_translation, translator='google', to_language='pt')

print(back_translation)

A engenharia de dados é uma área que lida com a transformação dos dados brutos de uma empresa


## Other tests (still in progress)

In [None]:
# First, you're going to need to import wordnet:
import nltk
from nltk.corpus import wordnet

nltk.download('wordnet')

# Then, we're going to use the term "program" to find synsets like so:
syns = wordnet.synsets("set")

# Print all the synonyms
print(syns)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\artur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[Synset('set.n.01'), Synset('set.n.02'), Synset('set.n.03'), Synset('stage_set.n.01'), Synset('set.n.05'), Synset('bent.n.01'), Synset('set.n.07'), Synset('set.n.08'), Synset('hardening.n.02'), Synset('set.n.10'), Synset('set.n.11'), Synset('set.n.12'), Synset('set.n.13'), Synset('put.v.01'), Synset('determine.v.03'), Synset('specify.v.02'), Synset('set.v.04'), Synset('set.v.05'), Synset('set.v.06'), Synset('fix.v.12'), Synset('set.v.08'), Synset('set.v.09'), Synset('set.v.10'), Synset('arrange.v.06'), Synset('plant.v.01'), Synset('set.v.13'), Synset('jell.v.01'), Synset('typeset.v.01'), Synset('set.v.16'), Synset('set.v.17'), Synset('set.v.18'), Synset('sic.v.01'), Synset('place.v.11'), Synset('rig.v.04'), Synset('set_up.v.04'), Synset('adjust.v.01'), Synset('fructify.v.03'), Synset('dress.v.16'), Synset('fit.s.02'), Synset('fixed.s.02'), Synset('located.s.01'), Synset('laid.s.01'), Synset('set.s.05'), Synset('determined.s.04'), Synset('hardened.s.05')]


In [None]:
# load the parquet file from ../scrapy-sinonimos/synonyms_scraper/synonyms_scraper/synonyms.parquet
df = pq.read_table('../scrapy-sinonimos/synonyms_scraper/synonyms.parquet').to_pandas()

df

FileNotFoundError: ../scrapy-sinonimos/synonyms_scraper/synonyms.parquet