In [None]:
import torch
import numpy as np
# Release cached-but-unused GPU memory held by PyTorch's caching allocator so
# the nvidia-smi report below is not inflated by blocks freed in earlier runs.
torch.cuda.empty_cache()

# Shell out to nvidia-smi to show the current GPU memory usage.
!nvidia-smi

In [None]:
!pip install -U transformers
!pip install -U torch

In [None]:
# Prefer the GPU when torch can see one; otherwise run everything on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(
    f"GPU available: {torch.cuda.get_device_name(0)}"
    if device.type == "cuda"
    else "GPU not available, using CPU."
)

In [None]:
import pandas as pd

In [None]:
# Google Sheets "edit" link of the dataset (the gid selects the worksheet tab).
sheet_url = 'https://docs.google.com/spreadsheets/d/1IG7APAMDUOBfToE_NDM9uKczkIUNC5__d9BsEK1DMIY/edit#gid=1880462315'
# Rewrite the editor URL into the CSV export endpoint for the same gid.
url_1 = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')
# header=None: the first row is read as data, so the columns are labelled 0 and 1
# (in the recorded output, column 0 is the sentiment label and column 1 the sentence).
df = pd.read_csv(url_1, header=None)
# Last expression of the cell: renders the (truncated) frame.
df

Unnamed: 0,0,1
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


### Paraphrase Augmentation


In [None]:
!pip install sentencepiece
!pip install transformers

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

def generate_paraphrase(text, model, tokenizer, num_return_sequences=1):
    """Sample paraphrase candidates for ``text`` from a T5-style seq2seq model.

    Args:
        text: Sentence to paraphrase. The ``"paraphrase: "`` task prefix is
            prepended here, so callers pass the raw sentence.
        model: Model exposing ``generate`` and ``device``
            (e.g. ``T5ForConditionalGeneration``).
        tokenizer: Tokenizer matching ``model``; must provide ``encode`` and
            ``decode``.
        num_return_sequences: How many candidates to request from ``generate``.

    Returns:
        A list with one decoded string per sequence returned by
        ``model.generate``.
    """
    input_text = "paraphrase: " + text
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    # Follow the model's own device instead of hard-coding 'cuda': the model is
    # loaded with a CPU fallback, so a fixed 'cuda' crashes on GPU-less hosts.
    input_ids = input_ids.to(model.device)
    outputs = model.generate(
        input_ids=input_ids,
        max_length=256,
        # top_k / top_p / temperature only take effect when sampling is on;
        # without do_sample they are ignored and decoding is greedy, which also
        # rejects num_return_sequences > 1.
        do_sample=True,
        num_return_sequences=num_return_sequences,
        top_k=120,
        top_p=0.95,
        temperature=0.9,
    )

    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# Load the stock T5 checkpoint once and move it to the GPU when one is present.
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Try the helper on a single finance-style sentence.
# NOTE(review): the recorded output of this cell is "Paraphrase 1: True", which
# suggests this checkpoint does not treat "paraphrase:" as a known task -
# confirm, and consider a paraphrase-fine-tuned checkpoint.
text = "Stock prices soared after the company announced better-than-expected earnings."
paraphrases = generate_paraphrase(text, model, tokenizer, num_return_sequences=1)

# Candidates are listed with 1-based numbering.
for i, paraphrase in enumerate(paraphrases):
    print(f"Paraphrase {i + 1}: {paraphrase}")

Paraphrase 1: True


### Paraphrase Augmentation (II)

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

def generate_phrases(text, num_phrases=1):
    """Return ``num_phrases`` beam-search candidates for ``"paraphrase: " + text``.

    The ``t5-base`` model and tokenizer are loaded from the hub/cache on every
    call, so this is expensive to call in a loop.

    Args:
        text: Sentence to paraphrase (the task prefix is added here).
        num_phrases: Number of candidate strings to return.

    Returns:
        A list of decoded strings, one per sequence returned by ``generate``.
    """
    model_name = 't5-base'
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)

    model.eval()

    inputs = tokenizer.encode("paraphrase: " + text, return_tensors="pt")
    outputs = model.generate(
        inputs,
        num_return_sequences=num_phrases,
        max_length=256,
        # Beam search can return at most `num_beams` sequences, so widen the
        # beam when more than 10 candidates are requested instead of raising.
        num_beams=max(10, num_phrases),
        # No `temperature` here: it is a sampling-only option and is ignored
        # (with a warning) when decoding with beam search.
    )

    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# Smoke-test generate_phrases on a two-sentence input, asking for one candidate.
text = "I don't like eating chocolate that tastes average, as it is a waste of calories. I apply this to other food too."
paraphrased_phrases = generate_phrases(text, num_phrases=1)

# Print the candidates with 1-based numbering.
# NOTE(review): the recorded output of this cell is "1: False", i.e. not a
# paraphrase of the input - the checkpoint/prompt combination needs revisiting.
for i, phrase in enumerate(paraphrased_phrases):
    print(f"{i+1}: {phrase}")

1: False


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

def paraphrase_sentence(sentence, model_name='t5-base', max_length=315):
    """Return the top beam-search output for ``"paraphrase: <sentence>"``.

    The model and tokenizer for ``model_name`` are loaded on every call.

    NOTE: this notebook defines ``paraphrase_sentence`` again in a later cell
    (default ``model_name='t5-large'``); once that cell runs it replaces this
    definition.

    Args:
        sentence: Sentence to paraphrase (the task prefix is added here).
        model_name: Checkpoint name passed to ``from_pretrained``.
        max_length: Used both as the tokenizer truncation length for the input
            and as the maximum generation length.

    Returns:
        The decoded first sequence returned by ``generate``.
    """
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)

    text = f"paraphrase: {sentence}"
    encoding = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
    outputs = model.generate(
        input_ids=encoding['input_ids'],
        attention_mask=encoding['attention_mask'],
        max_length=max_length,
        num_return_sequences=1,
        num_beams=10,
        # `temperature` was dropped: it only applies when sampling and is
        # ignored (with a warning) under plain beam search.
        no_repeat_ngram_size=2,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Try the helper with its default checkpoint on a short pangram.
# NOTE(review): the recorded output is "Paraphrased: False", i.e. the model did
# not produce a paraphrase for this prompt.
input_sentence = "The quick brown fox jumps over the lazy dog."
paraphrased = paraphrase_sentence(input_sentence)
print(f"Original: {input_sentence}")
print(f"Paraphrased: {paraphrased}")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Original: The quick brown fox jumps over the lazy dog.
Paraphrased: False


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

def paraphrase_sentence(sentence, model_name='t5-large', max_length=315):
    """Run ``"paraphrase: <sentence>"`` through a T5 checkpoint with default decoding.

    Both the model and the tokenizer are loaded inside the call. ``max_length``
    bounds the tokenized input (with truncation) as well as the generated
    output. Only the first generated sequence is decoded and returned.
    """
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)

    batch = tokenizer(
        f"paraphrase: {sentence}",
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    generated = model.generate(
        input_ids=batch['input_ids'],
        attention_mask=batch['attention_mask'],
        max_length=max_length,
    )
    first_candidate = generated[0]
    return tokenizer.decode(first_candidate, skip_special_tokens=True)

# Try the t5-large variant on the first sentence of the dataset.
# NOTE(review): the recorded output is "Paraphrased: Gran" - a single word, not
# a paraphrase of the sentence.
input_sentence = "According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
paraphrased = paraphrase_sentence(input_sentence)
print(f"Original: {input_sentence}")
print(f"Paraphrased: {paraphrased}")

Original: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Paraphrased: Gran


### Synonym Augmentation

In [None]:
import nltk
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('wordnet')

def get_synonyms(word):
    """Collect the lemma names of every WordNet synset of ``word`` into a set.

    The result may contain ``word`` itself and multi-word lemmas joined with
    underscores; it is empty when WordNet has no synset for ``word``.
    """
    return {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }

def replace_with_synonyms(text):
    """Replace each token of ``text`` with a WordNet synonym where one exists.

    For every token produced by ``nltk.word_tokenize`` the candidates from
    ``get_synonyms`` are filtered and one is picked deterministically:

    * candidates equal to the token itself (ignoring case) are discarded, so a
      "replacement" is never a no-op;
    * the alphabetically first remaining candidate is used, so repeated runs
      give the same augmentation (popping from a set returns an arbitrary
      element that can differ between interpreter runs);
    * underscores in multi-word lemmas (e.g. ``take_in``) become spaces.

    Tokens without a usable candidate are kept unchanged.

    Args:
        text: The sentence to augment.

    Returns:
        The augmented tokens joined with single spaces.
    """
    new_text = []
    for word in nltk.word_tokenize(text):
        candidates = sorted(
            synonym
            for synonym in get_synonyms(word)
            if synonym.lower() != word.lower()
        )
        if candidates:
            new_text.append(candidates[0].replace('_', ' '))
        else:
            new_text.append(word)
    return ' '.join(new_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Build the synonym-augmented copy of the dataset: every text is rewritten and
# keeps the label found at the same position.
# NOTE(review): `texts` and `labels` come from a cell that is not part of this
# section - confirm they are defined before this cell runs.
texts_aug = [replace_with_synonyms(texts[i]) for i in range(len(texts))]
labels_aug = [labels[i] for i in range(len(texts))]

In [None]:
# Show one original sample next to one augmented sample.
print('from non-augmentated dataset:')
print(texts[0])
print(labels[0])
# NOTE(review): index 4846 is presumably the first augmented row once the
# augmented arrays have been appended to the originals; no cell above this one
# performs that concatenation, so this relies on execution order - confirm.
print('from augmentated dataset:')
print(texts[4846])
print(labels[4846])

from non-augmentated dataset:
According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
neutral
from augmentated dataset:
grant to gran , the caller take_in no architectural_plan to make_a_motion entirely production to Russia , although that constitute where the caller constitute uprise .
neutral


### Language Augmentation

In [None]:
import torch

from transformers import pipeline

def augment_data(texts, device=0):
    """Back-translate ``texts`` through French (English -> French -> English).

    Both Helsinki-NLP translation pipelines are created on ``device`` before
    any text is translated; each translation call is capped at
    ``max_length=315``.

    Returns:
        A list with the back-translated English string for each pipeline result.
    """
    # Build the two legs of the round trip up front, in en->fr, fr->en order.
    forward, backward = (
        pipeline(task, model=checkpoint, device=device)
        for task, checkpoint in (
            ("translation_en_to_fr", "Helsinki-NLP/opus-mt-en-fr"),
            ("translation_fr_to_en", "Helsinki-NLP/opus-mt-fr-en"),
        )
    )

    # Each pipeline yields dicts; only the "translation_text" field is kept.
    in_french = [item["translation_text"] for item in forward(texts, max_length=315)]
    return [item["translation_text"] for item in backward(in_french, max_length=315)]



# text = "According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
# augmented_text = augment_data(text)
# print(augmented_text)

In [None]:
from transformers import pipeline

def augment_data_batch(text, batch_size=10):
    """Back-translate ``text`` (English -> French -> English) in fixed-size batches.

    Args:
        text: Sliceable collection of English sentences, sliced into chunks of
            ``batch_size`` (presumably a list of str - confirm against callers).
        batch_size: Number of items passed to each pipeline call.

    Returns:
        One string: every back-translated item concatenated in input order with
        no separator.
        NOTE(review): sentence boundaries are lost in this return value, unlike
        ``augment_data`` which returns a list - confirm this is intended.
    """
    # Load the translation pipelines for English to French and French to English
    en_to_fr = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr")
    fr_to_en = pipeline("translation_fr_to_en", model="Helsinki-NLP/opus-mt-fr-en")

    pieces = []
    for start in range(0, len(text), batch_size):
        batch = text[start:start + batch_size]
        # The pipeline returns dicts; the second leg needs the plain strings,
        # not the raw result objects.
        # `padding` is not a pipeline call parameter (it would be forwarded to
        # `generate`), so only `truncation` is passed.
        french_batch = [
            item["translation_text"]
            for item in en_to_fr(batch, max_length=315, truncation=True)
        ]
        backtranslated_batch = fr_to_en(french_batch, max_length=315, truncation=True)
        pieces.extend(item["translation_text"] for item in backtranslated_batch)

    return "".join(pieces)


In [None]:
!pip install sacremoses

In [None]:
import pickle

import torch

# Back-translate the whole training split in a single augment_data call.
# NOTE(review): X_train / t_train come from a cell that is not part of this
# section - confirm they are defined before this cell runs.
texts_aug = augment_data(X_train.tolist())
# Every augmented sentence keeps the label of the sentence it was derived from.
# Without this, labels_aug stays empty and concatenating it onto `labels` later
# leaves texts and labels with different lengths.
# Assumes t_train is an iterable of labels aligned with X_train - TODO confirm.
labels_aug = list(t_train)

# Persist the augmented texts together with the training labels.
with open("/content/drive/MyDrive/CSC413/Project/aug_data.pkl", "wb") as f:
    pickle.dump([texts_aug, t_train], f)

In [None]:
# Append the augmented samples after the originals.
# NOTE(review): this rebinds `texts` / `labels` in place, so re-running the cell
# appends the augmented samples again. `texts` and `labels` are defined in a
# cell that is not part of this section; texts_aug and labels_aug must have the
# same length or texts and labels end up misaligned.
texts = np.concatenate([texts, texts_aug])
labels = np.concatenate([labels, labels_aug])

In [None]:
import numpy as np
import pickle

# Pickle round-trip demo with toy data.
# Create two numpy arrays
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])

# Create a list that contains both arrays
data = [arr1, arr2]

# Dump the list to a pickle file
# NOTE(review): this is the same path the back-translated data is dumped to
# earlier in the notebook, so running this cell overwrites that file with the
# toy arrays - use a different path for the demo.
with open("/content/drive/MyDrive/CSC413/Project/aug_data.pkl", "wb") as f:
    pickle.dump(data, f)


In [None]:
# Load whatever was last dumped to the pickle file.
# NOTE: pickle.load can execute arbitrary code while loading; only open files
# that were written by this notebook.
with open("/content/drive/MyDrive/CSC413/Project/aug_data.pkl", "rb") as f:
    np_array_loaded = pickle.load(f)

print(np_array_loaded) # recorded output: [array([1, 2, 3]), array([4, 5, 6])]

[array([1, 2, 3]), array([4, 5, 6])]


In [None]:
# Show one original sample next to one back-translated sample.
print('from non-augmentated dataset:')
print(texts[1])
print(labels[1])
# NOTE(review): index 4847 presumably addresses the augmented copy of sample 1
# after the augmented arrays were appended to the originals - confirm. If
# labels_aug was empty when the arrays were concatenated, `labels` is shorter
# than `texts` and labels[4847] is out of range.
print('from augmentated dataset:')
print(texts[4847])
print(labels[4847])