In [None]:
# installing necessary libraries to perform this task
!pip install torch torchvision transformers rasa==1.7.0 input_reader

In [None]:
# importing libraries
import numpy as np 
import pandas as pd
from sklearn import preprocessing
import ipywidgets as widgets
import requests, os
from IPython.display import display
from ipywidgets import interact

from rasa.nlu.training_data import TrainingData,Message

In [None]:
### Download model

def download_file_from_google_drive(id, destination):
    """Download a Google Drive file by its file id and save it to ``destination``.

    A first request is sent to the Drive download endpoint. If that response
    carries a ``download_warning`` cookie (see ``get_confirm_token``), the
    request is repeated with the cookie's value as the ``confirm`` parameter.
    The body of the final response is streamed to disk by
    ``save_response_content``.

    Args:
        id: Google Drive file id. The name shadows the ``id`` builtin but is
            kept so existing keyword callers keep working.
        destination: Path of the local file to write.

    Raises:
        requests.HTTPError: If the final response has a 4xx/5xx status, so an
            error page is never written to ``destination`` as if it were the
            requested file.
    """
    URL = "https://docs.google.com/uc?export=download"

    # The session is used as a context manager so its pooled connections are
    # released even when the download fails part-way.
    with requests.Session() as session:
        response = session.get(URL, params={'id': id}, stream=True)
        token = get_confirm_token(response)

        if token:
            params = {'id': id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)

        # Fail loudly on HTTP errors instead of saving the error body.
        response.raise_for_status()

        # The body is streamed, so it must be consumed while the session is open.
        save_response_content(response, destination)

def get_confirm_token(response):
    """Return the value of the first ``download_warning*`` cookie on ``response``.

    Args:
        response: Object exposing a ``cookies`` mapping with an ``items()`` method.

    Returns:
        The value of the first cookie whose name starts with
        ``download_warning``, or ``None`` when there is no such cookie.
    """
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(matching_values, None)

def save_response_content(response, destination):
    """Write the body of ``response`` to ``destination`` in binary mode.

    The body is read through ``response.iter_content`` in chunks of 32768
    bytes; empty chunks (keep-alive chunks) are skipped.

    Args:
        response: Object exposing ``iter_content(chunk_size)``.
        destination: Path of the file to create or overwrite.
    """
    chunk_bytes = 32 * 1024

    with open(destination, "wb") as out_file:
        # filter(None, ...) drops the falsy (empty) keep-alive chunks.
        out_file.writelines(filter(None, response.iter_content(chunk_bytes)))

model_class_file_id = '1N1kn2b7i2ND7eNefzyJM-k13IM8tqZvr'
checkpoint_file_id = '1G0nwXlvzGsb8Ar-OAnYBQKFvY97WMzBy'
model_class_destination = 'model.py'
checkpoint_destination = 'model.zip'
checkpoint_unzipped_destination = 'package_models'

if not os.path.exists(checkpoint_unzipped_destination):
    download_file_from_google_drive(checkpoint_file_id, checkpoint_destination)
    !unzip {checkpoint_destination}

if not os.path.exists(model_class_destination):
    download_file_from_google_drive(model_class_file_id, model_class_destination)

In [None]:
# loading model
# ParaphraseModel is imported here rather than in the imports cell because
# model.py is only fetched by the download cell above, so this import needs
# that cell to have run first.
from model import ParaphraseModel
# Checkpoint path under package_models/ -- presumably created by unzipping
# model.zip in the download cell; TODO confirm the archive layout.
model_path = 'package_models/lm_finetune_8/checkpoint-56000/'

# NOTE(review): complete_td is not referenced again in the visible cells --
# confirm it is still needed.
complete_td = TrainingData()
model = ParaphraseModel(model_path)


In [None]:
# Read the Reddit dataset and discard every row that contains a missing value.
train_df = pd.read_csv('../input/data-divide/reddit_data1.csv').dropna()

# Fit a LabelEncoder on the "flair" column and store the encoded integer for
# each row in a new "label" column.
le = preprocessing.LabelEncoder()
train_df = train_df.assign(
    label=le.fit(train_df["flair"]).transform(train_df["flair"])
)
train_df.head()

In [None]:
# Accumulator for the augmented dataset: parallel lists of texts and their labels.
result_dict = dict(text=[], label=[])

In [None]:
# Label-dependent augmentation scheme: paraphrases are not generated for every
# sentence. Labels that make up a very low percentage of the data are augmented
# aggressively, while better-represented labels are augmented only occasionally:
#   labels 4 and 6 -> always add two paraphrases
#   label 1        -> add two paraphrases with probability 0.84
#   label 2        -> add one paraphrase with probability 0.6
#   label 0        -> add one paraphrase with probability 0.18
# The original text is always kept for these labels. Every text appended to
# result_dict["text"] gets exactly one matching entry in result_dict["label"],
# so both lists always have the same length (building the DataFrame requires it).
#
# NOTE(review): rows with any other label are skipped entirely and never reach
# result_dict -- confirm that dropping them (rather than keeping them
# un-augmented) is intended.
# NOTE(review): indexing paraphrases[1] / paraphrases[4] assumes
# model.get_paraphrases(text, 5, "") returns at least five candidates -- confirm.
ALWAYS_AUGMENT_LABELS = (4, 6)
# label -> (probability of augmenting, indices of the paraphrases to keep)
RANDOM_AUGMENT_PLAN = {
    1: (0.84, (1, 4)),
    2: (0.6, (1,)),
    0: (0.18, (1,)),
}

for _, row in train_df.iterrows():
    label = row["label"]
    original = row["dirty_text"]

    if label in ALWAYS_AUGMENT_LABELS:
        paraphrases = model.get_paraphrases(original, 5, "")
        texts = [paraphrases[1], original, paraphrases[4]]
    elif label in RANDOM_AUGMENT_PLAN:
        probability, keep_indices = RANDOM_AUGMENT_PLAN[label]
        texts = []
        if np.random.random() <= probability:
            paraphrases = model.get_paraphrases(original, 5, "")
            texts.extend(paraphrases[k] for k in keep_indices)
        # The original sentence is kept whether or not paraphrases were added.
        texts.append(original)
    else:
        continue

    # One label per text keeps the two lists aligned.
    result_dict["text"].extend(texts)
    result_dict["label"].extend([label] * len(texts))
            


In [None]:
# Turn the accumulated text/label lists into a DataFrame and write it to CSV
# without the index column.
df = pd.DataFrame(result_dict)
df.to_csv("augmented_data1.csv", index=False)