In [123]:
import numpy as np 
import pandas as pd 
import os
import random
import string
import copy

In [124]:
def text_preprocessing(excerpt):
    """Normalise an excerpt by lower-casing it and deleting ASCII punctuation.

    Every character listed in ``string.punctuation`` is removed outright (it is
    not replaced by a space, so ``"snow-white"`` becomes ``"snowwhite"``). All
    other characters, including whitespace and newlines, are kept as they are.

    Args:
        excerpt: The text to normalise.

    Returns:
        The lower-cased text without punctuation characters.
    """
    # Mapping a code point to None tells str.translate to delete that character.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    lowered = excerpt.lower()
    return lowered.translate(drop_punctuation)

In [125]:
# Load the training CSV into `df` and preview its first rows.
# The path is relative to the notebook's working directory.
df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [126]:
from nltk.corpus import stopwords
# Copy NLTK's English stop-word list into a plain Python list; words in this
# list are never picked as candidates for synonym replacement.
# NOTE(review): the excerpts have their punctuation stripped before they are
# matched against this list, so any stop word that contains an apostrophe will
# no longer match its stripped form - confirm whether that matters here.
stop_words = list(stopwords.words('english'))

In [127]:
# Make sure the WordNet corpus is available locally (the download call reports
# "already up-to-date" when it is), then expose the corpus reader that
# get_synonyms() queries for synsets.
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet 

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [128]:
def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` as cleaned, lower-case strings.

    Every lemma of every synset that WordNet returns for ``word`` is
    normalised: underscores and hyphens become spaces, the text is lower-cased,
    characters other than ``a``-``z`` and the space are dropped, and runs of
    spaces are collapsed.

    Args:
        word: The word to look up.

    Returns:
        A sorted list of distinct synonyms. ``word`` itself (in its given and
        lower-cased form) and candidates that end up empty after cleaning are
        never included. The list is empty when nothing usable is found.
    """
    allowed_chars = set(' abcdefghijklmnopqrstuvwxyz')
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace("_", " ").replace("-", " ").lower()
            candidate = "".join(ch for ch in candidate if ch in allowed_chars)
            # Dropping characters can leave leading/trailing or doubled spaces
            # (e.g. "a_1" -> "a "); normalise them away.
            candidate = " ".join(candidate.split())
            # Lemmas made only of digits/symbols clean down to "", which would
            # otherwise be offered as a "synonym" and erase the word.
            if candidate:
                synonyms.add(candidate)
    synonyms.discard(word)
    synonyms.discard(word.lower())
    # Sorted so that the order does not depend on set hashing; a seeded
    # random.choice over the result is then reproducible across runs.
    return sorted(synonyms)

In [129]:
def synonym_replacement(words, n):
    """Replace up to ``n`` distinct non-stop-words of a text with a synonym.

    Candidate words are the distinct whitespace-separated tokens of ``words``
    that are not in ``stop_words``. They are visited in random order; each one
    that has at least one synonym (per ``get_synonyms``) is mapped to a
    randomly chosen synonym until ``n`` words have been mapped.

    Args:
        words: The text to augment, as a single string.
        n: Maximum number of distinct words to replace. Every occurrence of a
            chosen word receives the same synonym. A value of 0 or less leaves
            the words untouched.

    Returns:
        The augmented text with its tokens joined by single spaces, so runs of
        whitespace (including newlines) in the input are collapsed.
    """
    tokens = words.split()
    if n <= 0:
        # Checking the limit only after a replacement would still replace one
        # word when n is 0.
        return ' '.join(tokens)

    excluded = frozenset(stop_words)
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # shuffle below is reproducible under random.seed (a set's order is not).
    candidates = list(dict.fromkeys(t for t in tokens if t not in excluded))
    random.shuffle(candidates)

    replacements = {}
    for candidate in candidates:
        # Ignore blank synonyms: substituting one would delete the word.
        synonyms = [s for s in get_synonyms(candidate) if s.strip()]
        if not synonyms:
            continue
        replacements[candidate] = random.choice(synonyms)
        if len(replacements) >= n:  # only replace up to n words
            break

    # Substitute against the original tokens in one pass so that a synonym
    # which happens to equal another chosen word is not replaced a second time.
    return ' '.join(replacements.get(t, t) for t in tokens)

In [130]:
# Lower-case and strip punctuation from every excerpt. The cleaned text
# overwrites the "excerpt" column, so the raw text is no longer in `df`.
df["excerpt"] = df["excerpt"].apply(text_preprocessing)

In [131]:
# Build the augmented "text" column: for each cleaned excerpt, ask
# synonym_replacement to swap up to 2 distinct words for a random synonym.
# NOTE(review): no random seed is set in the visible cells, so this column
# presumably differs from run to run - confirm whether that is acceptable.
augmented_excerpts = df["excerpt"].apply(lambda excerpt: synonym_replacement(excerpt, 2))
df["text"] = augmented_excerpts

In [132]:
# Show the first row before and after synonym replacement. Columns are looked
# up by name rather than by position so the example keeps working if columns
# are added or reordered, and without converting the whole frame to an array.
# Note that "excerpt" already holds the lower-cased, punctuation-free text.
print("--- Example Translation ---")
print("")
print("Original Text: ", df["excerpt"].iloc[0])
print("")
print("Augmented Text: ", df["text"].iloc[0])
print("")

--- Example Translation ---

Original Text:  when the young people returned to the ballroom it presented a decidedly changed appearance instead of an interior scene it was a winter landscape
the floor was covered with snowwhite canvas not laid on smoothly but rumpled over bumps and hillocks like a real snow field the numerous palms and evergreens that had decorated the room were powdered with flour and strewn with tufts of cotton like snow also diamond dust had been lightly sprinkled on them and glittering crystal icicles hung from the branches
at each end of the room on the wall hung a beautiful bearskin rug
these rugs were for prizes one for the girls and one for the boys and this was the game
the girls were gathered at one end of the room and the boys at the other and one end was called the north pole and the other the south pole each player was given a small flag which they were to plant on reaching the pole
this would have been an easy matter but each traveller was obliged to wear

In [133]:
def change_target(target, se):
    """Return ``target`` minus a random amount drawn uniformly between 0 and ``se``.

    Args:
        target: The original target value.
        se: Bound of the random shift; one value is drawn with
            ``random.uniform(0, se)``.

    Returns:
        The shifted target.
    """
    # NOTE(review): the noise is one-sided - for a positive ``se`` the result
    # never exceeds ``target``. Confirm this is intended rather than a
    # symmetric perturbation.
    noise = random.uniform(0, se)
    return target - noise

In [134]:
# Perturb every row's target using that row's standard error, in row order.
# The result overwrites the "target" column, so running this cell again
# perturbs the already-perturbed values a second time.
df['target'] = [
    change_target(row_target, row_se)
    for row_target, row_se in zip(df['target'], df['standard_error'])
]

In [135]:
# Display the frame after augmentation: "excerpt" is the cleaned text,
# "text" the synonym-replaced text, and "target" the perturbed target.
df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,text
0,c12129c31,,,when the young people returned to the ballroom...,-0.611185,0.464009,when the young people returned to the ballroom...
1,85aa80a4c,,,all through dinner time mrs fayre was somewhat...,-0.397654,0.480805,all through dinner time mrs fayre was somewhat...
2,b69ac6792,,,as roger had predicted the snow departed as qu...,-1.023732,0.476676,as roger had predicted the snow departed as qu...
3,dd1000b26,,,and outside before the palace a great garden w...,-1.468637,0.450007,and outside before the palace a great garden w...
4,37c1b32fb,,,once upon a time there were three bears who li...,-0.227580,0.510845,once upon a time there were three bears who li...
...,...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,when you think of dinosaurs and where they liv...,1.226001,0.646900,when you think of dinosaurs and where they liv...
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,so what is a solid solids are usually hard bec...,-0.127713,0.535648,so what is a solid solids are usually hard bec...
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,the second state of matter we will discuss is ...,-0.044433,0.483866,the second state of matter we will discuss is ...
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,solids are shapes that you can actually touch ...,-0.335626,0.514128,solids are shape that you can in reality touch...


In [136]:
# Write the augmented frame to the working directory without the index column.
# At this point "excerpt" holds the cleaned text and "target" the perturbed
# target, so the original values are not part of the saved file.
df.to_csv("commonlit_train_with_augs_syn.csv", index=False)