In [57]:

import json
import sys
import pandas as pd
import random
import nltk
from itertools import chain
from nltk.stem import WordNetLemmatizer
import inflect

In [58]:

nltk.download('averaged_perceptron_tagger')   # Downloading the required NLTK model
nltk.download('punkt')
nltk.download('wordnet')
p = inflect.engine()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/caio/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/caio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/caio/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [62]:
def read_jsons():

    pool = []
    # Check if the user gave two arguments
    if len(sys.argv) != 3:
        print("Usage: python osar-jsons.py <json1> <json2>")
        sys.exit(1)

    # Open the first json
    with open(sys.argv[1]) as json_file:
        data1 = pd.json_normalize(json.load(json_file))
        pool.append(data1)

    # Open the second json
    with open(sys.argv[2]) as json_file:
        data2 = pd.json_normalize(json.load(json_file))
        pool.append(data2)

    return pd.concat(pool, ignore_index=True)

# A function that adds a new column to the dataframe pool called nouns
# for each existing row, the function will extract the nouns in the column placeholders
# and add to the new column
def extract_nouns(row):
    text_array = row['placeholders'] #Replace 'placeholders' with the actual name of the column containing your text data
    nouns = []
    for text in text_array:
        tokens = nltk.word_tokenize(text)
        tagged = nltk.pos_tag(tokens)
        # nouns.extend([word.lower() for word, pos in tagged if (pos == 'NN' or pos == 'NNS' or pos == 'NNP' or pos == 'NNPS')])
        
        # iterate through the tagged words list and append singular nouns to the nouns list
        for word, pos in tagged:
            if (pos == 'NN' or pos == 'NNS' or pos == 'NNP' or pos == 'NNPS'):
                # lemmatize the word to its singular form
                word = word.lower()
                word_s = p.singular_noun(word)
                # if the word is not changed into a singular form, keep it as it is
                if not word_s:
                    nouns.append(word)
                else:
                    nouns.append(word_s)
        
        # import the necessary library at the beginning of your code
        
        
        # # create an instance of the Word Net Lemmatizer class
        # wnl = WordNetLemmatizer()
        
        # # create an empty list to store the nouns
        # nouns = []
        
        # # iterate through the tagged words list and append singular nouns to the nouns list
        # for word, pos in tagged:
        #     if (pos == 'NN' or pos == 'NNS' or pos == 'NNP' or pos == 'NNPS'):
        #         # lemmatize the word to its singular form
        #         word_s = wnl.lemmatize(word)
        #         # if the resulting word is the same as the original word, keep it as it is
        #         if word_s == word:
        #             nouns.append(word.lower())
        #         else:
        #             nouns.append(word_s.lower())
        

        
    return nouns


# A function that adds a new column to the dataframe pool called verbs
# for each existing row, the function will extract the verbs in the column template
# and add to the new column
def extract_verbs(row):
    text= row['template'] #Replace 'template' with the actual name of the column containing your text data
    verbs = []
    
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    verbs = [word.lower() for word, pos in tagged if (pos == 'VB' or pos == 'VBD' or pos == 'VBG' or pos == 'VBN' or pos == 'VBP' or pos == 'VBZ')]
    return verbs

def add_nouns_verb_column(pool):
    pool['nouns'] = pool.apply(extract_nouns, axis=1)
    pool['verbs'] = pool.apply(extract_verbs, axis=1)

In [63]:
import sys
sys.argv = ['play.py', 'labels/train.json',  'labels/validation.json']

pool = read_jsons()

add_nouns_verb_column(pool)
pool


Unnamed: 0,id,label,template,placeholders,nouns,verbs
0,78687,holding potato next to vicks vaporub bottle,Holding [something] next to [something],"[potato, vicks vaporub bottle]","[potato, vick, bottle]","[holding, []"
1,42326,spreading margarine onto bread,Spreading [something] onto [something],"[margarine, bread]","[margarine, bread]",[spreading]
2,100904,putting pen on a surface,Putting [something] on a surface,[pen],[],[putting]
3,80715,"lifting up one end of bottle, then letting it ...","Lifting up one end of [something], then lettin...",[bottle],[bottle],"[lifting, letting, drop]"
4,34899,holding bulb,Holding [something],[bulb],[bulb],[holding]
...,...,...,...,...,...,...
193685,151948,moving car key towards the camera,Moving [something] towards the camera,[car key],"[car, key]",[moving]
193686,117425,closing small freshmints,Closing [something],[small freshmints],[freshmint],[closing]
193687,157360,unfolding a shirt,Unfolding [something],[a shirt],[shirt],[unfolding]
193688,117478,moving glass up,Moving [something] up,[glass],[glas],"[moving, ]]"


In [64]:
# Function that chooses x random rows from the dataframe pool
def choose_random_nouns(pool, x):
    all_nouns = []
    past_index = []
    #  loop x times chosing one row at a time
    for i in range(x):
        while True:
            random_index = random.randint(0, len(pool)-1)
            if random_index not in past_index:
                past_index.append(random_index)
                break
        nouns = pool['nouns'][random_index]
        all_nouns.append(nouns)
    return all_nouns

def choose_random_verbs(pool, x):
    all_verbs = []
    past_index = []
    #  loop x times chosing one row at a time
    for i in range(x):
        while True:
            random_index = random.randint(0, len(pool)-1)
            if random_index not in past_index:
                past_index.append(random_index)
                break
        verbs = pool['verbs'][random_index]
        all_verbs.append(verbs)
    return all_verbs


In [65]:
unknown_nouns = choose_random_nouns(pool,10)
print(unknown_nouns)
unknown_verbs = choose_random_verbs(pool,10)
print(unknown_verbs)

[['piece', 'paper'], ['paper'], ['book'], ['plant'], ['paper'], ['cup', 'kettle'], ['bag'], ['pendrive'], ['broom', 'wall'], ['gum', 'bottle']]
[['pulling'], ['spinning', 'continues', 'spinning'], ['throwing'], ['pulling'], ['lifting', ']', 'letting', 'drop'], ['moving'], ['plugging', 'pulling', 'remove'], ['putting'], ['wiping', ']'], ['being', 'deflected']]


In [66]:
def create_df_known_labels_unkv(pool, unknown_nouns):

    # flatten unknown_nouns list
    flat_unknown_nouns = list(chain.from_iterable(unknown_nouns))
    
    # filter pool dataframe
    known_labels = pool[~pool['nouns'].apply(lambda x: any(item for item in x if item in flat_unknown_nouns))]
    unknown_labels = pool[pool['nouns'].apply(lambda x: any(item for item in x if item in flat_unknown_nouns))]

    return known_labels, unknown_labels

In [67]:
known_labels, unknown_labels = create_df_known_labels_unkv(pool, unknown_nouns)
unknown_labels

Unnamed: 0,id,label,template,placeholders,nouns,verbs
0,78687,holding potato next to vicks vaporub bottle,Holding [something] next to [something],"[potato, vicks vaporub bottle]","[potato, vick, bottle]","[holding, []"
3,80715,"lifting up one end of bottle, then letting it ...","Lifting up one end of [something], then lettin...",[bottle],[bottle],"[lifting, letting, drop]"
9,66533,folding paper towel,Folding [something],[paper towel],"[paper, towel]",[folding]
12,187992,pretending to take bottle from window side,Pretending to take [something] from [somewhere],"[bottle, window side]","[bottle, side]","[pretending, take, ]]"
18,134982,dropping headset in front of tools bag,Dropping [something] in front of [something],"[headset, tools bag]","[headset, tool, bag]",[dropping]
...,...,...,...,...,...,...
193670,87849,pulling two ends of paper so that it separates...,Pulling two ends of [something] so that it sep...,[paper],[paper],"[pulling, separates]"
193671,62096,showing mug behind kettle,Showing [something] behind [something],"[mug, kettle]","[mug, kettle]",[showing]
193672,171194,dropping a loonie into a cup,Dropping [something] into [something],"[a loonie, a cup]","[loonie, cup]",[dropping]
193673,40391,paper falling like a feather or paper,[Something] falling like a feather or paper,[paper],[paper],[falling]


In [68]:
def unknown_noun_known_verb(known_labels, unknown_labels):
    # Find all unique verbs in known_labels and convert to set
    known_verbs = set(known_labels['verbs'].explode().unique())
    
    # Create a boolean mask of unknown_labels rows where any verb is not in known_verbs
    mask = ~(unknown_labels['verbs'].explode().isin(known_verbs)).groupby(level=0).any()
    
    # Filter out the rows where the mask is True
    unknown_labels = unknown_labels.loc[~mask]
    
    # Split the known_labels dataframe into training and testing sets
    train_df = known_labels.sample(frac=0.7, random_state=42)
    print('size of train_df: ', len(train_df))
    test_df = known_labels.drop(train_df.index)
    print('size of test_df: ', len(test_df))
    
    # Append the unknown_labels dataframe to the test set
    # test_df = test_df.append(unknown_labels, ignore_index=True)
    test_df = pd.concat([test_df, unknown_labels], ignore_index=True)
    print('size of test_df: ', len(test_df))
    
    # Save the train and test dataframes to JSON files
    train_df.to_json('unkv_train.json', orient='records', lines=True)
    test_df.to_json('unkv_test.json', orient='records', lines=True)

In [69]:

unknown_noun_known_verb(known_labels, unknown_labels)

size of train_df:  108114
size of test_df:  46334
size of test_df:  85576


In [70]:
def create_df_known_labels_knuv(pool, unknown_verbs):

    # flatten unknown_verbs list
    flat_unknown_verbs = list(chain.from_iterable(unknown_verbs))
    
    # filter pool dataframe
    known_labels = pool[~pool['verbs'].apply(lambda x: any(item for item in x if item in flat_unknown_verbs))]
    unknown_labels = pool[pool['verbs'].apply(lambda x: any(item for item in x if item in flat_unknown_verbs))]

    return known_labels, unknown_labels

In [71]:
def known_noun_unknown_verb(known_labels, unknown_labels):
    # Find all unique nouns in known_labels and convert to set
    known_nouns = set(known_labels['nouns'].explode().unique())
    
    # Create a boolean mask of unknown_labels rows where any nouns is not in known_nouns
    mask = ~(unknown_labels['nouns'].explode().isin(known_nouns)).groupby(level=0).any()
    
    # Filter out the rows where the mask is True
    unknown_labels = unknown_labels.loc[~mask]
    
    # Split the known_labels dataframe into training and testing sets
    train_df = known_labels.sample(frac=0.7, random_state=42)
    print('size of train_df: ', len(train_df))
    test_df = known_labels.drop(train_df.index)
    print('size of test_df: ', len(test_df))
    
    # Append the unknown_labels dataframe to the test set
    test_df = pd.concat([test_df, unknown_labels], ignore_index=True)
    print('size of test_df: ', len(test_df))
    
    # Save the train and test dataframes to JSON files
    train_df.to_json('knuv_train.json', orient='records', lines=True)
    test_df.to_json('knuv_test.json', orient='records', lines=True)

In [72]:
known_labels, unknown_labels = create_df_known_labels_knuv(pool, unknown_verbs)
known_noun_unknown_verb(known_labels, unknown_labels)

size of train_df:  75780
size of test_df:  32477
size of test_df:  117553


In [73]:
def create_df_known_labels_unuv(pool, unknown_nouns, unknown_verbs):

    # flatten unknown_nouns and unknown_verbs list
    flat_unknown_nouns = list(chain.from_iterable(unknown_nouns))
    flat_unknown_verbs = list(chain.from_iterable(unknown_verbs))
    
    
    # filter pool dataframe
    known_labels = pool[~pool['nouns'].apply(lambda x: any(item for item in x if item in flat_unknown_nouns))]
    known_labels = known_labels[~known_labels['verbs'].apply(lambda x: any(item for item in x if item in flat_unknown_verbs))]
    unknown_labels = pool[pool['nouns'].apply(lambda x: any(item for item in x if item in flat_unknown_nouns))]
    unknown_labels = unknown_labels[unknown_labels['verbs'].apply(lambda x: any(item for item in x if item in flat_unknown_verbs))]

    return known_labels, unknown_labels

In [74]:
def unknown_noun_unknown_verb(known_labels, unknown_labels):
    # Find all unique nouns in known_labels and convert to set
    known_nouns = set(known_labels['nouns'].explode().unique())
    known_verbs = set(known_labels['verbs'].explode().unique())
    
    # Create a boolean mask of unknown_labels rows where any nouns is not in known_nouns
    mask = ~(unknown_labels['nouns'].explode().isin(known_nouns)).groupby(level=0).any()
    
    # Filter out the rows where the mask is True
    unknown_labels = unknown_labels.loc[~mask]

    # Create a boolean mask of unknown_labels rows where any verb is not in known_verbs
    mask = ~(unknown_labels['verbs'].explode().isin(known_verbs)).groupby(level=0).any()

    # Filter out the rows where the mask is True
    unknown_labels = unknown_labels.loc[~mask]
    
    # Split the known_labels dataframe into training and testing sets
    train_df = known_labels.sample(frac=0.7, random_state=42)
    print('size of train_df: ', len(train_df))
    test_df = known_labels.drop(train_df.index)
    print('size of test_df: ', len(test_df))
    
    # Append the unknown_labels dataframe to the test set
    test_df = pd.concat([test_df, unknown_labels], ignore_index=True)
    print('size of test_df: ', len(test_df))
    
    # Save the train and test dataframes to JSON files
    train_df.to_json('unuv_train.json', orient='records', lines=True)
    test_df.to_json('unuv_test.json', orient='records', lines=True)

In [75]:
known_labels, unknown_labels = create_df_known_labels_unuv(pool, unknown_nouns, unknown_verbs)
unknown_noun_unknown_verb(known_labels, unknown_labels)

size of train_df:  58092
size of test_df:  24897
size of test_df:  27859
