In [1]:
!pip install inflect
!pip install nltk


Collecting inflect
  Obtaining dependency information for inflect from https://files.pythonhosted.org/packages/fb/c6/d9feb758be584f729424390af24687d3a4363d968164f94079f83cd536b4/inflect-7.0.0-py3-none-any.whl.metadata
  Using cached inflect-7.0.0-py3-none-any.whl.metadata (21 kB)
Collecting pydantic>=1.9.1 (from inflect)
  Obtaining dependency information for pydantic>=1.9.1 from https://files.pythonhosted.org/packages/e2/2c/9906b7abc337b0250a5634de5396e2f3cb1b837af0616424c2225a65aa80/pydantic-2.5.1-py3-none-any.whl.metadata
  Downloading pydantic-2.5.1-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.1/64.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting annotated-types>=0.4.0 (from pydantic>=1.9.1->inflect)
  Obtaining dependency information for annotated-types>=0.4.0 from https://files.pythonhosted.org/packages/28/78/d31230046e58c207284c6b2c4e8d96e6d3cb4e52354721b944d3e1ee4aa5/annotated_types-0.6.0-py3-none-any.whl.metadat

In [2]:

import json
import sys
import pandas as pd
import random
import nltk
from itertools import chain
from nltk.stem import WordNetLemmatizer

In [3]:

nltk.download('averaged_perceptron_tagger')   # Downloading the required NLTK model
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
p = WordNetLemmatizer()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/caio/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/caio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/caio/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/caio/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
json_file_name = 'misspelled_nouns.json'
with open(json_file_name, 'r') as json_file:
    misspelled_nouns = json.load(json_file)


json_file_name = 'corrected_nouns.json'
with open(json_file_name, 'r') as json_file:
    corrected_nouns = json.load(json_file)

In [8]:
def read_jsons():

    pool = []
    # Check if the user gave two arguments
    if len(sys.argv) != 3:
        print("Usage: python osar-jsons.py <json1> <json2>")
        sys.exit(1)

    # Open the first json
    with open(sys.argv[1]) as json_file:
        data1 = pd.json_normalize(json.load(json_file))
        pool.append(data1)

    # Open the second json
    with open(sys.argv[2]) as json_file:
        data2 = pd.json_normalize(json.load(json_file))
        pool.append(data2)

    return pd.concat(pool, ignore_index=True)

# A function that adds a new column to the dataframe pool called nouns
# for each existing row, the function will extract the nouns in the column placeholders
# and add to the new column
def extract_nouns(row):
    text_array = row['placeholders'] #Replace 'placeholders' with the actual name of the column containing your text data
    nouns = []
    blocked_strings = ['..', '.40', '/', '\\', ']','%', '‘','’']
    for text in text_array:
        tokens = nltk.word_tokenize(text)
        tagged = nltk.pos_tag(tokens)
        # nouns.extend([word.lower() for word, pos in tagged if (pos == 'NN' or pos == 'NNS' or pos == 'NNP' or pos == 'NNPS')])
        
        # iterate through the tagged words list and append singular nouns to the nouns list
        for word, pos in tagged:
            if (pos == 'NN' or pos == 'NNS' or pos == 'NNP' or pos == 'NNPS' or pos == 'VB') and (not word.isnumeric()) and (word not in blocked_strings) and ("'" not in word) and (len(word) > 1):
                # lemmatize the word to its singular form
                word = word.lower()
                word_s = p.lemmatize(word, pos='n')
                 # if the word is not changed into a singular form, keep it as it is
                if not word_s:
                    # nouns.append(word)
                    singular_noun = word
                else:
                    # nouns.append(word_s)
                    singular_noun = word_s
                if singular_noun in misspelled_nouns and corrected_nouns[misspelled_nouns.index(singular_noun)] is not None:
                    nouns.append(corrected_nouns[misspelled_nouns.index(singular_noun)])
                else:
                    nouns.append(singular_noun)

    return nouns


# A function that adds a new column to the dataframe pool called verbs
# for each existing row, the function will extract the verbs in the column template
# and add to the new column
def extract_verbs(row):
    text= row['template'] #Replace 'template' with the actual name of the column containing your text data
    verbs = []
    
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    verbs = [word.lower() for word, pos in tagged if (pos == 'VB' or pos == 'VBD' or pos == 'VBG' or pos == 'VBN' or pos == 'VBP' or pos == 'VBZ')and (not word.isnumeric())]
    return verbs

def add_nouns_verb_column(pool):
    pool['nouns'] = pool.apply(extract_nouns, axis=1)
    pool['verbs'] = pool.apply(extract_verbs, axis=1)

In [9]:
import sys
sys.argv = ['play.py', 'labels/train.json',  'labels/validation.json']

pool = read_jsons()

add_nouns_verb_column(pool)
pool


In [None]:
# Extract unique words from the 'text' column
unique_words = set(word for row in pool['nouns'] for word in row)

# Convert the set of unique words to a list if needed
unique_words_list = sorted(list(unique_words))

print(unique_words_list)
print(len(unique_words_list))

['a-tshirt', 'a/c', 'aa', 'ab', 'abacus', 'abb', 'abby', 'abinder', 'abook', 'absorbent', 'absorber', 'ac', 'acake', 'accelarator', 'accelerator', 'accesories', 'access', 'accessoire', 'accessory', 'accumulator', 'acerola', 'acetaminophen', 'acetone', 'achaar', 'acorn', 'action', 'activation', 'activator', 'activia', 'ad', 'adapter', 'adaptor', 'adata', 'addepter', 'adhesive', 'adjuster', 'adopter', 'adoptor', 'adsl', 'adult', 'advent', 'advertisement', 'advertising', 'advertisment', 'advil', 'advisement', 'aeroplane', 'aerosal', 'aerosol', 'aerrings', 'aftershave', 'agarbathi', 'agarbathy', 'agarbatti', 'agarpathi', 'agenda', 'agent', 'agitator', 'ahoe', 'aid', 'aide', 'aiirbud', 'aim', 'aiphone', 'air', 'air-plane', 'aircondition', 'airconditioner', 'aircraft', 'airfreshner', 'airgun', 'airplane', 'airpod', 'airpods', 'airpump', 'airscrew', 'airspray', 'ajax', 'ajinomoto', 'aknife', 'alamara', 'alamera', 'alaram', 'alarm', 'album', 'alchol', 'alcohol', 'aldrop', 'ale', 'alebrije', 'a

In [None]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.7.2-py3-none-any.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


In [None]:
from spellchecker import SpellChecker

# Initialize the spell checker
spell = SpellChecker()

# Find potentially misspelled words
misspelled_nouns = spell.unknown(unique_words_list)

corrected_nouns = [spell.correction(word) for word in misspelled_nouns]

json_file_name = 'misspelled_nouns.json'
with open(json_file_name, 'w') as json_file:
    json.dump(list(misspelled_nouns), json_file)


json_file_name = 'corrected_nouns.json'
with open(json_file_name, 'w') as json_file:
    json.dump(list(corrected_nouns), json_file)



In [None]:
mpws = list(misspelled_words)

final_words = [corrected_words[mpws.index(word)] if word in misspelled_words and corrected_words[mpws.index(word)] is not None else word for word in unique_words_list]

In [None]:
misspelled_words

{'pancil',
 'gaurd',
 'exacto',
 'autorickshaw',
 'weedeater',
 'sunchair',
 'pecock',
 'burse',
 'bicyclist',
 'glace',
 'airscrew',
 'evelope',
 'aun',
 'casettes',
 'glases',
 'gassstove',
 'w',
 'gomuthra',
 'fevicol',
 'botlle',
 'injectio',
 'sockdrawer',
 'tinter',
 'spinwheel',
 'bhatter',
 'toy-gun',
 'hp',
 'pag',
 'stapples',
 'chappels',
 'goreng',
 'mirchi',
 'mixie',
 'black-ink',
 'deck-chair',
 'shampooes',
 'cigarettebox',
 'disckgolf',
 'hair-oil',
 'screwcap',
 'hairclipper',
 'setsquare',
 'scredriver',
 'spects',
 'keychian',
 'oinment',
 'godrej',
 'scrole',
 'protractor',
 'duracell',
 'lense',
 'plaver',
 'tostter',
 'droor',
 'bouguet',
 'appple',
 'calcualater',
 'handtowels',
 'lip-care',
 'spillikin',
 'batterry',
 'wall/tarp',
 'redchile',
 'remotecontroller',
 'couo',
 'talucm',
 'casette',
 'christmastree',
 'karpet',
 'earjack',
 'make-up',
 'footbag',
 'plag',
 'drawar',
 'gova',
 'ush',
 'alumini',
 'kodam',
 'hunder',
 'kuerig',
 'breacker',
 'samsung

In [None]:
corrected_words

['pencil',
 'guard',
 'exact',
 None,
 None,
 'unchain',
 'peacock',
 'nurse',
 'bicycles',
 'place',
 None,
 'envelope',
 'an',
 'cassettes',
 'glass',
 None,
 'i',
 None,
 None,
 'bottle',
 'injection',
 None,
 'winter',
 'pinwheel',
 'chatter',
 None,
 'he',
 'pay',
 'staples',
 'chapels',
 'goring',
 'michi',
 'pixie',
 'blacking',
 None,
 'shampoos',
 None,
 None,
 None,
 'screwup',
 None,
 None,
 'screwdriver',
 'aspects',
 'keychain',
 'ointment',
 'gore',
 'scroll',
 'protector',
 'purcell',
 'sense',
 'player',
 'totter',
 'door',
 'bouquet',
 'apple',
 'calculate',
 None,
 None,
 None,
 'battery',
 None,
 None,
 None,
 'coup',
 'talcum',
 'cassette',
 'christmastime',
 'carpet',
 'carjack',
 'makeup',
 'football',
 'play',
 'drawer',
 'nova',
 'us',
 'alumni',
 'kadam',
 'under',
 'keri',
 'breaker',
 'sassing',
 'headlight',
 None,
 'toy',
 'cameraman',
 None,
 'eraser',
 None,
 'dummies',
 'deeper',
 None,
 'highlighter',
 'perfume',
 'sagebrush',
 'battery',
 'santa',
 'de

In [None]:
pool_filtered = pool
pool_filtered['verb+noun'] = pool_filtered['verbs'].str[0] + ' ' + pool_filtered['nouns'].str[0]
pool_filtered['verb+noun']

0              holding potato
1         spreading margarine
2                 putting pen
3              lifting bottle
4                holding bulb
                 ...         
193685             moving car
193686      closing freshmint
193687        unfolding shirt
193688            moving glas
193689        falling plastic
Name: verb+noun, Length: 193690, dtype: object

In [None]:
# stats info on how many rows have the same verb+noun for each label, in ascending order
pool_filtered.groupby('label')['verb+noun'].value_counts().sort_values()

In [None]:
# remove all the rouns in which the amount of rows with the same verb+noun is less than 5
pool_filtered = pool_filtered.groupby('label').filter(lambda x: len(x) > 5)

In [44]:
# pool['nouns'].str.contains(r'^-?\d+(?:\.\d+)?$').any()
# pool[pool['nouns'].str.len() < 3]
# pool['nouns'].str.len()
# pool[pool['nouns'].apply(lambda x: any(len(i) < 2 for i in x))]
# pool[~pool['nouns'].astype(bool)]

Unnamed: 0,id,label,template,placeholders,nouns,verbs
2,100904,putting pen on a surface,Putting [something] on a surface,[pen],[],[putting]
104,200377,pretending to throw pen,Pretending to throw [something],[pen],[],"[pretending, throw]"
106,170861,putting pen on a surface,Putting [something] on a surface,[pen],[],[putting]
182,139538,squeezing eatable,Squeezing [something],[eatable],[],[squeezing]
247,202794,bending pen until it breaks,Bending [something] until it breaks,[pen],[],"[bending, breaks]"
...,...,...,...,...,...,...
193524,39756,pushing pen from right to left,Pushing [something] from right to left,[pen],[],"[pushing, left]"
193528,32750,letting pen roll along a flat surface,Letting [something] roll along a flat surface,[pen],[],[letting]
193560,65088,putting pen next to pen,Putting [something] next to [something],"[pen, pen]",[],"[putting, []"
193633,112050,opening pouch,Opening [something],[pouch],[],[opening]


In [21]:
def unique_templates(pool):
    """
    Returns a list of unique strings in the 'template' column of a Pandas DataFrame.
    """
    unique = pool['template'].unique()
    return list(unique)

uni_template = unique_templates(pool)
print('size uni_templates: ', len(uni_template))
uni_template

size uni_templates:  174


['Holding [something] next to [something]',
 'Spreading [something] onto [something]',
 'Putting [something] on a surface',
 'Lifting up one end of [something], then letting it drop down',
 'Holding [something]',
 'Pushing [something] from right to left',
 'Spilling [something] onto [something]',
 'Pushing [something] so that it slightly moves',
 'Moving [something] across a surface until it falls down',
 'Folding [something]',
 'Taking [one of many similar things on the table]',
 'Pretending to turn [something] upside down',
 'Pretending to take [something] from [somewhere]',
 'Moving [something] up',
 'Twisting (wringing) [something] wet until water comes out',
 'Twisting [something]',
 'Taking [something] out of [something]',
 'Dropping [something] in front of [something]',
 'Throwing [something] in the air and letting it fall',
 'Lifting up one end of [something] without letting it drop down',
 'Moving [something] towards the camera',
 'Showing that [something] is empty',
 'Droppin

In [22]:
# Function that chooses x random rows from the dataframe pool
def choose_random_templates(unique_templates, x):
    all_templates = []
    past_index = []
    #  loop x times chosing one row at a time
    for i in range(x):
        while True:
            random_index = random.randint(0, len(unique_templates)-1)
            if random_index not in past_index:
                past_index.append(random_index)
                break
        templates = unique_templates[random_index]
        all_templates.append(templates)
    return all_templates

In [23]:
unknown_templates = choose_random_templates(uni_template, 10)
print(unknown_templates)

['Throwing [something] in the air and letting it fall', 'Pulling [something] onto [something]', 'Putting [something similar to other things that are already on the table]', 'Pretending to take [something] from [somewhere]', 'Turning the camera upwards while filming [something]', 'Rolling [something] on a flat surface', 'Pretending to take [something] out of [something]', 'Approaching [something] with your camera', 'Pulling two ends of [something] but nothing happens', 'Folding [something]']


In [24]:
def create_df_known_labels_atomic(pool, unknown_templates):

    # Filter the DataFrame to exclude rows with templates in the unknown_templates array
    known_labels = pool[~pool['template'].isin(unknown_templates)]
    unknown_labels = pool[pool['template'].isin(unknown_templates)]

    return known_labels, unknown_labels



In [25]:
known_labels, unknown_labels = create_df_known_labels_atomic(pool, unknown_templates)
print(known_labels)

    


            id                                              label   
0        78687        holding potato next to vicks vaporub bottle  \
1        42326                     spreading margarine onto bread   
2       100904                           putting pen on a surface   
3        80715  lifting up one end of bottle, then letting it ...   
4        34899                                       holding bulb   
...        ...                                                ...   
193685  151948                  moving car key towards the camera   
193686  117425                           closing small freshmints   
193687  157360                                  unfolding a shirt   
193688  117478                                    moving glass up   
193689   36585            plastic falling like a feather or paper   

                                                 template   
0                 Holding [something] next to [something]  \
1                  Spreading [something] onto [so

In [26]:
print(unknown_labels)

            id                                              label   
9        66533                                folding paper towel  \
12      187992         pretending to take bottle from window side   
19       37055  throwing sunscreen in the air and letting it fall   
98      209369             pretending to take matchbox from shelf   
101     148739                                   putting tweezers   
...        ...                                                ...   
193584   49392                                       putting keyd   
193610  163771  putting a bell pepper with a group of bell pep...   
193643  149405        throwing pen in the air and letting it fall   
193675  110293          pretending to take screwdriver from floor   
193678  122099     throwing a ball in the air and letting it fall   

                                                 template   
9                                     Folding [something]  \
12        Pretending to take [something] from [so

In [27]:
def atomic_open_set(known_labels, unknown_labels):
    # Split the known_labels dataframe into training and testing sets
    train_df = known_labels.sample(frac=0.7, random_state=42)
    print('size of train_df: ', len(train_df))
    test_df = known_labels.drop(train_df.index)
    print('size of test_df: ', len(test_df))
    
    # Append the unknown_labels dataframe to the test set
    # test_df = test_df.append(unknown_labels, ignore_index=True)
    test_df = pd.concat([test_df, unknown_labels], ignore_index=True)
    print('size of test_df: ', len(test_df))
    
    # Save the train and test dataframes to JSON files
    train_df.to_json('atomic_train.json', orient='records', lines=True)
    test_df.to_json('atomic_test.json', orient='records', lines=True)

atomic_open_set(known_labels, unknown_labels)

size of train_df:  127816
size of test_df:  54779
size of test_df:  65874


In [46]:
# Function that chooses x random rows from the dataframe pool
def choose_random_nouns(pool, x):
    all_nouns = []
    past_index = []
    count_all_nouns=0
    #  loop x times chosing one row at a time
    while True:
        while True:
            random_index = random.randint(0, len(pool)-1)
            nouns = pool['nouns'][random_index]
            # Flatten the nested list into a single list
            count_index_nouns = len(set([item for sublist in nouns for item in sublist if item not in ['', None]]))
            if (random_index not in past_index) and (count_all_nouns + count_index_nouns <= x):
                past_index.append(random_index)
                break
        all_nouns.append(nouns)
        count_all_nouns=len(set([item for sublist in all_nouns for item in sublist if item not in ['', None]]))
        if count_all_nouns == x:
            break
    print(count_all_nouns)
    return set([item for sublist in all_nouns for item in sublist if item not in ['', None]])

def choose_random_verbs(pool, x):
    all_verbs = []
    past_index = []
    count_all_verbs=0
    #  loop x times chosing one row at a time
    for i in range(x):
        while True:
            random_index = random.randint(0, len(pool)-1)
            verbs = pool['verbs'][random_index]
            count_index_verbs=len(set([item for sublist in verbs for item in sublist if item not in ['', None]]))
            if (random_index not in past_index) and (count_all_verbs + count_index_verbs <= x):
                past_index.append(random_index)
                break
        
        all_verbs.append(verbs)
        count_all_verbs=len(set([item for sublist in all_verbs for item in sublist if item not in ['', None]]))
    return all_verbs


In [47]:
unknown_nouns = choose_random_nouns(pool,10)
print(unknown_nouns)
# unknown_verbs = choose_random_verbs(pool,10)
# print(unknown_verbs)

10
{'walnut', 'cd', 'hair', 'comb', 'water', 'seat', 'clip', 'box', 'pen', 'k'}


In [66]:
def create_df_known_labels_unkv(pool, unknown_nouns):

    # flatten unknown_nouns list
    flat_unknown_nouns = list(chain.from_iterable(unknown_nouns))
    
    # filter pool dataframe
    known_labels = pool[~pool['nouns'].apply(lambda x: any(item for item in x if item in flat_unknown_nouns))]
    unknown_labels = pool[pool['nouns'].apply(lambda x: any(item for item in x if item in flat_unknown_nouns))]

    return known_labels, unknown_labels

In [67]:
known_labels, unknown_labels = create_df_known_labels_unkv(pool, unknown_nouns)
unknown_labels

Unnamed: 0,id,label,template,placeholders,nouns,verbs
0,78687,holding potato next to vicks vaporub bottle,Holding [something] next to [something],"[potato, vicks vaporub bottle]","[potato, vick, bottle]","[holding, []"
3,80715,"lifting up one end of bottle, then letting it ...","Lifting up one end of [something], then lettin...",[bottle],[bottle],"[lifting, letting, drop]"
9,66533,folding paper towel,Folding [something],[paper towel],"[paper, towel]",[folding]
12,187992,pretending to take bottle from window side,Pretending to take [something] from [somewhere],"[bottle, window side]","[bottle, side]","[pretending, take, ]]"
18,134982,dropping headset in front of tools bag,Dropping [something] in front of [something],"[headset, tools bag]","[headset, tool, bag]",[dropping]
...,...,...,...,...,...,...
193670,87849,pulling two ends of paper so that it separates...,Pulling two ends of [something] so that it sep...,[paper],[paper],"[pulling, separates]"
193671,62096,showing mug behind kettle,Showing [something] behind [something],"[mug, kettle]","[mug, kettle]",[showing]
193672,171194,dropping a loonie into a cup,Dropping [something] into [something],"[a loonie, a cup]","[loonie, cup]",[dropping]
193673,40391,paper falling like a feather or paper,[Something] falling like a feather or paper,[paper],[paper],[falling]


In [68]:
def unknown_noun_known_verb(known_labels, unknown_labels):
    # Find all unique verbs in known_labels and convert to set
    known_verbs = set(known_labels['verbs'].explode().unique())
    
    # Create a boolean mask of unknown_labels rows where any verb is not in known_verbs
    mask = ~(unknown_labels['verbs'].explode().isin(known_verbs)).groupby(level=0).any()
    
    # Filter out the rows where the mask is True
    unknown_labels = unknown_labels.loc[~mask]
    
    # Split the known_labels dataframe into training and testing sets
    train_df = known_labels.sample(frac=0.7, random_state=42)
    print('size of train_df: ', len(train_df))
    test_df = known_labels.drop(train_df.index)
    print('size of test_df: ', len(test_df))
    
    # Append the unknown_labels dataframe to the test set
    # test_df = test_df.append(unknown_labels, ignore_index=True)
    test_df = pd.concat([test_df, unknown_labels], ignore_index=True)
    print('size of test_df: ', len(test_df))
    
    # Save the train and test dataframes to JSON files
    train_df.to_json('unkv_train.json', orient='records', lines=True)
    test_df.to_json('unkv_test.json', orient='records', lines=True)

In [69]:

unknown_noun_known_verb(known_labels, unknown_labels)

size of train_df:  108114
size of test_df:  46334
size of test_df:  85576


In [70]:
def create_df_known_labels_knuv(pool, unknown_verbs):

    # flatten unknown_verbs list
    flat_unknown_verbs = list(chain.from_iterable(unknown_verbs))
    
    # filter pool dataframe
    known_labels = pool[~pool['verbs'].apply(lambda x: any(item for item in x if item in flat_unknown_verbs))]
    unknown_labels = pool[pool['verbs'].apply(lambda x: any(item for item in x if item in flat_unknown_verbs))]

    return known_labels, unknown_labels

In [71]:
def known_noun_unknown_verb(known_labels, unknown_labels):
    # Find all unique nouns in known_labels and convert to set
    known_nouns = set(known_labels['nouns'].explode().unique())
    
    # Create a boolean mask of unknown_labels rows where any nouns is not in known_nouns
    mask = ~(unknown_labels['nouns'].explode().isin(known_nouns)).groupby(level=0).any()
    
    # Filter out the rows where the mask is True
    unknown_labels = unknown_labels.loc[~mask]
    
    # Split the known_labels dataframe into training and testing sets
    train_df = known_labels.sample(frac=0.7, random_state=42)
    print('size of train_df: ', len(train_df))
    test_df = known_labels.drop(train_df.index)
    print('size of test_df: ', len(test_df))
    
    # Append the unknown_labels dataframe to the test set
    test_df = pd.concat([test_df, unknown_labels], ignore_index=True)
    print('size of test_df: ', len(test_df))
    
    # Save the train and test dataframes to JSON files
    train_df.to_json('knuv_train.json', orient='records', lines=True)
    test_df.to_json('knuv_test.json', orient='records', lines=True)

In [72]:
known_labels, unknown_labels = create_df_known_labels_knuv(pool, unknown_verbs)
known_noun_unknown_verb(known_labels, unknown_labels)

size of train_df:  75780
size of test_df:  32477
size of test_df:  117553


In [73]:
def create_df_known_labels_unuv(pool, unknown_nouns, unknown_verbs):

    # flatten unknown_nouns and unknown_verbs list
    flat_unknown_nouns = list(chain.from_iterable(unknown_nouns))
    flat_unknown_verbs = list(chain.from_iterable(unknown_verbs))
    
    
    # filter pool dataframe
    known_labels = pool[~pool['nouns'].apply(lambda x: any(item for item in x if item in flat_unknown_nouns))]
    known_labels = known_labels[~known_labels['verbs'].apply(lambda x: any(item for item in x if item in flat_unknown_verbs))]
    unknown_labels = pool[pool['nouns'].apply(lambda x: any(item for item in x if item in flat_unknown_nouns))]
    unknown_labels = unknown_labels[unknown_labels['verbs'].apply(lambda x: any(item for item in x if item in flat_unknown_verbs))]

    return known_labels, unknown_labels

In [74]:
def unknown_noun_unknown_verb(known_labels, unknown_labels):
    # Find all unique nouns in known_labels and convert to set
    known_nouns = set(known_labels['nouns'].explode().unique())
    known_verbs = set(known_labels['verbs'].explode().unique())
    
    # Create a boolean mask of unknown_labels rows where any nouns is not in known_nouns
    mask = ~(unknown_labels['nouns'].explode().isin(known_nouns)).groupby(level=0).any()
    
    # Filter out the rows where the mask is True
    unknown_labels = unknown_labels.loc[~mask]

    # Create a boolean mask of unknown_labels rows where any verb is not in known_verbs
    mask = ~(unknown_labels['verbs'].explode().isin(known_verbs)).groupby(level=0).any()

    # Filter out the rows where the mask is True
    unknown_labels = unknown_labels.loc[~mask]
    
    # Split the known_labels dataframe into training and testing sets
    train_df = known_labels.sample(frac=0.7, random_state=42)
    print('size of train_df: ', len(train_df))
    test_df = known_labels.drop(train_df.index)
    print('size of test_df: ', len(test_df))
    
    # Append the unknown_labels dataframe to the test set
    test_df = pd.concat([test_df, unknown_labels], ignore_index=True)
    print('size of test_df: ', len(test_df))
    
    # Save the train and test dataframes to JSON files
    train_df.to_json('unuv_train.json', orient='records', lines=True)
    test_df.to_json('unuv_test.json', orient='records', lines=True)

In [75]:
known_labels, unknown_labels = create_df_known_labels_unuv(pool, unknown_nouns, unknown_verbs)
unknown_noun_unknown_verb(known_labels, unknown_labels)

size of train_df:  58092
size of test_df:  24897
size of test_df:  27859
