In [1]:
import json
from collections import Counter


# Validation set: parse val.json and keep a direct handle on its episode list.
with open('val.json') as json_file:
    val = json.loads(json_file.read())
episodes = val['episodes']
# Show the container type of the episodes entry.
type(episodes)

list

In [2]:
# Vocabulary tables shipped alongside the validation episodes.
question_vocab_val, answer_vocab_val = val['question_vocab'], val['answer_vocab']


In [3]:
# Distinct entries in the validation answer vocabulary's word list, and how many there are.
unique_answer_vocab = set(answer_vocab_val['word_list'])
len(unique_answer_vocab)

35

In [4]:
# Look for the '<unk>' placeholder in the validation questions and answers.
qs = [episode['question'] for episode in episodes]
qt = [question['question_text'] for question in qs]

# Questions first: echo every question text that contains the token.
for q in qt:
    if '<unk>' not in q:
        continue
    print(q)

# Then the same scan over the answer texts.
ans = [question['answer_text'] for question in qs]
for a in ans:
    if '<unk>' not in a:
        continue
    print(a)

In [5]:
# Distinct answer texts that occur in the validation episodes.
unique_as = set(ans)
unique_as

{'bedroom',
 'black',
 'blue',
 'brown',
 'closet',
 'family room',
 'green',
 'grey',
 'gym',
 'hallway',
 'kitchen',
 'laundry room',
 'living room',
 'lounge',
 'off-white',
 'olive green',
 'red',
 'red brown',
 'silver',
 'slate grey',
 'spa',
 'tan',
 'white',
 'yellow'}

In [6]:
# Training set: parse train.json and keep a direct handle on its episode list.
with open('train.json') as json_file:
    train = json.loads(json_file.read())
train_episodes = train['episodes']
# Show the container type of the episodes entry.
type(train_episodes)

list

In [7]:
# Vocabulary tables shipped alongside the training episodes.
answer_vocab_train, question_vocab_train = train['answer_vocab'], train['question_vocab']

In [8]:
# Look for the '<unk>' placeholder in the training questions and answers.
tqs = [episode['question'] for episode in train_episodes]
tqt = [question['question_text'] for question in tqs]

# Questions first: echo every question text that contains the token.
for tq in tqt:
    if '<unk>' not in tq:
        continue
    print(tq)

# Then the same scan over the answer texts.
tans = [question['answer_text'] for question in tqs]
for ta in tans:
    if '<unk>' not in ta:
        continue
    print(ta)

In [9]:
# Tokenise each training question on whitespace after stripping question marks,
# then collect every distinct token.
split_tqs = [text.replace("?", "").split() for text in tqt]
training_question_vocab = {word for words in split_tqs for word in words}

In [10]:
# Tokenise each validation question on whitespace after stripping question marks,
# then collect every distinct token.
split_qs = [text.replace("?", "").split() for text in qt]
val_question_vocab = {word for words in split_qs for word in words}

In [11]:
# Question tokens occurring only in validation (never seen in training questions).
val_only_vocab = val_question_vocab.difference(training_question_vocab)
val_only_vocab

{'toaster'}

In [12]:
# Distinct answer texts that occur in the training episodes.
unique_tas = set(tans)
unique_tas

{'bathroom',
 'bedroom',
 'black',
 'blue',
 'brown',
 'closet',
 'dining room',
 'family room',
 'foyer',
 'green',
 'grey',
 'hallway',
 'kitchen',
 'laundry room',
 'light blue',
 'living room',
 'lounge',
 'off-white',
 'office',
 'olive green',
 'orange yellow',
 'purple',
 'purple pink',
 'red',
 'red brown',
 'silver',
 'slate grey',
 'spa',
 'tan',
 'tv room',
 'white',
 'yellow',
 'yellow green',
 'yellow pink'}

In [13]:
# Training answers grouped by the question type they belong to.
color_room_answers = []
color_answers = []
location_answers = []

# Dispatch table: question type -> the list that collects its answers.
answers_by_type = {
    'color_room': color_room_answers,
    'color': color_answers,
    'location': location_answers,
}

for item in tqs:
    bucket = answers_by_type.get(item['question_type'])
    if bucket is None:
        # Question type outside the three known ones.
        print('uh oh')
    else:
        bucket.append(item['answer_text'])

In [14]:
# Number of distinct answers seen for each question type.
for label, answers in (
    ('color_room', color_room_answers),
    ('color', color_answers),
    ('location', location_answers),
):
    print(label, len(set(answers)))

color_room 20
color 11
location 14


In [15]:
# Answers that appear in validation but never in training.
val_only = unique_as.difference(unique_tas)
val_only

{'gym'}

In [16]:
# Share of each question type among the training questions, in percent.
tqtype = [question['question_type'] for question in tqs]
c = Counter(tqtype)
q_num = len(tqtype)
type_percentages = [(qtype, count / q_num * 100.0) for qtype, count in c.items()]
type_percentages

[('color_room', 69.85908141962422),
 ('color', 15.91858037578288),
 ('location', 14.222338204592901)]

In [17]:
# Share of each question type among the validation questions, in percent.
# Note: this rebinds q_num, c and type_percentages from the training computation.
vqtype = [question['question_type'] for question in qs]
c = Counter(vqtype)
q_num = len(vqtype)
type_percentages = [(qtype, count / q_num * 100.0) for qtype, count in c.items()]
type_percentages

[('color_room', 68.46153846153847),
 ('color', 17.692307692307693),
 ('location', 13.846153846153847)]

In [18]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler 

In [19]:
# Oversample the training episodes so the question types end up equally frequent.
train_episodes = train['episodes']
print(len(train_episodes))

# Labels are the question types; features are the episode records as a table.
y = [episode['question']['question_type'] for episode in train_episodes]
X = pd.DataFrame.from_dict(train_episodes)
print('Original dataset shape %s' % Counter(y))

# random_state is fixed so the resampling is repeatable.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X, y)
print('Resampled dataset shape %s' % Counter(y_res))

11496
Original dataset shape Counter({'color_room': 8031, 'color': 1830, 'location': 1635})
Resampled dataset shape Counter({'color_room': 8031, 'color': 8031, 'location': 8031})


In [20]:
# Number the resampled rows 0..n-1 in 'episode_id' and convert the table to a
# list of per-episode dicts (oversampling repeats rows, so any ids copied along
# with them would repeat too).
#
# The ids are assigned positionally from a plain range instead of a pd.Series:
# Series assignment aligns on index labels, so a non-default index on X_res
# would silently produce NaN or misnumbered ids.
#
# The isinstance guard keeps this cell re-runnable: X_res is rebound from a
# DataFrame to a list of records here, and a second run would otherwise fail
# when trying to set a column on a list.
if isinstance(X_res, pd.DataFrame):
    eps = range(len(X_res))
    X_res = X_res.assign(episode_id=eps).to_dict(orient="records")

In [21]:
# Bundle the resampled episodes with the original training vocabularies and
# write the result to boost_train.json.
new_data = dict(
    episodes=X_res,
    question_vocab=question_vocab_train,
    answer_vocab=answer_vocab_train,
)
with open('boost_train.json', 'w') as f:
    json.dump(new_data, f)

KeyboardInterrupt: 

In [None]:
# Making a mini-version of the dataset for local testing.
# Start by displaying one validation episode (index 50) to inspect its structure.
episodes[50]

In [None]:
# Matterport scenes to use in the mini-dataset (one scene file path per entry).
scenes = [
    'mp3d/1LXtFkjw3qL/1LXtFkjw3qL.glb',
    'mp3d/1pXnuDYAj8r/1pXnuDYAj8r.glb',
    'mp3d/2azQ1b91cZZ/2azQ1b91cZZ.glb',
    'mp3d/2n8kARJN3HM/2n8kARJN3HM.glb',
]

In [None]:
# Keep at most the first three validation episodes whose scene is in the chosen list.
mini_val_episodes = [episode for episode in episodes if episode['scene_id'] in scenes][:3]
len(mini_val_episodes)

In [None]:
# Mini validation set: the selected episodes plus the full validation vocabularies.
mini_val = dict(
    episodes=mini_val_episodes,
    question_vocab=question_vocab_val,
    answer_vocab=answer_vocab_val,
)

In [None]:
# Keep at most the first three training episodes whose scene is in the chosen list,
# and bundle them with the full training vocabularies.
mini_train_episodes = [episode for episode in train_episodes if episode['scene_id'] in scenes][:3]
mini_train = dict(
    episodes=mini_train_episodes,
    question_vocab=question_vocab_train,
    answer_vocab=answer_vocab_train,
)

In [None]:
# Number of training episodes kept for the mini dataset (the slice that builds the list caps it at 3).
len(mini_train_episodes)

In [None]:
# Write both mini datasets to JSON files, training first and then validation.
for out_path, payload in (('mini_train.json', mini_train), ('mini_val.json', mini_val)):
    with open(out_path, 'w') as f:
        json.dump(payload, f)

In [22]:
# Count the entries stored under 'shortest_paths' for validation episode 10.
len(episodes[10]['shortest_paths'])

1