In [1]:
import re
import random

import dask
import dask.array as da
import numpy as np
from nltk.corpus import stopwords

import corpus
import utterances

In [2]:
big_dfs = corpus.get_corpus_dfs("refcocoplus")
applied = corpus.get_classifications(big_dfs, "refcocoplus", which_split="train")
region_data, all_refexp = corpus.get_region_info(big_dfs, "refcocoplus")
all_applied, word2ind, X_idx, wordlist = applied["refcocoplus"]
refexp_list = [exp.split() for k, v in all_refexp.items() for exp in v]

In [3]:
pos_replacements = utterances.generate_pos_replacements(word2ind)
len(pos_replacements["NN"]), len(pos_replacements["JJ"])

(374, 139)

In [4]:
#spellcheck stuff
refexp_list = [re.sub(r"\st shirt\s", "tshirt", re.sub(r"\so clock\s", "o'clock", " ".join(x))) for x in refexp_list]

In [5]:
ic, ii = random.choice(list(all_refexp.keys()))
ic, ii

(1, 57185)

In [6]:
ex_exp = random.choice(list(all_refexp[(ic, ii)].keys()))
ri = all_refexp[(ic,ii)][ex_exp]
ri, ex_exp

(555404, 'shorts not running but partially bending over')

In [7]:
baseline = utterances.generate_baseline_utterances(pos_replacements)
len(baseline)

47426

In [8]:
specific = utterances.generate_specific_utterances(ex_exp, pos_replacements, word2ind)
if specific:
    for output in specific:
        baseline.add(frozenset(output))
len(baseline)

117178

In [9]:
def regionid2row(idx, ic, ii, ri):
    return idx[(ic, ii, ri)]

def exp2indseq(exp):
    return [word2ind[w] for w in exp.split() if w in word2ind]

stop_words = set(stopwords.words("english"))

In [38]:
regions = [ri for ri in region_data[(ic, ii)]]
regions_check = {regionid2row(X_idx, ic, ii, r): r for r in regions}
rows = list(regions_check.keys())
rows

[24796, 44046, 48334, 48820, 48938, 123693, 161785]

In [11]:
split_exp = [word for word in ex_exp.split() if word in word2ind and word not in stop_words]
split_exp

['shorts', 'running', 'partially', 'bending']

In [39]:
regions_check[rows[np.argmax(np.prod(all_applied[rows,:][:, exp2indseq(" ".join(split_exp))], axis=1)).compute()]]

1711478

# Start Here

In [47]:
baseline = list(baseline)
exp_ind_sequences = [exp2indseq(" ".join(x)) for x in baseline] #this maps from NLP stuff into the rows in the existing dask array I need
n_exp = len(exp_ind_sequences)
n_exp # this is the number of combinations of pre-computer classifiers I'm attempting to use 

117178

In [48]:
exp_arr = np.array(exp_ind_sequences,dtype=object)
exp_arr

array([list([452, 214]), list([333, 3]), list([332, 14]), ...,
       list([21, 229, 248, 315]), list([229, 290, 298, 21]),
       list([327, 201])], dtype=object)

In [49]:
def read_one_exp(ind_seq):
    return all_applied[:, ind_seq].prod(axis=1)

sample = read_one_exp(exp_arr[1,])
sample

Unnamed: 0,Array,Chunk
Bytes,1.27 MiB,4.48 kiB
Shape,"(166152,)","(574,)"
Count,9693 Tasks,605 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.27 MiB 4.48 kiB Shape (166152,) (574,) Count 9693 Tasks 605 Chunks Type float64 numpy.ndarray",166152  1,

Unnamed: 0,Array,Chunk
Bytes,1.27 MiB,4.48 kiB
Shape,"(166152,)","(574,)"
Count,9693 Tasks,605 Chunks
Type,float64,numpy.ndarray


In [50]:
read_them = dask.delayed(read_one_exp)
lazy_list = [read_them(x) for x in exp_arr]
sample = lazy_list[0].compute()
arrays = [da.from_delayed(lazy_arr, dtype=sample.dtype, shape=sample.shape) for lazy_arr in lazy_list]
stack = da.stack(arrays, axis=1)
stack

Unnamed: 0,Array,Chunk
Bytes,145.06 GiB,1.27 MiB
Shape,"(166152, 117178)","(166152, 1)"
Count,351534 Tasks,117178 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 145.06 GiB 1.27 MiB Shape (166152, 117178) (166152, 1) Count 351534 Tasks 117178 Chunks Type float64 numpy.ndarray",117178  166152,

Unnamed: 0,Array,Chunk
Bytes,145.06 GiB,1.27 MiB
Shape,"(166152, 117178)","(166152, 1)"
Count,351534 Tasks,117178 Chunks
Type,float64,numpy.ndarray


In [51]:
image_substack = stack[rows]
image_substack

Unnamed: 0,Array,Chunk
Bytes,6.26 MiB,56 B
Shape,"(7, 117178)","(7, 1)"
Count,468712 Tasks,117178 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 6.26 MiB 56 B Shape (7, 117178) (7, 1) Count 468712 Tasks 117178 Chunks Type float64 numpy.ndarray",117178  7,

Unnamed: 0,Array,Chunk
Bytes,6.26 MiB,56 B
Shape,"(7, 117178)","(7, 1)"
Count,468712 Tasks,117178 Chunks
Type,float64,numpy.ndarray


In [52]:
col_sums = image_substack.sum(axis=0, keepdims=True)
new_stack = image_substack / col_sums
new_stack

Unnamed: 0,Array,Chunk
Bytes,6.26 MiB,56 B
Shape,"(7, 117178)","(7, 1)"
Count,820246 Tasks,117178 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 6.26 MiB 56 B Shape (7, 117178) (7, 1) Count 820246 Tasks 117178 Chunks Type float64 numpy.ndarray",117178  7,

Unnamed: 0,Array,Chunk
Bytes,6.26 MiB,56 B
Shape,"(7, 117178)","(7, 1)"
Count,820246 Tasks,117178 Chunks
Type,float64,numpy.ndarray


In [53]:
row_sums = new_stack.sum(axis=1, keepdims=True)
newer_stack = new_stack / row_sums
newer_stack

Unnamed: 0,Array,Chunk
Bytes,6.26 MiB,56 B
Shape,"(7, 117178)","(7, 1)"
Count,1093665 Tasks,117178 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 6.26 MiB 56 B Shape (7, 117178) (7, 1) Count 1093665 Tasks 117178 Chunks Type float64 numpy.ndarray",117178  7,

Unnamed: 0,Array,Chunk
Bytes,6.26 MiB,56 B
Shape,"(7, 117178)","(7, 1)"
Count,1093665 Tasks,117178 Chunks
Type,float64,numpy.ndarray


In [54]:
final_col_sums = newer_stack.sum(axis=0, keepdims=True)
final_stack = newer_stack / final_col_sums
final_stack

Unnamed: 0,Array,Chunk
Bytes,6.26 MiB,56 B
Shape,"(7, 117178)","(7, 1)"
Count,1445199 Tasks,117178 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 6.26 MiB 56 B Shape (7, 117178) (7, 1) Count 1445199 Tasks 117178 Chunks Type float64 numpy.ndarray",117178  7,

Unnamed: 0,Array,Chunk
Bytes,6.26 MiB,56 B
Shape,"(7, 117178)","(7, 1)"
Count,1445199 Tasks,117178 Chunks
Type,float64,numpy.ndarray


In [56]:
exp_idx = baseline.index(frozenset(split_exp))

In [59]:
answer = final_stack[:,exp_idx].argmax(axis=0)

In [60]:
answer.compute()