In [1]:

import codecs
import json
import configparser
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from dotenv import load_dotenv
from sklearn import linear_model

%matplotlib inline

import dask.array as da
import dask.dataframe as dd
import h5py

import spacy
import lemminflect
# Load the spaCy English pipeline; `sp` is called on every referring expression further
# down to fill the "DepParse" column.
sp = spacy.load("en")
# Register the spatial words "left"/"right" as stop words; the vocabulary and noun-phrase
# filters below test `is_stop` on tokens.
sp.Defaults.stop_words.add("left")
sp.Defaults.stop_words.add("right")


import random
# Fixed seed so the random.shuffle calls used for statement generation are repeatable.
random.seed(42)

from time import localtime, strftime

from sklearn.utils import shuffle

In [2]:
# Number of leading identifier columns in each row of the feature matrix: the WACs are
# applied to columns ID_FEATS onward, and column ID_FEATS-1 is compared against region_id.
ID_FEATS = 3

In [3]:
# Load up config file (needs path; adapt env var if necessary); local imports
# load_dotenv() lets a local .env file supply VISCONF.
load_dotenv()

# Resolve the config file: the VISCONF environment variable wins; otherwise fall back to
# the default location relative to this notebook.
config_path = os.getenv('VISCONF')
if not config_path:
    default_path_to_config = '../Config/default.cfg'
    if os.path.isfile(default_path_to_config):
        config_path = default_path_to_config

# Fail early with an actionable message. The truthiness test (instead of `is not None`)
# also rejects VISCONF being set to an empty string, and an explicit raise is not
# stripped when Python runs with -O the way an assert would be.
if not config_path:
    raise FileNotFoundError(
        'You need to specify the path to the config file via environment variable VISCONF.')

config = configparser.ConfigParser()
with open(config_path, 'r', encoding='utf-8') as f:
    config.read_file(f)

corpora_base = config.get('DEFAULT', 'corpora_base')
preproc_path = config.get('DSGV-PATHS', 'preproc_path')
dsgv_home = config.get('DSGV-PATHS', 'dsgv_home')

# Project-local helper modules live under dsgv_home; they only become importable once
# their directories are on sys.path, so these imports cannot move to the top import cell.
sys.path.insert(0,dsgv_home + "/Utils")
from utils import icorpus_code, plot_labelled_bb, get_image_filename, query_by_id
from utils import plot_img_cropped, plot_img_ax, invert_dict, get_a_by_b, get_image_part
sys.path.insert(0,dsgv_home + "/WACs/WAC_Utils")
from wac_utils import create_word2den, is_relational, filter_refdf_by_filelist, filter_relational_expr
from wac_utils import filter_X_by_filelist, make_mask_matrix, make_X_id_index, train_this_word
from wac_utils import get_X_for_word

from data_utils import load_dfs

from apply_utils import apply_wac_set_matrix, logreg

#sys.path.append(dsgv_home + '/Preproc')

In [4]:
# Load up preprocessed DataFrames. Slow!
# These DataFrames are the result of pre-processing the original corpus data,
# as per dsg-vision/Preprocessing/preproc.py
# The result `df` is looked up by DataFrame name further down (e.g. df['refcoco_refdf']).

df_names = ['refcoco_refdf', 'refcocoplus_refdf']
df = load_dfs(preproc_path, df_names)

In [5]:
# Split definitions for RefCOCO; the 'testA' and 'testB' entries are concatenated below and
# used as the file list for filtering. JSON is read explicitly as UTF-8 so the result does
# not depend on the platform's default encoding.
with open(preproc_path + '/refcoco_splits.json', 'r', encoding='utf-8') as f:
    rc_splits = json.load(f)

## RefCoco

In [6]:
# Previous (VGG) settings, kept for reference:
# mscoco_bbdf_pattern = '/Volumes/BigData_SSD/Data/Computed/ExtractOut/vgg/mscoco_bbdf_vgg19-fc2/mscoco_bbdf_vgg19-fc2_%d.hdf5'
# model_path_prefix = '../TrainWACs/ModelsOut/01_refcoco_vgg'
# Feature shard files; '%d' is filled with the shard number when the files are opened.
mscoco_bbdf_pattern = '../../data/Models/ForBToma/mscoco_bbdf_rsn50-max/mscoco_bbdf_rsn50-max_%d.hdf5'
# Common prefix of the trained model files; '.hdf5' (weights) and '.json' (parameters and
# word list) are appended when they are loaded below.
model_path_prefix = '../../data/Models/ForBToma/01_refcoco_rsn'

### Load Image Features

In [7]:
# das: one lazy dask array per feature shard.
# fhs: the open HDF5 handles, kept so they can be closed once the data is in memory.
das = []
fhs = []
# Shards are numbered 1..7.
for n in range(1,8):
    f = h5py.File(mscoco_bbdf_pattern % (n), 'r')
    fhs.append(f)
    # NOTE(review): the chunk width 4106 is larger than the 2058 columns reported by X.shape
    # below — presumably left over from the commented-out VGG feature files; confirm.
    das.append(da.from_array(f['img_feats'], chunks=(1000, 4106)))

In [8]:
# Join the per-shard arrays into a single (still lazy) feature matrix.
X = da.concatenate(das)

In [9]:
# Sanity check: overall shape of the concatenated feature matrix.
X.shape

(602408, 2058)

In [10]:
# Evaluate on testA and testB combined (the earlier inline note listed rc_splits['val'] as
# the alternative). Both the feature matrix and the referring-expression frame are
# restricted to that file list.
rc_all_test = rc_splits['testA'] + rc_splits['testB']
X_ts = filter_X_by_filelist(X, rc_all_test)
refdf_test = filter_refdf_by_filelist(df['refcoco_refdf'], rc_all_test)

In [11]:
# this is small enough to be fully in memory
# (.compute() evaluates the lazy selection; X_ts is rebound to the in-memory result)
X_ts = X_ts[:].compute()

In [12]:
# Meaning that I can already close the file handles
# (X_ts has been computed above, so the HDF5 shards are no longer read from here on).
for fh in fhs:
    fh.close()

In [13]:
# Build the test-set lookup structures with the wac_utils helpers:
#   word2den_ts    - derived from the test referring expressions; its keys are used as the
#                    word list for the mask matrix (presumably word -> denotations; see
#                    create_word2den)
#   X_idx_ts       - index over the rows of X_ts; passed as `idx` to eval_corpus, where its
#                    keys are matched on (corpus, image id)
#   mask_matrix_ts - built from X_ts, the index and word2den_ts for every word in word2den_ts
word2den_ts = create_word2den(refdf_test)
X_idx_ts = make_X_id_index(X_ts)
mask_matrix_ts = make_mask_matrix(X_ts, X_idx_ts, word2den_ts, word2den_ts.keys())

### Load WACs

In [14]:
def exp2indseq(w2i, exp):
    """Map a referring expression to the vocabulary indices of its known words.

    Args:
        w2i: mapping from word to its index in the word list.
        exp: the expression as a string; it is split on whitespace.

    Returns:
        A list with one index per word of ``exp`` that occurs in ``w2i``, in expression
        order. Words missing from ``w2i`` are skipped.
    """
    # Look words up in the mapping that was passed in. The body used to read the
    # notebook-global `word2ind` instead, silently ignoring the argument and failing with
    # a NameError whenever that global had not been defined yet.
    return [w2i[w] for w in exp.split() if w in w2i]

In [15]:
def imageid2rows(idx, ic, ii):
    '''Collect the row indices of every region that belongs to one image.

    idx maps key tuples to row indices into X; an entry is selected when the first
    element of its key equals ic and the second equals ii. The rows are returned as
    a list, in the iteration order of idx.
    '''
    # (open question from the original author: should this be a separate dictionary?)
    matching_rows = []
    for key, x_row in idx.items():
        if key[0] == ic and key[1] == ii:
            matching_rows.append(x_row)
    return matching_rows

In [16]:
# Read the trained WAC weights from the model's HDF5 file; the file is closed again as
# soon as the data has been copied out.
with h5py.File(model_path_prefix + '.hdf5', 'r') as f:
    wacs = f['wac_weights'][:]   # slice, to actually read into memory (as ndarray)

In [19]:
# The model's JSON side-car holds a two-element list: the model parameters and the word
# list. It is read explicitly as UTF-8 (the previous codecs.open call passed no encoding,
# so the platform default was used).
with open(model_path_prefix + '.json', 'r', encoding='utf-8') as f:
    modelpars, wordlist = json.load(f)

### Apply all WACs to all regions

In [20]:
# Apply every WAC to every test region. Only columns ID_FEATS onward of X_ts are passed in
# (the leading identifier columns are dropped) and the weight matrix is transposed.
# Per the original note, the result is a matrix whose rows are entities and whose columns
# are words.
all_applied = apply_wac_set_matrix(X_ts[:, ID_FEATS:], wacs.T, net=logreg)

In [21]:
# Dictionary from word to its position in wordlist; the word is the first element of each
# wordlist entry.
word2ind = {w[0]:n for n,w in enumerate(wordlist)}

In [22]:
def eval_corpus(rfdf, X, idx, w2i, all_applied):
    """Rank the regions of each image against each referring expression.

    Args:
        rfdf: DataFrame with the columns i_corpus, image_id, region_id and refexp.
        X: 2-D array of regions; column ID_FEATS-1 is compared against region_id.
        idx: mapping from key tuples (first two elements: corpus, image id) to row
            indices into X and all_applied.
        w2i: mapping from word to its column in all_applied.
        all_applied: matrix of classifier responses, rows = regions, columns = words.

    Returns:
        DataFrame with one row per evaluated expression and the columns i_corpus,
        image_id, region_id, refexp, is_rel, rank (0 = target region ranked first),
        n_obj (number of candidate regions) and perc_cov (share of the expression's
        words that are in w2i; 0.0 for an expression without words).

        Expressions whose image has no rows in idx, or whose target region is not
        among the image's regions, are reported on stdout and left out of the result.
        Previously such rows were appended with the rank of the preceding expression
        (or raised NameError if they came first).
    """
    # Group the X rows by (corpus, image) once, instead of scanning all of idx for
    # every expression.
    rows_by_image = defaultdict(list)
    for key, x_row in idx.items():
        rows_by_image[(key[0], key[1])].append(x_row)

    out = []
    for _, row in rfdf.iterrows():
        ic, ii, ri, refexp = row['i_corpus image_id region_id refexp'.split()]
        # all regions that belong to this image
        all_regs = rows_by_image.get((ic, ii), [])
        if not all_regs:
            print(ic, ii, ri, '\t region not in X?')
            continue
        # the word-list indices of the expression's known words
        this_exp_seq = exp2indseq(w2i, refexp)
        # per region: product of the responses of all words in the expression
        all_regs_applied = np.prod(all_applied[all_regs][:, this_exp_seq], axis=1)
        # X row indices of the image's regions, best-scoring first
        regions_ranked = np.array(all_regs)[np.argsort(all_regs_applied)[::-1]]
        # position of the target region within that ranking
        hits = np.where(X[regions_ranked][:, ID_FEATS-1] == ri)[0]
        if hits.size == 0:
            print(ic, ii, ri, '\t region not in X?')
            continue
        this_rank = hits[0]
        out_of = len(all_regs)  # the number of candidate regions
        n_words = len(refexp.split())
        # share of the expression's words that have a classifier
        perc_cov = len(this_exp_seq) / n_words if n_words else 0.0
        out.append((ic, ii, ri, refexp, is_relational(refexp), this_rank, out_of,
                    perc_cov))
    return pd.DataFrame(out, columns='i_corpus image_id region_id refexp is_rel rank n_obj perc_cov'.split())

In [23]:
# Dependency-parse each referring expression with spaCy and keep the parse in a new
# "DepParse" column, to make generation easier further down.
# Note that this adds the column to refdf_test in place.
refdf_test["DepParse"] = refdf_test.apply(lambda row: sp(row.refexp), axis=1)

In [24]:
# Per-image lookup tables built from the parsed test expressions:
#   ref_dict[image_id]  : {referring expression text -> row label in refdf_test}
#                         (if an image has the same expression text twice, the last row wins)
#   ref_vocab[image_id] : the tokens of those expressions that are in word2ind and are
#                         not stop words
# A single groupby pass (sort=False keeps first-appearance order) replaces re-filtering
# the whole frame once per image.
ref_dict = {
    i: dict(zip(image_rows["refexp"], image_rows.index))
    for i, image_rows in refdf_test.groupby("image_id", sort=False)
}
ref_vocab = defaultdict(set)
for k, v in ref_dict.items():
    for exp in v:
        for word in refdf_test.loc[v[exp]]["DepParse"]:
            if word.text in word2ind and not word.is_stop:
                ref_vocab[k].add(word.text)

In [25]:
def get_good_nps(image_id):
    """Collect the usable noun phrases from all referring expressions of one image.

    A noun chunk is used when its root token is in word2ind and is not a stop word.
    The phrase text is the chunk's tokens joined by single spaces, with every token
    whose text is "the" left out and the root token replaced by its "NN" inflection.

    Returns the phrases as a set of strings.
    """
    phrases = set()
    for row_label in ref_dict[image_id].values():
        parsed = refdf_test.loc[row_label]["DepParse"]
        for chunk in parsed.noun_chunks:
            head = chunk.root
            if head.text not in word2ind or head.is_stop:
                continue
            words = []
            for token in chunk:
                if token.text == "the":
                    continue
                if token == head:
                    words.append(token._.inflect("NN"))
                else:
                    words.append(token.text)
            phrases.add(" ".join(words))
    return phrases

In [26]:
# Assemble one row per test image, indexed by image_id:
#   ref_exps - the image's entry from ref_dict
#   vocab    - the image's entry from ref_vocab
#   NPs      - the noun phrases returned by get_good_nps
image_ids = list(refdf_test.image_id.unique())
ref_dict_per_id = [ref_dict[x] for x in image_ids]
ref_voc_per_id = [ref_vocab[x] for x in image_ids]
good_nps_per_id = [get_good_nps(x) for x in image_ids]
im_ref_df = pd.DataFrame({"image_id": image_ids, "ref_exps": ref_dict_per_id, "vocab": ref_voc_per_id, "NPs": good_nps_per_id}).set_index("image_id")


In [27]:
# Preview the per-image table.
im_ref_df.head()

Unnamed: 0_level_0,ref_exps,vocab,NPs
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
581563,"{'lower left corner darkness': 0, 'bpttom left...","{dark, corner, lower, black, van}",{black van}
581518,"{'top donut': 8, 'top of donuts': 7, 'donut wc...","{second, donut, middle, sprinkles}","{sprinkle, donut, top donut, middle donut}"
581346,"{'left zebra butt': 13, 'zebra on the left': 1...","{zebra, butt}","{zebra, right zebra butt, zebra right butt, ze..."
581282,"{'person bottom left': 18, 'left black shirt':...","{shirt, person, black}",{black shirt}
580668,"{'man on right': 20, 'right man': 21, 'man sit...","{man, person, yellow, car, white, bench, sitti...","{left car, man, person, right man, bench, whit..."


In [28]:
# Union of the noun phrases of all images; used as the candidate pool for false statements.
all_nps = set().union(*good_nps_per_id)

In [35]:
def generate_true_exp(rfdf, idx: """image id""", n: """maximum number of statements per image"""):
    """Build up to n "There is a/an <noun phrase>." statements for one image.

    The noun phrases are taken from the "NPs" entry of row idx in rfdf and shuffled
    with the random module; fewer than n statements are returned when the image has
    fewer noun phrases. The article is "an" when the phrase starts with a lowercase
    vowel letter and "a" otherwise.
    """
    candidates = list(rfdf.loc[idx]["NPs"])
    random.shuffle(candidates)
    statements = []
    # Phrases are consumed from the tail of the shuffled list.
    for phrase in reversed(candidates):
        if len(statements) >= n:
            break
        article = "an" if phrase.startswith(('a','e','i','o','u')) else "a"
        statements.append(f"There is {article} {phrase}.")
    return statements


def generate_false_exp(rfdf, every_np, idx: """image id""", n: """number of statements per image"""):
    """Build up to n "There is a/an <noun phrase>." statements that should be false for an image.

    Args:
        rfdf: per-image DataFrame; row idx must have a "vocab" entry (set of words used
            for the image) and may have an "NPs" entry (the image's own noun phrases).
        every_np: iterable of candidate noun phrases (strings) from all images.
        idx: index label of the image in rfdf.
        n: maximum number of statements to return.

    Returns:
        A list of at most n statements. Candidates are visited in random order (random
        module). A candidate is rejected when any of its words is in the image's vocab,
        or when it is one of the image's own noun phrases. The second test matters
        because noun-phrase heads are inflected (e.g. "sprinkle") while the vocab holds
        the raw tokens ("sprinkles"), so the word test alone lets true statements through.
    """
    # Look the image's row up once instead of once per candidate phrase.
    image_row = rfdf.loc[idx]
    image_vocab = image_row["vocab"]
    own_nps = image_row.get("NPs", ())

    lies = []
    eligible_NPs = list(every_np)
    random.shuffle(eligible_NPs)
    for phrase in eligible_NPs:
        if len(lies) >= n:
            break
        if phrase in own_nps:
            continue
        if any(word in image_vocab for word in phrase.split()):
            continue
        article = "an" if phrase.startswith(('a','e','i','o','u')) else "a"
        lies.append(f"There is {article} {phrase}.")
    return lies

In [36]:
# Generate up to two true and up to two false statements per image and store them as
# the new "Positives" / "Negatives" columns (im_ref_df is extended in place).
# Both generators shuffle with the random module, so the result depends on the seed set
# in the first cell and on how often random has been used since.
pos = [generate_true_exp(im_ref_df, x, 2) for x in im_ref_df.index.to_series()]
neg = [generate_false_exp(im_ref_df, all_nps, x, 2) for x in im_ref_df.index.to_series()]
im_ref_df["Positives"] = pos
im_ref_df["Negatives"] = neg

In [37]:
# Preview the table again, now including the generated statements.
im_ref_df.head()

Unnamed: 0_level_0,ref_exps,vocab,NPs,Positives,Negatives
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
581563,"{'lower left corner darkness': 0, 'bpttom left...","{dark, corner, lower, black, van}",{black van},[There is a black van.],"[There is a screen., There is a cow.]"
581518,"{'top donut': 8, 'top of donuts': 7, 'donut wc...","{second, donut, middle, sprinkles}","{sprinkle, donut, top donut, middle donut}","[There is a top donut., There is a donut.]","[There is a rightmost person., There is a dark..."
581346,"{'left zebra butt': 13, 'zebra on the left': 1...","{zebra, butt}","{zebra, right zebra butt, zebra right butt, ze...","[There is a zebra right butt., There is a righ...","[There is a center person., There is an orange..."
581282,"{'person bottom left': 18, 'left black shirt':...","{shirt, person, black}",{black shirt},[There is a black shirt.],"[There is a red chair., There is a front bird.]"
580668,"{'man on right': 20, 'right man': 21, 'man sit...","{man, person, yellow, car, white, bench, sitti...","{left car, man, person, right man, bench, whit...","[There is a left car., There is a bench.]","[There is a brown teddy bear., There is a red ..."


In [None]:
def new_eval_corpus(rfdf, X, idz, w2i, all_applied):
    """Planned evaluation for quantified statements. Not implemented yet: returns None.

    The parameters mirror eval_corpus. The implementation plan is kept in the
    comments below.
    """
    # NOTE(review): `idz` is presumably a typo for `idx` (cf. eval_corpus) — confirm
    # before implementing.
    pass
    # go through and get classifications for each bounding box -- ideally for a specific expression but maybe for all of them?
    # for each picture (and each expression? or just the one of interest for that pic) create a set of bounding box entities described by that expression (if boolean) or a list of probabilities
    # then use the n for quantification that you're looking at and either produce a boolean based on whether there are at least n entities in the set, or use the prob_at_least function from semparse to assess the probability of there being at least n entities based on the probabilities for each entity
    # produce a new dataframe with pictures, expressions, and booleans/probabilities (as well as the perc_cov and is_rel values as before)

In [None]:
# Rank the candidate regions for every test referring expression.
outdf = eval_corpus(refdf_test, X_ts, X_idx_ts, word2ind, all_applied)

In [None]:
# Preview the per-expression results.
outdf.head()

In [None]:
# Accuracy@1: share of expressions whose target region has rank 0.
np.sum(outdf['rank'] == 0) / len(outdf)

In [None]:
# Accuracy@3: share of expressions whose target region has a rank below 3.
np.sum(outdf['rank'] < 3) / len(outdf)

In [None]:
def score_outdf_(subdf):
    """Print four summary metrics for a result frame with 'rank' and 'n_obj' columns.

    Printed, one per line: accuracy @1 (rank == 0), accuracy @3 (rank < 3), the mean
    reciprocal rank (1.0 would be always first) and the random baseline
    (1 / mean number of candidate regions). Returns None.
    """
    ranks = subdf['rank']
    n_rows = len(subdf)

    # share of rows where the correct entity is the top one
    top1 = np.sum(ranks == 0) / n_rows
    print(f'accuracy @1: {top1:.2}')

    # share of rows where the correct entity is within the top three
    top3 = np.sum(ranks < 3) / n_rows
    print(f'accuracy @3: {top3:.2}')

    mrr = np.mean(1 / (ranks + 1))
    print(f'mean reciprocal rank: {mrr:.2}')

    # chance level, for comparison
    chance = 1 / np.mean(subdf['n_obj'])
    print(f'random baseline: {chance:.3}')


def score_outdf(outdf):
    """Print the score_outdf_ metrics for three slices of the results.

    The slices are: all rows ('full'), rows whose is_rel is False ('NR'), and rows
    whose is_rel is False and whose perc_cov is at least 0.5 ('NR, cov > 0.5').
    """
    print('** full')
    score_outdf_(outdf)

    # leave out the relational expressions
    print('** NR')
    non_relational = outdf['is_rel'] == False
    score_outdf_(outdf[non_relational])

    # additionally leave out expressions with more than half their words not covered
    # by our classifiers
    print('** NR, cov > 0.5')
    well_covered = outdf['perc_cov'] >= 0.5
    score_outdf_(outdf[non_relational & well_covered])