### Load MSCOCO Data

In [1]:
from refer import REFER
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import os
from PIL import Image as PImage # pillow
import torch
import cv2

In [2]:
data_root = 'coco'  # contains refclef, refcoco, refcoco+, refcocog and images
dataset = 'refcoco' 
splitBy = 'unc'
refer = REFER(data_root, dataset, splitBy)

loading dataset refcoco into memory...
creating index...
index created.
DONE (t=5.40s)


### Load the SAM Model

In [3]:
import sys
sys.path.append("..")
from segment_anything import sam_model_registry, SamPredictor

sam_checkpoint = "sam_vit_h_4b8939.pth"
model_type = "vit_h"

device = "cuda"

sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
sam.to(device=device)

predictor = SamPredictor(sam)

In [4]:
def show_mask(mask, ax, random_color=False):
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        color = np.array([30/255, 144/255, 255/255, 0.6])
    h, w = mask.shape[-2:]
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    ax.imshow(mask_image)
    
def show_points(coords, labels, ax, marker_size=375):
    pos_points = coords[labels==1]
    neg_points = coords[labels==0]
    ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
    ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)   
    
def show_box(box, ax):
    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2)) 

### Prepare the images

In [5]:
def image_prep(img_id, ann_id):
    
    #get image and bounding box
    img = refer.Imgs[img_id]
    bb = refer.Anns[ann_id]['bbox']
    fname = os.path.join(refer.IMAGE_DIR, img['file_name'])
    
    #load image 
    image = cv2.imread(fname)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    #load image into SAM
    predictor.set_image(image)

    #find midpoint and create points to load into SAM
    bbox = refer.Anns[ann_id]['bbox']
    bbox = [int(b) for b in bbox]
    x = (bbox[0] + (bbox[2]/2))
    y = (bbox[1] + (bbox[3]/2))
    input_point = np.array([[x, y]])
    input_label = np.array([1])
    #bbox: [x_min, y_min, width, height]
    input_box = np.array([bbox[0], bbox[1], bbox[0]+bbox[2], bbox[1]+bbox[3]]) #xyxy format
    
    #get and apply mask
    masks, scores, _ = predictor.predict(point_coords=input_point, point_labels=input_label,  box=input_box, multimask_output=True)
    index_max = np.argmax(scores)
    image[~masks[index_max],:] = [255,255,255]
    
    #Display masks 
    #for i, (mask, score) in enumerate(zip(masks, scores)):
    #    plt.figure(figsize=(10,10))
    #    plt.imshow(image)
    #    show_mask(mask, plt.gca())
    #    show_box(input_box, plt.gca())
    #    show_points(input_point, input_label, plt.gca())
    #    plt.title(f"Mask {i+1}, Score: {score:.3f}", fontsize=18)
    #    plt.axis('off')
    #    plt.show()
        
    #normalize image for processing
    xs = 224
    ys = 224
    if len(image) == 0: return None
    pim = PImage.fromarray(image)
    pim2 = pim.resize((xs,ys), PImage.Resampling.LANCZOS)
    img = np.array(pim2)
    
    if len(img.shape) < 3: return None
    
    img = img.reshape((1, img.shape[0], img.shape[1], img.shape[2]))
    
    return pim, img

### Train the model 

In [10]:
import clip
#device = "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load('ViT-B/32', device)
from collections import defaultdict as dd
from tqdm import tqdm_notebook as tqdm
import random

In [11]:
words_as_classifiers = dd(list) # use something like this dictionary to store positive examples
train_ids = refer.getRefIds(split='train')

for i in tqdm(train_ids):#[:1000]):
    # first, get all of the training dat
    ref = refer.Refs[i]
    
    # for a single train_id, you can get its image_id and the ann_id (i.e., the referring expression)
    img_id = ref['image_id']
    ann_id = ref['ann_id']
    
    img, _ = image_prep(img_id, ann_id)
    #prepare image to pass to clip
    if (img is not None):
        #img, _ = image_prep(img_id, ann_id)
    
        #then, you'll need to pass that image through a convnet like you did for A6
        img = preprocess(img).unsqueeze(0).to(device)
        enc_img = clip_model.encode_image(img)
    
        #optionally, you can call the compute_posfeats function to get some additional features
        #concatenate these to the convnet output to form a single vector for this image
        #pos_feats = compute_posfeats(img_id, ann_id)
        feature_vector = enc_img.detach().cpu().numpy()

        #add this feature vector to a list of positive examples for each word in the referring expression
        # you may need to flatten() the feature vector
        for sent in ref['sentences']:
            for word in sent['tokens']:
                words_as_classifiers[word].append(feature_vector)


#for word in words_as_classifiers:
#   print(word, len(words_as_classifiers[word]))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(train_ids):#[:1000]):


  0%|          | 0/42404 [00:00<?, ?it/s]

In [12]:
def find_negative_samples(words_as_classifiers, word):
    words = list(words_as_classifiers.keys())
    words.remove(word)
    random_word = random.choice(words)
    random_vector = random.choice(words_as_classifiers[random_word])
    return random_vector

In [13]:
# now that we have all of the positive examples for all of the words, we  need to find negative examples for each word

num_negatives = 2 
threshold = 4

wac = {}

for word in tqdm(words_as_classifiers):
    pos_vectors = words_as_classifiers[word]
    num_pos_vectors = len(pos_vectors)
    if num_pos_vectors < threshold:
        continue
#     print(word, num_pos_vectors)
    neg_vectors = []
    # the number of negative examples should be a function of how many positive examples there are
    for i in range(0,num_negatives*num_pos_vectors):
        neg_vectors.append(find_negative_samples(words_as_classifiers,word))
    neg_vectors = np.array(neg_vectors)
    pos_vectors = np.array(pos_vectors)
    neg_vectors = neg_vectors.reshape(neg_vectors.shape[0], neg_vectors.shape[2])
    pos_vectors = pos_vectors.reshape(pos_vectors.shape[0], pos_vectors.shape[2])
#     print(pos_vectors.shape, neg_vectors.shape)
    X = np.concatenate((pos_vectors, neg_vectors), axis=0)
    
    y = np.concatenate((np.ones(num_pos_vectors), np.zeros(len(neg_vectors))))
    wac[word] = (X,y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for word in tqdm(words_as_classifiers):


  0%|          | 0/9364 [00:00<?, ?it/s]

In [14]:
import pickle

In [15]:
with open('wac_li_Model.pickle', 'wb') as f:
    pickle.dump(wac, f, pickle.HIGHEST_PROTOCOL)

In [16]:
with open('wac_li_Model.pickle', 'rb') as f:
    wac = pickle.load(f)

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
# finally, train a binary classifier for each word
for word in wac:
    clfr = LogisticRegression(C=0.25, max_iter=1000)
    X,y = wac[word]
    clfr.fit(X,y)
    wac[word] = clfr

### Validate

In [19]:
eval_ids = refer.getRefIds(split='val')

In [20]:
def process_subimage(bbox, img, img_id, ann_id, xs=224,ys=224): 
    img, _ = image_prep(img_id, ann_id)
    img = preprocess(img).unsqueeze(0).to(device)
    enc_img = clip_model.encode_image(img)
    feature_vector = enc_img.detach().cpu().numpy()
    return feature_vector

In [21]:
def evaluate():
    score = 0
    total = 0 
    # step through the eval ids
    for i in tqdm(eval_ids):#[:10]):
   
        ref_id = i
        ref = refer.Refs[ref_id]
        #this is the gold annotation id for all of the sentences
        ann_id = ref['ann_id']  
        img_id = ref['image_id']
        img = refer.Imgs[img_id]
        # objs is a list of all of the object annotations for the image, including the gold
        objs = refer.imgToAnns[img_id] 
    
   
        features = {}
        for obj in objs:
            # object as feature vector
            features[obj['id']] = process_subimage(obj['bbox'], img, img_id, obj['id'])

        # apply all of the feature vectors to your trained classifiers for each word in the sentence
        for sent in ref['sentences']:
            total += 1
            pval = {oid: 1 for oid in features}
            for oid in features:
                feature = features[oid]
                if feature is not None:
                    for word in sent['tokens']: 
                        if (word in wac):
                            # multiply the classifier probabilities together for each word
                            pval[oid] *= wac[word].predict_proba(feature)[0][1]
                else:
                  pval[oid] = 0
         
            # find the object with the highest resulting multiplied probability, compare to gold 
            most_probable = max(pval, key=pval.get)
            if (most_probable == ann_id):
                score += 1   
    
    #return accuracy
    return score/total

In [22]:
validate_score = evaluate()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(eval_ids):#[:10]):


  0%|          | 0/3811 [00:00<?, ?it/s]

In [23]:
validate_score

0.5771644821857117

In [24]:
eval_ids = refer.getRefIds(split='test')

In [25]:
evaluate_score = evaluate()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(eval_ids):#[:10]):


  0%|          | 0/3785 [00:00<?, ?it/s]

In [26]:
evaluate_score

0.5825892857142857