Computing Projection - sanity check

In [20]:
import numpy as np
from scipy.linalg import lstsq

images = np.array([[1,1,1], [0,0,0]])

text = np.array([[2,2,2], [0,0,0]])

proj, _, _, _ = lstsq(text, images)

proj
# np.dot(text,proj)

array([[0.16666667, 0.16666667, 0.16666667],
       [0.16666667, 0.16666667, 0.16666667],
       [0.16666667, 0.16666667, 0.16666667]])

In [21]:
from data_loader import dataloaders as dataloader
from tqdm.notebook import trange, tqdm
from classes import CLASSES, CUSTOM_TEMPLATES, GENERIC_PROMPT_COLLECTIONS

In [22]:
# Load Image Data

d = dataloader.load_data(
    data_root="./datasets/ImageNet/",
    dataset="ImageNet_LT",
    phase="train",
    batch_size=128,
#     batch_size=1,
    sampler_dic=None,
    num_workers=12,
    type="random_prompts",
    prompt_set="ImageNet",
)
data = d[0]

Loading data from /nethome/bdevnani3/flash1/long_tail_lang/data/ImageNet_LT/ImageNet_LT_train.txt
Use data transformation: Compose(
    RandomResizedCrop(size=(224, 224), scale=(0.5, 1), ratio=(0.75, 1.3333), interpolation=bicubic)
    RandomHorizontalFlip(p=0.5)
    ColorJitter(brightness=[0.6, 1.4], contrast=[0.6, 1.4], saturation=[0.6, 1.4], hue=None)
    ToTensor()
    Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])
)
***********************DATASET: train random_prompts
************************* dict_keys(['default', 'ImageNet', 'bestImageNet'])
train 115846
No sampler.
Shuffle is True.


In [23]:
import torch.nn as nn
from clip import clip
import os
import torch


# Initialize CLIP models 
class TextEncoder(nn.Module):
    def __init__(self, clip_model):
        super().__init__()
        self.transformer = clip_model.transformer
        self.positional_embedding = clip_model.positional_embedding
        self.ln_final = clip_model.ln_final
        self.text_projection = clip_model.text_projection
        self.dtype = clip_model.dtype
        self.token_embedding = clip_model.token_embedding

    def forward(self, text):
        x = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx, d_model]

        x = x + self.positional_embedding.type(self.dtype)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x).type(self.dtype)

        # x.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection

        return x

def load_clip_to_cpu(visual_backbone):
    backbone_name = visual_backbone
    url = clip._MODELS[backbone_name]
    model_path = clip._download(url, os.path.expanduser("~/.cache/clip"))

    try:
        # loading JIT archive
        model = torch.jit.load(model_path, map_location="cpu").eval()
        state_dict = None

    except RuntimeError:
        state_dict = torch.load(model_path, map_location="cpu")

    model = clip.build_model(state_dict or model.state_dict())

    return model

clip_model = load_clip_to_cpu("RN50")

visual_model = torch.nn.DataParallel(clip_model.visual).cuda()

text_model = TextEncoder(clip_model)
text_model = torch.nn.DataParallel(text_model).cuda()

In [None]:
from tqdm.notebook import trange, tqdm

final_images, final_texts = [], []
final_labels = []

# count = {}

with torch.no_grad():
    for im, label, _, path in tqdm(data):
        x = visual_model(im.half()).float()
        x = x / x.norm(dim=-1, keepdim=True)
        final_images.append(x)

        templates = np.array(GENERIC_PROMPT_COLLECTIONS["ImageNet"])[path.cpu()]
        classnames_for_labels = np.array(CLASSES)[label.cpu()]

        texts = clip.tokenize(t.format(c) for t,c in zip(templates, classnames_for_labels))
        texts = texts.cuda()
        zeroshot_weights = text_model(texts).float()
        zeroshot_weights = zeroshot_weights / zeroshot_weights.norm(
            dim=-1, keepdim=True
        )
        final_texts.append(zeroshot_weights)
        final_labels.append(label)
#         count[label.item] +=1

  0%|          | 0/906 [00:00<?, ?it/s]

In [6]:
final_images = torch.cat(final_images, dim=0)
final_texts = torch.cat(final_texts, dim=0)
final_labels = torch.cat(final_labels, dim=0)

In [7]:
torch.save(final_images, "clip_embedded_images.pt")
torch.save(final_texts, "clip_embedded_texts.pt")
torch.save(final_labels, "clip_embedded_image_labels.pt")

In [8]:
import torch
final_images = torch.load("clip_embedded_images.pt")
final_texts = torch.load("clip_embedded_texts.pt")
final_labels = torch.load("clip_embedded_image_labels.pt")

In [9]:
final_images.shape

torch.Size([115846, 1024])

In [10]:
proj, _, _, _ = lstsq(final_texts.cpu(), final_images.cpu())
proj.shape
np.save("imagenet_text2img_proj.npy", proj)

Balanced projection matrix - 1 instance per class

In [11]:
balanced_images = np.zeros((1000, 1024))
balanced_texts = np.zeros((1000, 1024))

for i, label in tqdm(enumerate(final_labels)):
    balanced_images[label,:] = final_images[i,:].cpu()
    balanced_texts[label,:] = final_texts[i,:].cpu()

0it [00:00, ?it/s]

In [12]:
balanced_proj, _, _, _ = lstsq(balanced_texts, balanced_images)
balanced_proj.shape

(1024, 1024)

In [13]:
np.save("imagenet_text2img_balanced_proj.npy", balanced_proj.astype(float))

Balanced projection matrix - 100 instances per class

In [14]:
label_frequencies = {}
for label in final_labels:
    if label.item() not in label_frequencies:
        label_frequencies[label.item()] = 0
    label_frequencies[label.item()] += 1

indices_by_label = {}

for i, label in enumerate(final_labels):
    if label.item() not in indices_by_label:
        indices_by_label[label.item()] = []
    indices_by_label[label.item()].append(i)

In [15]:
import random 

upsampled_images = np.zeros((100000, 1024))
upsampled_texts = np.zeros((100000, 1024))

for i in tqdm(range(100000)):
    label = random.choice(range(1000))
    
    idx = random.choice(indices_by_label[label])
    upsampled_images[i] = final_images[idx].cpu()
    upsampled_texts[i] = final_texts[idx].cpu()


  0%|          | 0/100000 [00:00<?, ?it/s]

In [16]:
# upsampled_balanced_proj, _, _, _ = lstsq(upsampled_texts, upsampled_images)
# upsampled_balanced_proj.shape

In [17]:
# np.save("imagenet_text2img_upsampled_balanced_proj.npy", upsampled_balanced_proj.astype(float))

In [18]:
import random 

upsampled_images = np.zeros((400000, 1024))
upsampled_texts = np.zeros((400000, 1024))

for i in tqdm(range(400000)):
    label = random.choice(range(1000))
    
    idx = random.choice(indices_by_label[label])
    upsampled_images[i] = final_images[idx].cpu()
    upsampled_texts[i] = final_texts[idx].cpu()

# upsampled_balanced_proj, _, _, _ = lstsq(upsampled_texts, upsampled_images)
# upsampled_balanced_proj.shape

# np.save("imagenet_text2img_upsampled_balanced_proj400.npy", upsampled_balanced_proj.astype(float))

  0%|          | 0/400000 [00:00<?, ?it/s]

Balanced projection matrix - 100 instances/pooling of labels 

In [19]:
all_labels_text = {}

with torch.no_grad():
    for label in tqdm(range(1000)):
        all_labels_text[label] = []

        templates = np.array(GENERIC_PROMPT_COLLECTIONS["ImageNet"])
        c = np.array(CLASSES)[label]
            
        texts = clip.tokenize([template.format(c) for template in templates]) 
        texts = texts.cuda()
        zeroshot_weights = text_model(texts).float()
        zeroshot_weights = zeroshot_weights / zeroshot_weights.norm(
            dim=-1, keepdim=True
        )
        all_labels_text[label].append(zeroshot_weights)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
import random 

upsampled2_images = np.zeros((100000, 1024))
upsampled2_texts = np.zeros((100000, 1024))

for i in tqdm(range(100000)):
    label = random.choice(range(1000))
    
    idx = random.choice(indices_by_label[label])
    upsampled2_images[i] = final_images[idx].cpu()
    idx_2 = random.choice(range(82))
    upsampled2_texts[i] = all_labels_text[label][0][idx_2].cpu()

In [None]:
upsampled2_texts.shape

In [15]:
# upsampled2_balanced_proj, _, _, _ = lstsq(upsampled2_texts, upsampled2_images)
# upsampled2_balanced_proj.shape

(1024, 1024)

In [None]:
# np.save("imagenet_text2img_upsampled2_balanced_proj.npy", upsampled2_balanced_proj.astype(float))

In [None]:
import random 

upsampled2_images = np.zeros((200000, 1024))
upsampled2_texts = np.zeros((200000, 1024))

for i in tqdm(range(200000)):
    label = random.choice(range(1000))
    
    idx = random.choice(indices_by_label[label])
    upsampled2_images[i] = final_images[idx].cpu()
    idx_2 = random.choice(range(82))
    upsampled2_texts[i] = all_labels_text[label][0][idx_2].cpu()
    
# upsampled2_balanced_proj, _, _, _ = lstsq(upsampled2_texts, upsampled2_images)

# np.save("imagenet_text2img_upsampled2_balanced_proj200.npy", upsampled2_balanced_proj.astype(float))

In [None]:
import random 

upsampled2_images = np.zeros((400000, 1024))
upsampled2_texts = np.zeros((400000, 1024))

for i in tqdm(range(400000)):
    label = random.choice(range(1000))
    
    idx = random.choice(indices_by_label[label])
    upsampled2_images[i] = final_images[idx].cpu()
    idx_2 = random.choice(range(82))
    upsampled2_texts[i] = all_labels_text[label][0][idx_2].cpu()
    
# upsampled2_balanced_proj, _, _, _ = lstsq(upsampled2_texts, upsampled2_images)

# np.save("imagenet_text2img_upsampled2_balanced_proj400.npy", upsampled2_balanced_proj.astype(float))

Balanced projection matrix - 100 instances/pooling of labels/ VL-LTR text pool

In [None]:
desc_path = "/nethome/bdevnani3/flash1/long_tail_lang/data_loader/imagenet/wiki/desc_{}.txt"

all_labels_wiki_text = {}
all_labels_wiki_text_embs = {}

with torch.no_grad():
    for label in range(1000):
        all_labels_wiki_text[label] = []
        label_desc_path = desc_path.format(label)
        f = open(label_desc_path)
        for line in f:
            line = line.strip()
            if "==" in line:
                continue
            if len(line) == 0:
                continue
            all_labels_wiki_text[label].append(line[:76])
        texts = clip.tokenize(all_labels_wiki_text[label])
        texts = texts.cuda()
        zeroshot_weights = text_model(texts).float()
        zeroshot_weights = zeroshot_weights / zeroshot_weights.norm(
            dim=-1, keepdim=True
        )
        all_labels_wiki_text_embs[label] = zeroshot_weights
        f.close()
    

In [None]:
import random 

upsampled3_images = np.zeros((100000, 1024))
upsampled3_texts = np.zeros((100000, 1024))

for i in tqdm(range(100000)):
    label = random.choice(range(1000))
    
    idx = random.choice(indices_by_label[label])
    upsampled3_images[i] = final_images[idx].cpu()
    idx_2 = random.choice(range(len(all_labels_wiki_text[label])))
    upsampled3_texts[i] = all_labels_wiki_text_embs[label][idx_2].cpu()

In [None]:
# upsampled3_balanced_proj, _, _, _ = lstsq(upsampled3_texts, upsampled3_images)
# upsampled3_balanced_proj.shape

In [None]:
# np.save("imagenet_text2img_upsampled3_balanced_proj.npy", upsampled3_balanced_proj.astype(float))

Balanced projection matrix - 100 instances/pooling of labels/ VL-LTR text pool + templates

In [None]:
all_labels_wiki_and_template = {}

for label in all_labels_wiki_text_embs:
    all_labels_wiki_and_template[label] = torch.cat([all_labels_wiki_text_embs[label], all_labels_text[label][0]])
    

In [None]:
import random 

upsampled4_images = np.zeros((100000, 1024))
upsampled4_texts = np.zeros((100000, 1024))

for i in tqdm(range(100000)):
    label = random.choice(range(1000))
    
    idx = random.choice(indices_by_label[label])
    upsampled4_images[i] = final_images[idx].cpu()
    idx_2 = random.choice(range(len(all_labels_wiki_and_template[label])))
    upsampled4_texts[i] = all_labels_wiki_and_template[label][idx_2].cpu()

In [None]:
# upsampled4_balanced_proj, _, _, _ = lstsq(upsampled4_texts, upsampled4_images)
# upsampled4_balanced_proj.shape

In [None]:
# np.save("imagenet_text2img_upsampled4_balanced_proj.npy", 
#         upsampled4_balanced_proj.astype(float))



In [None]:
import random 

upsampled4_images = np.zeros((400000, 1024))
upsampled4_texts = np.zeros((400000, 1024))

for i in tqdm(range(400000)):
    label = random.choice(range(1000))
    
    idx = random.choice(indices_by_label[label])
    upsampled4_images[i] = final_images[idx].cpu()
    idx_2 = random.choice(range(len(all_labels_wiki_and_template[label])))
    upsampled4_texts[i] = all_labels_wiki_and_template[label][idx_2].cpu()
    
# upsampled4_balanced_proj, _, _, _ = lstsq(upsampled4_texts, upsampled4_images)
# upsampled4_balanced_proj.shape

# np.save("imagenet_text2img_upsampled4_balanced_proj.npy", 
#         upsampled4_balanced_proj.astype(float))


In [None]:
import random 

upsampled4_images = np.zeros((800000, 1024))
upsampled4_texts = np.zeros((800000, 1024))

for i in tqdm(range(800000)):
    label = random.choice(range(1000))
    
    idx = random.choice(indices_by_label[label])
    upsampled4_images[i] = final_images[idx].cpu()
    idx_2 = random.choice(range(len(all_labels_wiki_and_template[label])))
    upsampled4_texts[i] = all_labels_wiki_and_template[label][idx_2].cpu()
# upsampled4_balanced_proj, _, _, _ = lstsq(upsampled4_texts, upsampled4_images)
# upsampled4_balanced_proj.shape

# np.save("imagenet_text2img_upsampled4_balanced_proj800.npy", 
#         upsampled4_balanced_proj.astype(float))


Project to a combination of both text and image

In [28]:
import random 

upsampled5_images = np.zeros((800000, 1024))
upsampled5_texts = np.zeros((800000, 1024))

for i in tqdm(range(800000)):
    label = random.choice(range(1000))
    
    idx_2 = random.choice(range(len(all_labels_wiki_and_template[label])))
    upsampled5_texts[i] = all_labels_wiki_and_template[label][idx_2].cpu()
    
    cointoss = random.choice(range(2))
    
    if cointoss == 0:
        idx = random.choice(indices_by_label[label])
        upsampled5_images[i] = final_images[idx].cpu()
    else:
        idx = random.choice(range(len(all_labels_wiki_and_template[label])))
        upsampled5_images[i] = all_labels_wiki_and_template[label][idx].cpu()

  0%|          | 0/800000 [00:00<?, ?it/s]

KeyError: 994

In [25]:
upsampled5_balanced_proj, _, _, _ = lstsq(upsampled5_texts, upsampled5_images)
upsampled5_balanced_proj.shape

(1024, 1024)

In [26]:
np.save("imagenet_text2img_upsampled5_balanced_proj.npy", upsampled5_balanced_proj.astype(float))

Using Cupl texts

In [4]:
import json
import torch
cupl_texts = json.load(open("/nethome/bdevnani3/flash1/long_tail_lang/results_sklearn/cupl_image_prompts.json"))
final_images = torch.load("clip_embedded_images.pt")
final_texts = torch.load("clip_embedded_texts.pt")
final_labels = torch.load("clip_embedded_image_labels.pt")

RuntimeError: CUDA out of memory. Tried to allocate 454.00 MiB (GPU 0; 10.76 GiB total capacity; 452.52 MiB already allocated; 83.56 MiB free; 454.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [3]:
label_frequencies = {}
for label in final_labels:
    if label.item() not in label_frequencies:
        label_frequencies[label.item()] = 0
    label_frequencies[label.item()] += 1

indices_by_label = {}

for i, label in enumerate(final_labels):
    if label.item() not in indices_by_label:
        indices_by_label[label.item()] = []
    indices_by_label[label.item()].append(i)

NameError: name 'final_labels' is not defined

In [11]:
cupl_embeds = {}
with torch.no_grad():
    for key in tqdm(cupl_texts):
        txts = clip.tokenize(cupl_texts[key])
        txts = txts.cuda()
        zeroshot_weights = text_model(txts).float()
        zeroshot_weights = zeroshot_weights / zeroshot_weights.norm(
            dim=-1, keepdim=True
        )
        cupl_embeds[key] = zeroshot_weights

  0%|          | 0/998 [00:00<?, ?it/s]

In [15]:
import random 

images = np.zeros((800000, 1024))
texts = np.zeros((800000, 1024))

for i in tqdm(range(800000)):
    label = random.choice(range(1000))
    label_name = CLASSES[label]
    
    txt = random.choice(cupl_embeds[label_name])
    texts[i] = txt.cpu().detach().numpy()
    
#     idx = random.choice(indices_by_label[label])
#     images[i] = final_images[idx].cpu()
    
    cointoss = random.choice(range(2))
    
    if cointoss == 0:
        idx = random.choice(indices_by_label[label])
        images[i] = final_images[idx].cpu()
    else:
        txt = random.choice(cupl_embeds[label_name])
        images[i] = txt.cpu().detach().numpy()


  0%|          | 0/800000 [00:00<?, ?it/s]

In [16]:
proj, _, _, _ = lstsq(images, texts)
proj.shape
np.save("/nethome/bdevnani3/flash1/long_tail_lang/proj_matrices/cupl_l2_proj_mix.npy", 
        proj.astype(float))



Analysis of content of the projection matrices

In [None]:
upsampled2_balanced_proj

In [None]:
eigs = np.linalg.eig(upsampled2_balanced_proj)

In [None]:
w, v = eigs

In [None]:
for i in w:
    print(round(i,5))

In [None]:
print(min(w), max(w), np.std(w))

In [None]:
np.mean(w)

In [None]:
final_images.shape

In [None]:
final_texts.shape

In [None]:
final_labels

In [48]:
projected_texts = np.matmul(final_texts.cpu(), upsampled2_balanced_proj)

In [49]:
np.linalg.norm(projected_texts)

268.5127477473582

In [50]:
np.linalg.norm(final_images.cpu())

339.27646

In [51]:
np.linalg.norm(final_texts.cpu())

339.25623

In [207]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0,verbose=1, n_jobs=-1).fit(projected_texts, final_labels)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =      1025000     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  8.00236D+05    |proj g|=  1.16415D+03


 This problem is unconstrained.



At iterate   50    f=  2.66757D+05    |proj g|=  5.40155D+01

At iterate  100    f=  2.66095D+05    |proj g|=  1.68676D+00

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
*****    100    106      1     0     0   1.687D+00   2.661D+05
  F =   266095.48310057109     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 36.5min finished


In [208]:
clf.score(projected_texts, final_labels)

0.894748200196813

In [221]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0,verbose=1, n_jobs=-1).fit(final_texts.cpu(), final_labels)
clf.score(projected_texts, final_labels)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =      1025000     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  8.00236D+05    |proj g|=  1.16415D+03


 This problem is unconstrained.



At iterate   50    f=  1.31944D+05    |proj g|=  4.53975D+00

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
*****     97    100      1     0     0   3.988D-02   1.319D+05
  F =   131928.92824115991     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 49.7min finished


0.7241423959394369

L1 proj matrix

In [4]:
import numpy as np
from scipy.optimize import minimize

def fit(X, params):
    params = params.reshape((3,3))
    return np.matmul(X,params)

def cost_function(params, X, y):
    # L1 
    return np.sum(np.abs(y - fit(X, params)))

# Sanity 
images = np.array([[1,1,1], [0,0,0]])
text = np.array([[2,2,2], [0,0,0]])
x0 = np.random.rand(3,3)

output = minimize(cost_function, x0, args=(text, images))

proj = fit(text, output.x)
proj

array([[0.99999999, 0.99999999, 0.99999998],
       [0.        , 0.        , 0.        ]])

Project to a combination of both text and image - l1

In [4]:
# import random 

# images = np.zeros((400000, 1024))
# texts = np.zeros((400000, 1024))

# for i in tqdm(range(400000)):
#     label = random.choice(range(1000))
    
#     idx_2 = random.choice(range(len(all_labels_wiki_and_template[label])))
#     texts[i] = all_labels_wiki_and_template[label][idx_2].cpu()
    
#     cointoss = random.choice(range(2))
    
#     if cointoss == 0:
#         idx = random.choice(indices_by_label[label])
#         images[i] = final_images[idx].cpu()
#     else:
#         idx = random.choice(range(len(all_labels_wiki_and_template[label])))
#         images[i] = all_labels_wiki_and_template[label][idx].cpu()
    

In [6]:
images = np.load("proj_matrices/l1-var1_images.npy")

texts = np.load("proj_matrices/l1-var1_texts.npy")

In [8]:
import numpy as np
from scipy.optimize import minimize

def fit(X, params):
    params = params.reshape((1024,1024))
    return np.matmul(X,params)

def cost_function(params, X, y):
    # L1 
    return np.sum(np.abs(y - fit(X, params)))

In [39]:
import numpy as np
import importlib
importlib.reload(np)

x0 = np.random.rand(1024,1024)

output = minimize(cost_function, x0, args=(texts, images), method="CG")
l1_proj_1 = output.x
l1_proj_1.shape

np.save("proj_matrices/l1-var1.npy", 
        l1_proj_1.astype(float))

  _bootstrap._exec(spec, module)


MemoryError: Unable to allocate 8.00 TiB for an array with shape (1048576, 1048576) and data type float64

Similarity matrix

In [38]:
import numpy as np
from scipy.optimize import minimize
from scipy import spatial


def fit(X, params):
    params = params.reshape((3,3))
    return np.matmul(X,params)

def cost_function(params, X, y):
    # cosine
    return np.sum(1- sklearn.metrics.pairwise.cosine_similarity(y, x).diagonal())


# Sanity 
images = np.array([[1,1,1], [0,0,0]])
text = np.array([[2,2,2], [0,0,0]])
x0 = np.random.rand(3,3)

output = minimize(cost_function, x0, args=(text, images))

proj = fit(text, output.x)
proj

array([[3.92599493, 2.57051579, 2.60768697],
       [0.        , 0.        , 0.        ]])

In [40]:
np.linalg.norm(images)

1.7320508075688772