In [1]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
import torch
import os, glob
import matplotlib.pyplot as plt
from collections import defaultdict
import pandas as pd
import re
from tqdm.notebook import tqdm
import torch.nn.functional as F

## model loading

In [2]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

cuda:0


In [3]:
# Load the CLIP model
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [4]:
# Load an image from a URL
image_path = os.path.join('samples/demo_houses/A_105/A_105_1.jpg')
image = Image.open(image_path)
# plt.imshow(image)

In [7]:
# Process the image and text
inputs = processor(text=["この写真はルームです"], images=image, return_tensors="pt", padding=True).to(device)
# Forward pass through CLIP
outputs = model(**inputs)

## testing output 

In [8]:
# Extract image and text features
 # ---> see here: https://huggingface.co/transformers/v4.8.0/model_doc/clip.html#transformers.CLIPTextModel
# features are after projection layers.
# cls are before projection layers
image_features = outputs.image_embeds
text_features = outputs.text_embeds
vision_model_cls = outputs.vision_model_output.pooler_output
text_model_cls = outputs.text_model_output.pooler_output

In [9]:
# check this for dimension: https://huggingface.co/transformers/v4.8.0/model_doc/clip.html#clipvisionconfig
print("Image Features:", image_features.shape) 
print("Text Features:", text_features.shape)
print('vision_model_output', vision_model_cls.shape)
print('text_model_output', text_model_cls.shape)

Image Features: torch.Size([1, 512])
Text Features: torch.Size([1, 512])
vision_model_output torch.Size([1, 768])
text_model_output torch.Size([1, 512])


## image and text data loading + processing

In [10]:
# load df containing text info
df_house_text = pd.read_csv('samples/demo_houses/scrapy_info_all_text.csv')
df_house_text[['house_code', 'text']]

Unnamed: 0,house_code,text
0,A_0,美しく、暮らしやすい上質なデザインを重ねた家。「妻と娘のために家をつくりたい」という思いを胸...
1,A_1,子育てのためのアイデアがたくさんウッディ＆ナチュラルな二世帯住宅。豊かな自然に囲まれた爽やか...
2,A_2,趣味空間の設置や便利な家事動線など３世帯が楽しく快適に暮らせる住まい。Ｓさまご夫妻に、Ｓさま...
3,A_3,約40帖の大空間と借景を活かす大開口二世帯６人が暮らす快適な平屋の住まい。東日本大震災で家が...
4,A_4,「愛着のある土地で暮らしたい」と三世帯が同居する店舗併用住宅を建築。築50年ほどの家の建て替...
...,...,...
517,F_50,木のぬくもり感があふれる居心地のいい住まい。内と外が緩やかにつながります。板張りの天井や吹抜...
518,F_51,ゆとりある上質空間で家族が安心して暮らせる家。子ども達ものびのび遊んでいます。ゆったりとくつ...
519,F_52,開放感あふれる平屋。木のぬくもりが、家族の時間を心地よく包みます。吹抜けの開放感が心地いいリ...
520,F_53,家族や友人との時間を楽しむ住まい。土間やテラスもお気に入りです。テラスや庭へとつながる開放感...


In [9]:
# use image path to find the text info
house_image_dict = defaultdict(list) # dict key is house code, value is image path as a list
house_text_dict = defaultdict(list) # dict is house code, value is a text list contains text scrapy from info sheet
for path in glob.glob('samples/demo_houses/*'):
    if os.path.isdir(path):
        house_code = os.path.basename(path)
        text =  df_house_text[df_house_text.house_code==house_code].text.iloc[0]
        split_text = re.split(r'[。\、]', text)
        house_text_dict[house_code] = split_text
        # print(house_text_dict[house_code])
        house_image_dict[house_code] = (glob.glob(os.path.join(path, '*.jpg')))
print(house_image_dict['A_105'])
print(house_text_dict['A_105'])

['samples/demo_houses/A_105/A_105_0.jpg', 'samples/demo_houses/A_105/A_105_1.jpg', 'samples/demo_houses/A_105/A_105_2.jpg', 'samples/demo_houses/A_105/A_105_3.jpg', 'samples/demo_houses/A_105/A_105_4.jpg']
['プライバシーを大切にした縦割り型二世帯住宅', 'Ｍさま＆Ａさま邸は', '完全分離型の二世帯住宅', 'ご両親のMさまが住む実家を建て替えて', '娘さまご家族が同居されました', '縦割りのプランなので', '両世帯の生活時間やスタイルが違っても', 'お互いの生活音が気にならず', '気兼ねなく暮らせます', '「ときどきワインを手に親世帯を訪ね', '父の手料理を楽しんでいます」と幸せそうに語る娘さまご夫妻です', 'デザインタイルの壁をアクセントに生かした子世帯のモダンなLDK', '白をベースにした明るくシンプルモダンな住空間を', '光沢を放つ濃い色のフローリングで引き締めた子世帯のLDK', '壁掛けテレビの左右に配したモダンなダークグレーのデザインタイル壁がインテリアを引き立てています', 'キッチンは前のカウンターを高くして', 'リビングから内部が見えないようにしています', '天然石調のアクセント壁が映える玄関ホール', '大理石調フロアも収納扉も「白」ですっきりと美しくコーディネートした子世帯の玄関ホール', '天然石調の調湿壁材を貼ったグレーのアクセント壁が', '照明に照らされて気品を放っています', 'リビングにしつらえた小上がりスペースはお子さま達の遊び場', '子世帯のLDKの一角には', '4畳半大の小上がりスペースをしつらえました', 'タイルカーペットは汚れが目立たず', 'お子さまの遊び場にぴったり', '「カウンターにお雛様や五月人形を飾れるようになったのもうれしいですね」とAさま', '広々とした子ども部屋は将来2部屋にできるプラン', '小さいうちは', 'みんなで伸びやかに遊べるよう', '子ども部屋は広々としたワンルームに', '将来は２部屋に間仕切りできるよう', 'あらかじめドア', '照明', '収

In [10]:
 house_text_dict['A_105'][0:5]

['プライバシーを大切にした縦割り型二世帯住宅',
 'Ｍさま＆Ａさま邸は',
 '完全分離型の二世帯住宅',
 'ご両親のMさまが住む実家を建て替えて',
 '娘さまご家族が同居されました']

In [None]:
# save text for each houses 


##  extraction function def

In [35]:
# todo: here every time we loop for one image and one sentence of text, which is slow. in future we can do batch processing
# set model to eval
# inputs = processor(text=["この写真はルームです"], images=image, return_tensors="pt", padding=True).to(device)
# outputs = model(**inputs)
# vision_model_cls = outputs.vision_model_output.pooler_output
# text_model_cls = outputs.text_model_output.pooler_output
model.eval()

# model = torch.compile(model)
# get function return embedding after the internal projections
def extract_vision_text_embeds(image_lst, text_lst, do_pooling=False, max_length=77):
    # text
    text_features = []
    for text in text_lst:
        text_inputs = processor(text=text, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
        # print(text, len(text_inputs.input_ids[0]))
        with torch.no_grad():
            # feature_per_text = model.get_text_features(**text_inputs)
            feature_per_text = model.text_model(**text_inputs).pooler_output
        text_features.append(feature_per_text)
    if do_pooling:
        text_features = torch.concatenate(text_features, axis=0).mean(dim=0, keepdim=False)
    # image part
    image_features = []
    for image in image_lst:
        image = Image.open(image)
        image_inputs = processor(images=image, return_tensors="pt", padding=True).to(device)
        with torch.no_grad():
            # feature_per_image = model.get_image_features(**image_inputs)
            feature_per_image = model.vision_model(**image_inputs).pooler_output
        image_features.append(feature_per_image)
    if do_pooling:
        image_features = torch.concatenate(image_features, axis=0).mean(dim=0, keepdim=True) # keep batch_size dim
    torch.cuda.empty_cache()
    return image_features, text_features
# print(image_features.shape, text_features.shape)

In [38]:
image_feature, text_feature = extract_vision_text_embeds(house_image_dict['A_105'], house_text_dict['A_105'])
print(house_image_dict['A_105'])
print(image_feature[0].shape, text_feature[0].shape)

['samples/demo_houses/A_105/A_105_0.jpg', 'samples/demo_houses/A_105/A_105_1.jpg', 'samples/demo_houses/A_105/A_105_2.jpg', 'samples/demo_houses/A_105/A_105_3.jpg', 'samples/demo_houses/A_105/A_105_4.jpg']
torch.Size([1, 768]) torch.Size([1, 512])


## actual feature extraction

In [40]:
save_dir = 'features_from_samples'
do_pooling = False
for key in tqdm(house_image_dict.keys()): 
    # key is house id, like A_XX
    if key not in house_text_dict.keys():
        print(f'key {key} found in image folder but not find in text info')
    else:
        image_feature_lst, text_feature_lst = extract_vision_text_embeds(house_image_dict[key], house_text_dict[key], do_pooling=do_pooling)
        if do_pooling:
            torch.save(image_feature_lst, os.path.join(save_dir, 'image_feature', f'image_embeds_{key}_all.pt'))
            torch.save(text_feature_lst, os.path.join(save_dir, 'text_feature', f'text_embeds_{key}_all.pt'))
        else:   
            for i, image_feature in enumerate(image_feature_lst):
                torch.save(image_feature, os.path.join(save_dir, 'image_feature', f'image_embeds_{key}_{i}.pt'))
            for i, text_feature in enumerate(text_feature_lst):
                torch.save(text_feature, os.path.join(save_dir, 'text_feature', f'text_embeds_{key}_{i}.pt'))

  0%|          | 0/38 [00:00<?, ?it/s]

## support set feature extraction

In [44]:
N = 3
cold_modern_lst = [f'rawdata/download/japanese_cold_modern_{i}.jpg' for i in range(0, N)]
warm_wooden_lst = [f'rawdata/download/japanese_warm_wooden_{i}.jpg' for i in range(0, N)]

for i, image in enumerate(cold_modern_lst):
    image_feature, text_feature = extract_vision_text_embeds([image], ['cold modern room'], do_pooling=False)
    torch.save(image_feature[0], os.path.join(save_dir, 'support_set', f'image_embeds_cold_modern_{i}.pt'))
    torch.save(text_feature[0], os.path.join(save_dir, 'support_set', f'text_embeds_cold_modern_{i}.pt'))
for i, image in enumerate(warm_wooden_lst):
    image_feature, text_feature = extract_vision_text_embeds([image], ['warm wooden room'], do_pooling=False)
    torch.save(image_feature[0], os.path.join(save_dir, 'support_set', f'image_embeds_warm_wooden_{i}.pt'))
    torch.save(text_feature[0], os.path.join(save_dir, 'support_set', f'text_embeds_warm_wooden_{i}.pt'))

## load back tensors to check

In [None]:
key = 'A_105'
image_embed = torch.load(os.path.join(save_dir, 'image_feature', f'image_embeds_{key}.pt'))
text_embed =  torch.load(os.path.join(save_dir, 'text_feature', f'text_embeds_{key}.pt'))
torch.set_printoptions(linewidth=200)

In [21]:
print(image_embed)
print(image_embed.shape)

tensor([-3.7337e-01,  1.9003e-01, -6.9516e-02, -1.1457e-02,  1.9513e-01, -1.3157e-02,  1.7577e-01,  3.9134e-02,  4.2398e-01,  9.9946e-02,  3.9106e-03,  6.3825e-02,  4.3441e-01, -3.2335e-01,
         2.6085e-01, -3.7312e-01, -8.3070e-01,  1.6355e-01,  9.0442e-02,  6.0854e-02,  4.1752e-01,  1.9539e-01,  1.2026e-01,  6.9418e-01,  6.0679e-02,  4.3729e-01,  2.6683e-01, -3.5112e-01,
         2.7913e-02,  1.1667e-01,  2.8116e-01,  5.0979e-01, -5.0666e-02, -1.3630e-01, -6.8704e-02,  3.6805e-01, -1.0557e-01,  1.0120e-01,  2.9587e-01,  8.5541e-01, -1.6069e-01, -9.1950e-02,
         3.1497e-01, -4.3999e-01, -1.2191e-01, -1.3662e+00, -2.5494e-01,  3.6645e-01, -7.0054e-03,  2.7787e-01,  9.2837e-02, -9.0491e-02,  3.3077e-01, -2.9311e-02, -1.7219e-01, -2.4799e-01,
        -8.5293e-02, -2.2850e-01,  2.5233e-01,  1.2796e-01, -1.0885e+00,  7.8988e-02,  6.1063e-03, -1.7262e-01, -3.3622e-01, -6.8049e-02,  6.2376e-01,  1.6483e-01,  1.3143e-01,  9.4645e-02,
        -1.4793e-01,  1.2977e-01, -5.9127e-02,  2.

In [22]:
print(text_embed)
print(text_embed.shape)

tensor([-2.7031e-01, -6.1202e-02,  3.1098e-02, -1.5160e-01,  2.5652e-01, -2.3826e-01,  2.8217e-02,  6.1855e-01, -5.3759e-02, -7.9562e-02, -2.6925e-01, -4.8184e-02,  1.2985e-02, -1.9597e-02,
         2.7128e-01,  3.1973e-01, -1.3195e-01,  1.0630e-01,  1.7645e-01, -9.3786e-02, -7.2736e-01,  2.1005e-01, -2.4777e-02, -1.3696e-01, -2.3984e-01, -2.2531e-01, -2.1538e-01,  2.3163e-01,
         5.3167e-02,  1.8508e-01, -3.0434e-01, -1.5426e-01,  1.8085e-01,  1.1121e-02,  2.6503e-01,  3.6078e-02, -1.5393e-01, -4.4108e-02,  8.9453e-02,  1.9732e-01, -2.3671e-01,  9.7554e-02,
         1.8903e-01,  3.3739e-01, -1.0120e-02,  7.8481e-02, -8.1103e-02,  4.5570e-02, -3.4252e-02,  1.6429e-01,  3.0462e-01,  1.3976e-01,  2.9961e-02, -6.2817e-03,  1.6870e-01, -1.8154e-01,
         3.6406e-01,  3.4184e-01,  8.1078e-02, -1.6566e-01,  4.3402e-01,  1.8811e-02, -1.7189e-01, -3.1717e-01,  1.2121e-02,  2.0171e-01,  1.0304e-01,  4.1010e-01,  3.1884e-01, -3.4952e-02,
        -1.6302e-01, -3.7488e-02, -1.5225e-01,  3.

In [25]:
cosine_sim = F.cosine_similarity(image_embed, text_embed, dim=0)
cosine_sim

tensor(0.2222, device='cuda:0')