# Visuosyntactic Analyses

In [1]:
# Set CORENLP_HOME for this kernel. The value matches the directory that
# stanza.install_corenlp is pointed at in the next cell (presumably this is
# where stanza looks up the CoreNLP jars -- confirm against the stanza docs).
%env CORENLP_HOME=stanford-corenlp-4.5.4

env: CORENLP_HOME=stanford-corenlp-4.5.4


In [2]:
from stanza.server import CoreNLPClient
import stanza

# Fetch CoreNLP into the local directory, then configure the client that the
# parsing cell later uses through client.annotate(...).
stanza.install_corenlp(dir='stanford-corenlp-4.5.4')
client = CoreNLPClient(
    annotators=[
        'tokenize',
        'ssplit',
        'pos',
        'lemma',
        'ner',
        'parse',
        'depparse',
        'coref',
    ],
    timeout=30000,  # presumably milliseconds -- confirm against the stanza docs
    memory='6G',
)

2023-07-07 00:32:43 INFO: Writing properties to tmp file: corenlp_server-8e8c4b3e34ad4e6a.props


# Generate DAAM Maps

In [7]:
from pathlib import Path
import json

# Read the COCO caption annotations. The context manager closes the file
# handle deterministically (the previous bare .open() left it to the garbage
# collector), and the explicit UTF-8 encoding avoids locale-dependent decoding.
with Path('coco/annotations/captions_val2014.json').open(encoding='utf-8') as annotations_file:
    annotations = json.load(annotations_file)

In [11]:
# Show the top-level sections of the loaded annotations file.
annotations.keys()

dict_keys(['info', 'images', 'licenses', 'annotations'])

In [13]:
import pandas as pd

# Tabulate the entries under annotations['annotations']; the generation loop
# further down reads the image_id and caption fields of each row.
df = pd.DataFrame(annotations['annotations'])

In [15]:
# Create the output directory (and any missing parents) without shelling out,
# so the cell also works where `mkdir -p` is unavailable. Safe to re-run.
Path('experiments/visuosyntax').mkdir(parents=True, exist_ok=True)

In [17]:
# Draw a fixed-size subset of captions without replacement. The explicit
# random_state makes the subset identical after Restart & Run All; without it
# every fresh run would pick a different set of 1500 captions.
df = df.sample(n=1500, replace=False, random_state=0)

In [22]:
import torch

# Enable CUDA autocast for the remainder of the kernel session. The context is
# entered by hand (rather than via `with`) so that it stays active across
# cells; the handle is kept so it can be closed later with
# autocast_ctx.__exit__(None, None, None).
# torch.autocast('cuda') is the device-generic spelling of the deprecated
# torch.cuda.amp.autocast().
autocast_ctx = torch.autocast('cuda')
autocast_ctx.__enter__()

# Inference only: turn off gradient tracking globally. The trailing ';'
# suppresses the echoed return value in the notebook.
torch.set_grad_enabled(False);

In [None]:
from diffusers import StableDiffusionPipeline
from daam import set_seed, trace

# Load the Stable Diffusion 2.1 (base) pipeline by its model identifier; this
# is the pipeline that gets traced in the generation loop below.
pipe = StableDiffusionPipeline.from_pretrained('stabilityai/stable-diffusion-2-1-base')

In [37]:
# Move the pipeline to the first CUDA device. The trailing ';' keeps the
# notebook from echoing the call's return value.
pipe.to('cuda:0');

In [None]:
from tqdm import tqdm

# Every experiment is written beneath the same folder, so resolve it once.
output_folder = Path('experiments/visuosyntax')

for _, row in tqdm(df.iterrows(), total=len(df)):
    image_id = row.image_id
    caption = row.caption

    # The image id doubles as the seed, so each caption's generation is
    # tied to a fixed seed value.
    gen = set_seed(image_id)

    # Run the pipeline under a DAAM trace and persist the resulting
    # experiment (heat_maps=False is passed to save()).
    with trace(pipe) as tc:
        out = pipe(caption, num_inference_steps=30, generator=gen)
        exp = tc.to_experiment(output_folder, id=str(image_id), seed=image_id)
        exp.save(output_folder, heat_maps=False)

# Parse and Analyze

In [None]:
from matplotlib import pyplot as plt
from daam import GenerationExperiment

def iou(a, b, t: float = 0.15) -> float:
    """Intersection-over-union of two maps after thresholding at ``t``.

    Args:
        a: First map; a tensor-like object supporting elementwise ``>`` and
            the boolean ``&`` / ``|`` operators (torch tensors in this notebook).
        b: Second map, same requirements as ``a``.
        t: Values strictly greater than ``t`` count as "active".

    Returns:
        ``|A & B| / |A | B|`` as a Python float, or ``0.0`` when neither map
        has any active element.
    """
    active_a = a > t
    active_b = b > t

    union = (active_a | active_b).float().sum()
    if union < 1e-6:
        # Nothing is active in either map; avoid dividing by zero.
        return 0.0

    intersection = (active_a & active_b).float().sum()
    return (intersection / union).item()

def ioa(a, b, t: float = 0.15) -> float:
    """Intersection over the area of ``a`` after thresholding both maps at ``t``.

    Unlike ``iou`` this is asymmetric: the overlap is normalised by the
    active area of the first argument only.

    Args:
        a: Map whose active area is the denominator; a tensor-like object
            supporting elementwise ``>`` and ``&`` (torch tensors in this notebook).
        b: Map intersected with ``a``.
        t: Values strictly greater than ``t`` count as "active".

    Returns:
        ``|A & B| / |A|`` as a Python float, or ``0.0`` when ``a`` has no
        active element.
    """
    active_a = a > t

    area = active_a.float().sum()
    if area < 1e-6:
        # ``a`` is empty above the threshold; avoid dividing by zero.
        return 0.0

    overlap = (active_a & (b > t)).float().sum()
    return (overlap / area).item()

# One record per dependency edge whose head and dependent both have a heat map.
stats = []

experiment_paths = list(Path('experiments/visuosyntax').iterdir())

for path in tqdm(experiment_paths):
    exp = GenerationExperiment.load(path)
    # Only the first sentence of the parsed prompt is analysed.
    sent = client.annotate(exp.prompt).sentence[0]
    heat_map = exp.heat_map()
    tokens = sent.token

    # Heat map per surface word; words for which DAAM raises ValueError are
    # left out and therefore skipped in the edge pass below.
    word_maps = {}
    for tok in tokens:
        try:
            word_maps[tok.word] = heat_map.compute_word_heat_map(tok.word).value.cuda()
        except ValueError:
            continue

    for edge in sent.enhancedDependencies.edge:
        # Edge endpoints are offset by one relative to the token list.
        head_word = tokens[edge.source - 1].word
        dep_word = tokens[edge.target - 1].word

        if head_word not in word_maps or dep_word not in word_maps:
            continue

        head_map = word_maps[head_word]
        dep_map = word_maps[dep_word]

        stats.append({
            'rel': edge.dep,
            'iou': iou(head_map, dep_map),
            'iod': ioa(dep_map, head_map),  # overlap normalised by the dependent's area
            'ioh': ioa(head_map, dep_map),  # overlap normalised by the head's area
        })

# Results

In [149]:
# Per-relation means of the three overlap scores, restricted to the ten most
# frequent relations, plus the absolute gap between the head- and
# dependent-normalised scores.
stats_df = pd.DataFrame(stats)
res_df = (
    stats_df
    .groupby('rel')
    .agg(
        count=('rel', len),
        mIoU=('iou', 'mean'),
        mIoD=('iod', 'mean'),
        mIoH=('ioh', 'mean'),
    )
    .sort_values('count', ascending=False)
    .iloc[:10]
    .assign(delta=lambda top: (top['mIoH'] - top['mIoD']).abs())
)

In [150]:
# Percentage view of the results table, ordered by the head/dependent gap.
# The view is derived into a new frame instead of mutating res_df, so this
# cell can be re-run safely: previously a second run raised KeyError on the
# already-dropped 'count' column and would have scaled the values by 100 again.
res_pct_df = res_df.drop(columns=['count']).transform(lambda x: x * 100)
res_pct_df.sort_values('delta')

Unnamed: 0_level_0,mIoU,mIoD,mIoH,delta
rel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
punct,0.099857,2.44841,0.103295,2.345114
nmod:of,8.657074,12.855358,21.987856,9.132498
compound,33.434113,59.130795,49.98517,9.145626
nsubj,5.027227,10.692133,22.710293,12.01816
case,3.831952,18.088006,5.895829,12.192177
det,0.447811,13.012975,0.657808,12.355168
conj:and,28.435928,55.501867,39.649883,15.851984
acl,6.452009,28.692415,11.101184,17.591231
obj,6.641952,10.566673,36.442496,25.875823
amod,14.690878,45.06272,19.05172,26.011
