In [None]:
import os

import pandas as pd
import numpy as np

from sentence_transformers import InputExample, SentenceTransformer, datasets, losses, models

In [None]:
# Keep Weights & Biases out of this run by setting WANDB_MODE to "disabled"
# in the process environment before any training starts.
os.environ.update({"WANDB_MODE": "disabled"})

In [None]:
# Read the query/abstract pairs that the T5 model generated for our abstracts.
# The file is parsed with header=None (its first line is data), so the two
# positional columns 0 and 1 are given readable names afterwards.
df = pd.read_csv(
    "ml_research_assistant/data/generated_queries_for_abstracts.tsv",
    sep="\t",
    header=None,
)
df = df.rename(columns={0: "query", 1: "abstract"})
df

Unnamed: 0,query,abstract
0,what is sams vae,Generative models of observations under interv...
1,what is sparse additive mechanism,Generative models of observations under interv...
2,what is sparse additive mechanism shift variab...,Generative models of observations under interv...
3,what is cec in deepmind,"We present a new algorithm, Cross-Episodic Cur..."
4,what is cec in deepmind,"We present a new algorithm, Cross-Episodic Cur..."
...,...,...
76378,when do minkowski metric back propagation need...,Many connectionist learning models are impleme...
76379,what is minkowski propagation,Many connectionist learning models are impleme...
76380,what is the delta rule in neural networks,We investigate the behavior of different learn...
76381,how do neural networks learn,We investigate the behavior of different learn...


In [None]:
# Spot-check the pair at index label 3000: the query, a gap of blank lines,
# then the abstract. Joining with sep='\n' emits exactly the same characters
# as three separate print calls would.
print(df.loc[3000, 'query'], '\n', df.loc[3000, 'abstract'], sep='\n')

how is the llm score correlated with the user evaluation?


Existing automatic evaluation on text-to-image synthesis can only provide an image-text matching score, without considering the object-level compositionality, which results in poor correlation with human judgments. In this work, we propose LLMScore, a new framework that offers evaluation scores with multi-granularity compositionality. LLMScore leverages the large language models (LLMs) to evaluate text-to-image models. Initially, it transforms the image into image-level and object-level visual descriptions. Then an evaluation instruction is fed into the LLMs to measure the alignment between the synthesized image and the text, ultimately generating a score accompanied by a rationale. Our substantial analysis reveals the highest correlation of LLMScore with human judgments on a wide range of datasets (Attribute Binding Contrast, Concept Conjunction, MSCOCO, DrawBench, PaintSkills). Notably, our LLMScore achieves Kendall's tau co

In [None]:
# Build the (query, abstract) training pairs from the raw TSV, one pair per line.
train_examples = []
skipped_lines = 0  # lines that did not contain a usable "query<TAB>abstract" pair

# utf-8 is stated explicitly so parsing does not depend on the platform's default
# encoding (pd.read_csv, used on this same file, also decodes as utf-8 by default).
with open("ml_research_assistant/data/generated_queries_for_abstracts.tsv", encoding="utf-8") as fIn:
    for line in fIn:
        try:
            # maxsplit=1 keeps any additional tabs inside the abstract text.
            query, paragraph = line.strip().split("\t", maxsplit=1)
        except ValueError:
            # No tab remains after stripping (blank line, or one side of the pair
            # is empty). Skip the line as before, but count it instead of hiding it.
            # Only ValueError is caught, so Ctrl-C and real bugs still surface.
            skipped_lines += 1
            continue
        train_examples.append(InputExample(texts=[query, paragraph]))

if skipped_lines:
    print(f"Skipped {skipped_lines} line(s) without a tab-separated query/abstract pair")

In [None]:
# Peek at the first training pair to confirm the list holds InputExample objects.
train_examples[0]

<sentence_transformers.readers.InputExample.InputExample at 0x7dd3cf659990>

In [None]:
# Number of training pairs that were successfully parsed from the TSV.
len(train_examples)

60858

In [None]:
# Training pairs divided by three — presumably three generated queries per
# abstract, which would make this the approximate abstract count (TODO confirm).
# Computed from the live list rather than a hand-copied count so it cannot go stale.
len(train_examples) / 3

20286.0

In [None]:
# MultipleNegativesRankingLoss requires batches without duplicate entries,
# so the pairs are served through NoDuplicatesDataLoader in batches of 16.
train_dataloader = datasets.NoDuplicatesDataLoader(
    train_examples,
    batch_size=16,
)

In [None]:
# Length of the data loader, i.e. how many batches it reports.
len(train_dataloader)

3803

In [None]:
# Examples covered by the reported batches. This can be slightly below
# len(train_examples) when the loader does not count a trailing partial batch.
# Derived from the loader itself instead of hand-copied numbers.
len(train_dataloader) * train_dataloader.batch_size

60848

In [None]:
# Assemble the encoder that will be fine-tuned on the generated queries:
# a DistilBERT transformer module followed by a pooling module sized to the
# transformer's word-embedding dimension.
word_emb = models.Transformer("distilbert-base-uncased")
embedding_dim = word_emb.get_word_embedding_dimension()
pooling = models.Pooling(embedding_dim)
model = SentenceTransformer(modules=[word_emb, pooling])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Check which device the model was placed on before starting training.
model.device

device(type='cuda', index=0)

In [None]:
# The loss expects (query, relevant_passage) pairs, which is the order the
# texts were given when the training examples were built.
train_loss = losses.MultipleNegativesRankingLoss(model=model)

In [None]:
# Fine-tune the model for one epoch; warmup_steps is set to 10% of the total
# step count (batches per epoch times number of epochs), truncated to an int.
num_epochs = 1
total_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_steps * 0.1)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    show_progress_bar=True,
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss
500,0.7309
1000,0.2576
1500,0.2224
2000,0.1833
2500,0.1739
3000,0.1674
3500,0.161


In [None]:
# Make sure the output directory exists; exist_ok=True keeps this cell safe to re-run.
os.makedirs("ml_research_assistant/models", exist_ok=True)

In [None]:
# Save the fine-tuned encoder under the models directory.
model.save("ml_research_assistant/models/finetuned_encoder")