In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import torch
import torch.nn.functional as F

In [2]:
# Run on the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
# Load the competition splits from the Kaggle input directory.
# `train` carries sampleID, sentence1, sentence2 and the target `score`;
# `test` is read the same way and is later used for sampleID/sentence1/sentence2.
train = pd.read_csv("/kaggle/input/similarity-oracle/train.csv")
test = pd.read_csv("/kaggle/input/similarity-oracle/test.csv")

In [4]:
# Peek at the first three training rows to check the column layout.
train.head(3)

Unnamed: 0,sampleID,sentence1,sentence2,score
0,0,A plane is taking off.,An air plane is taking off.,5.0
1,1,A man is playing a large flute.,A man is playing a flute.,3.8
2,2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.8


In [19]:
from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def last_token_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Pick the hidden state of the last real (non-padding) token of each row.

    Args:
        last_hidden_states: per-token hidden states, indexed as
            ``[row, position, ...]``.
        attention_mask: ``[row, position]`` mask whose entries sum to the
            number of real tokens in each row.

    Returns:
        One hidden-state vector per row.
    """
    num_rows = attention_mask.shape[0]

    # If every row has a real token in the final position, the batch is
    # left-padded and the final position is the answer for all rows.
    if attention_mask[:, -1].sum() == num_rows:
        return last_hidden_states[:, -1]

    # Otherwise locate each row's last real token from its mask sum
    # (number of real tokens minus one).
    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_index, last_positions]

# Two toy sentences used to smoke-test the embedding pipeline.
input_texts = [
    'This is the first sentence',
    'This is the second sentence',
]

# Left padding puts a real token in the final position of every row, which is
# the case last_token_pool handles by simply taking the final position.
tokenizer = AutoTokenizer.from_pretrained(
    'Qwen/Qwen3-Embedding-0.6B',
    padding_side='left',
)

# Load the encoder from the same checkpoint and move it to the chosen device.
model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')
model = model.to(device)

In [6]:
# Smoke test: embed the two toy sentences and inspect the normalized vectors.
max_length = 64

batch_dict = tokenizer(
    input_texts,
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="pt",
)
# Move the tokenized batch onto the same device as the model.
batch_dict.to(model.device)

# Inference only: without no_grad the result carries a grad_fn, and the
# autograd graph (with the tensors it saved) stays alive for as long as the
# globals `outputs` / `embeddings` exist.
with torch.no_grad():
    outputs = model(**batch_dict)
    embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

    # L2-normalize each row so every embedding has unit length.
    embeddings = F.normalize(embeddings, p=2, dim=1)

embeddings

tensor([[-0.0119, -0.0206, -0.0077,  ...,  0.0247,  0.0343, -0.0224],
        [-0.0165, -0.0414, -0.0063,  ..., -0.0131,  0.0120,  0.0037]],
       device='cuda:0', grad_fn=<DivBackward0>)

In [7]:
def get_embeddings(df, col, batch_size=8, max_length=64):
    """Embed every sentence of ``df[col]`` with the notebook's tokenizer/model.

    Sentences are tokenized in batches, passed through the global ``model``,
    pooled with ``last_token_pool`` and L2-normalized row by row.

    Args:
        df: DataFrame holding the sentences.
        col: name of the text column to embed.
        batch_size: number of sentences per forward pass.
        max_length: token limit handed to the tokenizer (longer inputs are
            truncated).

    Returns:
        A NumPy array with one normalized embedding per row of ``df``, in row
        order.

    Raises:
        ValueError: if ``df`` is empty (``np.concatenate`` receives no arrays).
    """
    sentences_all = df[col]
    chunks = []

    # Inference only: skip autograd bookkeeping so no graph or saved
    # activations are held in memory for each batch.
    with torch.no_grad():
        for start in tqdm(range(0, len(df), batch_size)):
            # .iloc makes the positional slicing explicit, independent of
            # whatever index `df` carries.
            sentences = sentences_all.iloc[start:start + batch_size].tolist()
            batch_dict = tokenizer(
                sentences,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            )
            batch_dict.to(model.device)
            outputs = model(**batch_dict)
            emb = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
            emb = F.normalize(emb, p=2, dim=1).detach().cpu().numpy()
            chunks.append(emb)

    return np.concatenate(chunks, axis=0)

In [8]:
# Embed both sentence columns of each split: train first, then test, and
# sentence1 before sentence2 within each split.
train_sentence1_embeddings, train_sentence2_embeddings = (
    get_embeddings(train, column) for column in ('sentence1', 'sentence2')
)

test_sentence1_embeddings, test_sentence2_embeddings = (
    get_embeddings(test, column) for column in ('sentence1', 'sentence2')
)

  0%|          | 0/719 [00:00<?, ?it/s]

  0%|          | 0/719 [00:00<?, ?it/s]

  0%|          | 0/173 [00:00<?, ?it/s]

  0%|          | 0/173 [00:00<?, ?it/s]

In [10]:
# Feature row for a pair = sentence1 embedding followed by sentence2 embedding
# (joined along axis 1).
train_embeddings = np.concatenate(
    (train_sentence1_embeddings, train_sentence2_embeddings),
    axis=1,
)
test_embeddings = np.concatenate(
    (test_sentence1_embeddings, test_sentence2_embeddings),
    axis=1,
)

In [21]:
# The embeddings are already NumPy arrays, so drop the tokenizer and the
# embedding model names before the regressor is trained.
del tokenizer, model

In [30]:
# Ask PyTorch to release cached, currently unused GPU memory now that the
# tokenizer/model names have been deleted above.
torch.cuda.empty_cache()

In [33]:
from sklearn.model_selection import train_test_split
from catboost import Pool

# Features are the concatenated pair embeddings; the target is the score column.
X = train_embeddings
y = train['score']
X_test = test_embeddings

# Hold out 10% of the training pairs for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Wrap both splits as CatBoost pools (features, labels).
train_pool, valid_pool = Pool(X_train, y_train), Pool(X_valid, y_valid)

In [39]:
from catboost import CatBoostRegressor

# CatBoost regressor configuration: optimise and report MAE, logging every
# 50 iterations.
params = {
    'iterations': 1000,
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'metric_period': 50,
    'learning_rate': 0.1,
    'random_state': 42,
    'max_depth': 6,
    # Train on GPU only when CUDA is actually available, so the notebook keeps
    # working on a CPU-only runtime just like the `device` selection does.
    'task_type': 'GPU' if torch.cuda.is_available() else 'CPU',
}

# NOTE: from here on `model` is the CatBoost regressor, not the embedding model.
model = CatBoostRegressor(**params)

# Fit on the training pool and track the metric on the held-out pool.
model.fit(train_pool, eval_set=valid_pool)

0:	learn: 1.2270951	test: 1.2352013	best: 1.2352013 (0)	total: 75.2ms	remaining: 1m 15s
50:	learn: 1.0891774	test: 1.1318440	best: 1.1318440 (50)	total: 3.62s	remaining: 1m 7s
100:	learn: 0.9633646	test: 1.0416457	best: 1.0416457 (100)	total: 7.15s	remaining: 1m 3s
150:	learn: 0.8466159	test: 0.9486419	best: 0.9486419 (150)	total: 10.6s	remaining: 59.8s
200:	learn: 0.7522589	test: 0.8862949	best: 0.8862949 (200)	total: 14.1s	remaining: 56.1s
250:	learn: 0.6680531	test: 0.8258209	best: 0.8258209 (250)	total: 17.7s	remaining: 52.7s
300:	learn: 0.5982564	test: 0.7806654	best: 0.7806654 (300)	total: 21.2s	remaining: 49.2s
350:	learn: 0.5407147	test: 0.7474864	best: 0.7474864 (350)	total: 24.7s	remaining: 45.7s
400:	learn: 0.4962377	test: 0.7231906	best: 0.7231906 (400)	total: 28.2s	remaining: 42.2s
450:	learn: 0.4583854	test: 0.7060123	best: 0.7060123 (450)	total: 31.8s	remaining: 38.7s
500:	learn: 0.4261434	test: 0.6904708	best: 0.6904708 (500)	total: 35.4s	remaining: 35.2s
550:	learn: 0.

<catboost.core.CatBoostRegressor at 0x7ea0442b14d0>

In [64]:
# Predicted similarity score for every test pair (answer for subtask 4).
y_pred = model.predict(test_embeddings)


def _length_bucket(avg_length):
    """Map an average character length to 'Short' / 'Medium' / 'Long'."""
    if avg_length < 50:
        return 'Short'
    if avg_length < 100:
        return 'Medium'
    return 'Long'


subm = []

for i in range(len(test)):
    # Positional access keeps each row aligned with y_pred[i] even if `test`
    # does not carry a default 0..n-1 index.
    s1 = test['sentence1'].iloc[i]
    s2 = test['sentence2'].iloc[i]
    sample_id = test['sampleID'].iloc[i]

    # Per-pair features are computed once per row instead of once per subtask.
    len1, len2 = len(s1), len(s2)
    answers = {
        1: _length_bucket((len1 + len2) / 2),   # average character length bucket
        2: len(s1.split()) + len(s2.split()),   # total whitespace-separated words
        3: abs(len1 - len2),                    # character length difference
        4: y_pred[i],                           # predicted similarity score
    }

    # One submission row per (subtask, datapoint), subtasks in order 1..4.
    for sid in range(1, 5):
        subm.append({
            'subtaskID': sid,
            'datapointID': sample_id,
            'answer': answers[sid]
        })

subm = pd.DataFrame(subm)

subm

Unnamed: 0,subtaskID,datapointID,answer
0,1,5749,Short
1,2,5749,12
2,3,5749,1
3,4,5749,2.838601
4,1,5750,Short
...,...,...,...
5511,4,7126,1.166062
5512,1,7127,Short
5513,2,7127,15
5514,3,7127,15


In [65]:
# Write the long-format submission; index=False keeps the DataFrame index out
# of the file.
subm.to_csv("submission.csv", index=False)