# **Track 2 - Continuous Representation**

In [None]:
## IMPORTS
from sklearn.preprocessing import normalize
import warnings
warnings.filterwarnings('ignore')
from gensim.models import FastText
from sklearn.neighbors import NearestNeighbors
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd
import numpy as np
from tqdm import tqdm

## **FastText**

This is the code that should be used to reproduce the results.

In [2]:
## LOAD THE DATA
# Both CSVs are expected to provide 'user_prompt' and 'model_response' columns
# (those are the only columns this cell reads).
dev_df = pd.read_csv('dev_responses.csv')
train_df = pd.read_csv('train_responses.csv')

## REMOVE INVALID RESPONSES
# Keep a training row only if its response is present AND contains at least
# one word character. The explicit notna() test is needed because the text
# check runs on a stringified column, and a missing value is not equal to ''
# after that conversion, so it would otherwise be kept as a candidate response.
train_df = train_df[
    train_df['model_response'].notna()
    & (train_df['model_response'].astype(str).str.strip().replace(r'^\W*$', '', regex=True) != '')
].reset_index(drop=True)

## TOKENIZE THE DATA
# Plain whitespace tokenization; str() keeps non-string cells from raising.
train_tokens = [str(p).split() for p in train_df['user_prompt']]
dev_tokens = [str(p).split() for p in dev_df['user_prompt']]


In [3]:
## BLEU SCORE CALCULATION
smoother = SmoothingFunction().method3
def evaluate_bleu(dev_df, retrieved_responses):
    """Return the mean sentence-level BLEU of retrieved responses against dev references.

    Parameters
    ----------
    dev_df : pandas.DataFrame
        Must contain a 'model_response' column holding the reference responses.
    retrieved_responses : sequence
        One hypothesis per row of ``dev_df``, in the same row order.

    Returns
    -------
    The ``np.mean`` of the per-row BLEU scores (NaN if ``dev_df`` is empty).

    Raises
    ------
    ValueError
        If the number of hypotheses differs from the number of dev rows.
    """
    references = dev_df['model_response']
    if len(retrieved_responses) != len(references):
        raise ValueError(
            f"Expected {len(references)} retrieved responses, got {len(retrieved_responses)}"
        )

    # Pair rows by position instead of using dev_df.loc[i, ...]: .loc is
    # label-based, so it only matches row i when the frame has a default
    # 0..n-1 index (e.g. not after filtering without reset_index).
    scores = [
        sentence_bleu(
            [str(ref).split()],
            str(hyp).split(),
            weights=(0.5, 0.5, 0, 0),  # only the first two n-gram orders are weighted
            smoothing_function=smoother,
        )
        for ref, hyp in zip(references, retrieved_responses)
    ]
    return np.mean(scores)

In [7]:
## FASTTEXT WITH OPTIMAL PARAMETERS
# Hyperparameters for the final model, trained on the tokenized training prompts.
# NOTE(review): no seed is passed and workers=4, so scores may vary slightly
# between runs - confirm if exact reproducibility is required.
fasttext_params = dict(
    vector_size=100,
    window=10,
    min_count=1,
    workers=4,
    epochs=100,
    sg=1,
)
model = FastText(sentences=train_tokens, **fasttext_params)

In [10]:
## CONVERT TOKENIZED TEXT INTO VECTOR EMBEDDINGS
def get_vec(tokens):
    '''
    Build a sentence embedding from a list of tokens.

    Every token that passes the `word in model.wv` check contributes its vector;
    the result is the element-wise mean of those vectors. When no token passes
    (for example an empty token list), a zero vector of length
    model.vector_size is returned instead.

    NOTE(review): with subword n-grams enabled, gensim's FastText membership
    test may accept every token - confirm against the gensim version in use.
    '''
    word_vectors = model.wv
    collected = []
    for token in tokens:
        if token in word_vectors:
            collected.append(word_vectors[token])
    if not collected:
        return np.zeros(model.vector_size)
    return np.mean(collected, axis=0)

# One row per prompt, stacked into a 2-D matrix.
train_vecs = np.vstack(list(map(get_vec, train_tokens)))
dev_vecs = np.vstack(list(map(get_vec, dev_tokens)))

## NORMALIZE
# Row-wise normalisation of both matrices before the neighbour search.
train_vecs, dev_vecs = normalize(train_vecs), normalize(dev_vecs)

In [11]:
## FIND MOST SIMILAR PROMPT (using Nearest Neighbors with cosine similarity)
# Brute-force search for the single closest training prompt of every dev prompt.
nn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=1)
nn.fit(train_vecs)
idxs = nn.kneighbors(dev_vecs, return_distance=False)

# Look up the response stored with each matched training row (positional take),
# then score the retrieved responses against the dev references.
retrieved_responses = train_df['model_response'].iloc[idxs.ravel()].tolist()
bleu = evaluate_bleu(dev_df, retrieved_responses)

print(f"Average BLEU: {bleu:.4f}")

Average BLEU: 0.0866


### **Hyperparameter Search**

In [None]:
# define hyperparameter grid
param_grid = [
    {'vector_size': v, 'window': w, 'epochs': e, 'sg': s}
    for v in [100, 300, 512]
    for w in [3, 5, 10]
    for e in [10, 30, 50]
    for s in [0, 1]
]


def score_fasttext_params(params):
    """Train one FastText model with `params` and return its dev BLEU.

    `params` is a dict with the keys 'vector_size', 'window', 'epochs' and 'sg'.
    The function reads the notebook-level `train_tokens`, `dev_tokens`,
    `train_df` and `dev_df`, and calls `evaluate_bleu`.

    Everything it creates (model, embeddings, neighbour index) stays local, so
    running the search does not overwrite the notebook-level `model`,
    `get_vec`, `train_vecs` or `dev_vecs` used by the cells that reproduce the
    reported result.
    """
    trial_model = FastText(
        sentences=train_tokens,
        vector_size=params['vector_size'],
        window=params['window'],
        min_count=1,
        workers=4,
        epochs=params['epochs'],
        sg=params['sg']
    )

    def embed(tokens):
        # Mean of the vectors of accepted tokens; zero vector when there are none.
        vecs = [trial_model.wv[word] for word in tokens if word in trial_model.wv]
        return np.mean(vecs, axis=0) if vecs else np.zeros(trial_model.vector_size)

    trial_train_vecs = normalize(np.vstack([embed(t) for t in train_tokens]))
    trial_dev_vecs = normalize(np.vstack([embed(t) for t in dev_tokens]))

    index = NearestNeighbors(n_neighbors=1, algorithm='brute', metric='cosine')
    index.fit(trial_train_vecs)
    _, nearest = index.kneighbors(trial_dev_vecs)

    # Response stored with the closest training prompt of each dev prompt.
    responses = train_df['model_response'].iloc[nearest.flatten()].tolist()
    return evaluate_bleu(dev_df, responses)


best_bleu = -1
best_params = None

# Matches the first line of the recorded cell output.
print(f"Trying {len(param_grid)} combinations...")

for params in tqdm(param_grid, desc="Tuning"):
    bleu = score_fasttext_params(params)

    print(f"Params: {params}, BLEU: {bleu:.4f}")
    if bleu > best_bleu:
        best_bleu = bleu
        best_params = params

print(f"\nBest BLEU: {best_bleu:.4f} with params: {best_params}")


Trying 54 combinations...



Tuning:   2%|▏         | 1/54 [00:12<11:04, 12.55s/it]

Params: {'vector_size': 100, 'window': 3, 'epochs': 10, 'sg': 0}, BLEU: 0.0577


Tuning:   4%|▎         | 2/54 [00:25<11:01, 12.73s/it]

Params: {'vector_size': 100, 'window': 3, 'epochs': 10, 'sg': 1}, BLEU: 0.0696


Tuning:   6%|▌         | 3/54 [00:41<12:05, 14.23s/it]

Params: {'vector_size': 100, 'window': 3, 'epochs': 30, 'sg': 0}, BLEU: 0.0660


Tuning:   7%|▋         | 4/54 [00:59<13:13, 15.87s/it]

Params: {'vector_size': 100, 'window': 3, 'epochs': 30, 'sg': 1}, BLEU: 0.0783


Tuning:   9%|▉         | 5/54 [01:19<14:08, 17.32s/it]

Params: {'vector_size': 100, 'window': 3, 'epochs': 50, 'sg': 0}, BLEU: 0.0705


Tuning:  11%|█         | 6/54 [01:45<16:05, 20.12s/it]

Params: {'vector_size': 100, 'window': 3, 'epochs': 50, 'sg': 1}, BLEU: 0.0822


Tuning:  13%|█▎        | 7/54 [01:57<13:44, 17.55s/it]

Params: {'vector_size': 100, 'window': 5, 'epochs': 10, 'sg': 0}, BLEU: 0.0573


Tuning:  15%|█▍        | 8/54 [02:10<12:27, 16.25s/it]

Params: {'vector_size': 100, 'window': 5, 'epochs': 10, 'sg': 1}, BLEU: 0.0724


Tuning:  17%|█▋        | 9/54 [02:27<12:21, 16.49s/it]

Params: {'vector_size': 100, 'window': 5, 'epochs': 30, 'sg': 0}, BLEU: 0.0669


Tuning:  19%|█▊        | 10/54 [02:48<12:58, 17.69s/it]

Params: {'vector_size': 100, 'window': 5, 'epochs': 30, 'sg': 1}, BLEU: 0.0809


Tuning:  20%|██        | 11/54 [03:09<13:28, 18.80s/it]

Params: {'vector_size': 100, 'window': 5, 'epochs': 50, 'sg': 0}, BLEU: 0.0701


Tuning:  22%|██▏       | 12/54 [03:36<14:56, 21.34s/it]

Params: {'vector_size': 100, 'window': 5, 'epochs': 50, 'sg': 1}, BLEU: 0.0848


Tuning:  24%|██▍       | 13/54 [03:49<12:48, 18.74s/it]

Params: {'vector_size': 100, 'window': 10, 'epochs': 10, 'sg': 0}, BLEU: 0.0564


Tuning:  26%|██▌       | 14/54 [04:04<11:39, 17.48s/it]

Params: {'vector_size': 100, 'window': 10, 'epochs': 10, 'sg': 1}, BLEU: 0.0739


Tuning:  28%|██▊       | 15/54 [04:22<11:29, 17.68s/it]

Params: {'vector_size': 100, 'window': 10, 'epochs': 30, 'sg': 0}, BLEU: 0.0673


Tuning:  30%|██▉       | 16/54 [04:45<12:18, 19.44s/it]

Params: {'vector_size': 100, 'window': 10, 'epochs': 30, 'sg': 1}, BLEU: 0.0832


Tuning:  31%|███▏      | 17/54 [05:09<12:49, 20.81s/it]

Params: {'vector_size': 100, 'window': 10, 'epochs': 50, 'sg': 0}, BLEU: 0.0710


Tuning:  33%|███▎      | 18/54 [05:42<14:42, 24.52s/it]

Params: {'vector_size': 100, 'window': 10, 'epochs': 50, 'sg': 1}, BLEU: 0.0863


Tuning:  35%|███▌      | 19/54 [05:59<12:53, 22.11s/it]

Params: {'vector_size': 300, 'window': 3, 'epochs': 10, 'sg': 0}, BLEU: 0.0571


Tuning:  37%|███▋      | 20/54 [06:17<11:50, 20.90s/it]

Params: {'vector_size': 300, 'window': 3, 'epochs': 10, 'sg': 1}, BLEU: 0.0688


Tuning:  39%|███▉      | 21/54 [06:41<12:03, 21.91s/it]

Params: {'vector_size': 300, 'window': 3, 'epochs': 30, 'sg': 0}, BLEU: 0.0659


Tuning:  41%|████      | 22/54 [07:11<12:58, 24.32s/it]

Params: {'vector_size': 300, 'window': 3, 'epochs': 30, 'sg': 1}, BLEU: 0.0781


Tuning:  43%|████▎     | 23/54 [07:43<13:44, 26.59s/it]

Params: {'vector_size': 300, 'window': 3, 'epochs': 50, 'sg': 0}, BLEU: 0.0695


Tuning:  44%|████▍     | 24/54 [08:23<15:12, 30.43s/it]

Params: {'vector_size': 300, 'window': 3, 'epochs': 50, 'sg': 1}, BLEU: 0.0815


Tuning:  46%|████▋     | 25/54 [08:39<12:44, 26.36s/it]

Params: {'vector_size': 300, 'window': 5, 'epochs': 10, 'sg': 0}, BLEU: 0.0552


Tuning:  48%|████▊     | 26/54 [08:59<11:22, 24.36s/it]

Params: {'vector_size': 300, 'window': 5, 'epochs': 10, 'sg': 1}, BLEU: 0.0729


Tuning:  50%|█████     | 27/54 [09:26<11:15, 25.01s/it]

Params: {'vector_size': 300, 'window': 5, 'epochs': 30, 'sg': 0}, BLEU: 0.0663


Tuning:  52%|█████▏    | 28/54 [09:58<11:51, 27.35s/it]

Params: {'vector_size': 300, 'window': 5, 'epochs': 30, 'sg': 1}, BLEU: 0.0810


Tuning:  54%|█████▎    | 29/54 [10:34<12:23, 29.73s/it]

Params: {'vector_size': 300, 'window': 5, 'epochs': 50, 'sg': 0}, BLEU: 0.0698


Tuning:  56%|█████▌    | 30/54 [11:20<13:50, 34.62s/it]

Params: {'vector_size': 300, 'window': 5, 'epochs': 50, 'sg': 1}, BLEU: 0.0838


Tuning:  57%|█████▋    | 31/54 [11:38<11:21, 29.62s/it]

Params: {'vector_size': 300, 'window': 10, 'epochs': 10, 'sg': 0}, BLEU: 0.0561


Tuning:  59%|█████▉    | 32/54 [11:59<09:56, 27.11s/it]

Params: {'vector_size': 300, 'window': 10, 'epochs': 10, 'sg': 1}, BLEU: 0.0741


Tuning:  61%|██████    | 33/54 [12:28<09:44, 27.83s/it]

Params: {'vector_size': 300, 'window': 10, 'epochs': 30, 'sg': 0}, BLEU: 0.0669


Tuning:  63%|██████▎   | 34/54 [13:08<10:24, 31.22s/it]

Params: {'vector_size': 300, 'window': 10, 'epochs': 30, 'sg': 1}, BLEU: 0.0831


Tuning:  65%|██████▍   | 35/54 [13:49<10:50, 34.25s/it]

Params: {'vector_size': 300, 'window': 10, 'epochs': 50, 'sg': 0}, BLEU: 0.0706


Tuning:  67%|██████▋   | 36/54 [14:46<12:21, 41.20s/it]

Params: {'vector_size': 300, 'window': 10, 'epochs': 50, 'sg': 1}, BLEU: 0.0855


Tuning:  69%|██████▊   | 37/54 [15:11<10:13, 36.11s/it]

Params: {'vector_size': 512, 'window': 3, 'epochs': 10, 'sg': 0}, BLEU: 0.0562


Tuning:  70%|███████   | 38/54 [15:37<08:50, 33.18s/it]

Params: {'vector_size': 512, 'window': 3, 'epochs': 10, 'sg': 1}, BLEU: 0.0688


Tuning:  72%|███████▏  | 39/54 [16:11<08:21, 33.41s/it]

Params: {'vector_size': 512, 'window': 3, 'epochs': 30, 'sg': 0}, BLEU: 0.0655


Tuning:  74%|███████▍  | 40/54 [16:52<08:19, 35.69s/it]

Params: {'vector_size': 512, 'window': 3, 'epochs': 30, 'sg': 1}, BLEU: 0.0785


Tuning:  76%|███████▌  | 41/54 [17:37<08:20, 38.52s/it]

Params: {'vector_size': 512, 'window': 3, 'epochs': 50, 'sg': 0}, BLEU: 0.0699


Tuning:  78%|███████▊  | 42/54 [18:32<08:40, 43.40s/it]

Params: {'vector_size': 512, 'window': 3, 'epochs': 50, 'sg': 1}, BLEU: 0.0821


Tuning:  80%|███████▉  | 43/54 [18:53<06:44, 36.82s/it]

Params: {'vector_size': 512, 'window': 5, 'epochs': 10, 'sg': 0}, BLEU: 0.0552


Tuning:  81%|████████▏ | 44/54 [19:17<05:30, 33.02s/it]

Params: {'vector_size': 512, 'window': 5, 'epochs': 10, 'sg': 1}, BLEU: 0.0721


Tuning:  83%|████████▎ | 45/54 [19:52<05:00, 33.43s/it]

Params: {'vector_size': 512, 'window': 5, 'epochs': 30, 'sg': 0}, BLEU: 0.0661


Tuning:  85%|████████▌ | 46/54 [20:35<04:50, 36.29s/it]

Params: {'vector_size': 512, 'window': 5, 'epochs': 30, 'sg': 1}, BLEU: 0.0802


Tuning:  87%|████████▋ | 47/54 [21:22<04:37, 39.68s/it]

Params: {'vector_size': 512, 'window': 5, 'epochs': 50, 'sg': 0}, BLEU: 0.0703


Tuning:  89%|████████▉ | 48/54 [22:24<04:38, 46.34s/it]

Params: {'vector_size': 512, 'window': 5, 'epochs': 50, 'sg': 1}, BLEU: 0.0845


Tuning:  91%|█████████ | 49/54 [22:47<03:16, 39.34s/it]

Params: {'vector_size': 512, 'window': 10, 'epochs': 10, 'sg': 0}, BLEU: 0.0542


Tuning:  93%|█████████▎| 50/54 [23:14<02:22, 35.70s/it]

Params: {'vector_size': 512, 'window': 10, 'epochs': 10, 'sg': 1}, BLEU: 0.0741


Tuning:  94%|█████████▍| 51/54 [23:54<01:50, 36.87s/it]

Params: {'vector_size': 512, 'window': 10, 'epochs': 30, 'sg': 0}, BLEU: 0.0667


Tuning:  96%|█████████▋| 52/54 [24:47<01:23, 41.60s/it]

Params: {'vector_size': 512, 'window': 10, 'epochs': 30, 'sg': 1}, BLEU: 0.0827


Tuning:  98%|█████████▊| 53/54 [25:43<00:46, 46.01s/it]

Params: {'vector_size': 512, 'window': 10, 'epochs': 50, 'sg': 0}, BLEU: 0.0706


Tuning: 100%|██████████| 54/54 [27:02<00:00, 30.05s/it]

Params: {'vector_size': 512, 'window': 10, 'epochs': 50, 'sg': 1}, BLEU: 0.0855

Best BLEU: 0.0863 with params: {'vector_size': 100, 'window': 10, 'epochs': 50, 'sg': 1}





Note that the FastText section above trains for 100 epochs instead of the 50 selected by this grid search, because the longer training improved the dev BLEU further (0.0866 vs. 0.0863).

### **Create Submission CSV**

In [None]:
def generate_track2_submission(train_df, dev_df, test_df, output_file='track_2_test.csv'):
    """Write the Track 2 submission: one retrieved conversation id per test prompt.

    The train and dev frames are concatenated into a single retrieval pool, a
    FastText model is trained on the pool's prompts, and every test prompt is
    matched to its nearest pool prompt by cosine distance between mean-pooled
    embeddings.

    Parameters
    ----------
    train_df, dev_df : pandas.DataFrame
        Must contain 'conversation_id', 'user_prompt' and 'model_response'.
    test_df : pandas.DataFrame
        Must contain 'conversation_id' and 'user_prompt'.
    output_file : str
        Path of the CSV to write, with columns 'conversation_id' (from the
        test set) and 'response_id' (conversation id of the matched pool row).

    Returns
    -------
    None. The result is written to ``output_file`` and a confirmation is printed.
    """
    combined_df = pd.concat([train_df, dev_df], ignore_index=True)

    # Keep only rows whose response is present and contains a word character.
    # The notna() test is required because the text check runs on a stringified
    # column, where a missing value does not compare equal to '' and would
    # otherwise remain a retrievable candidate.
    responses = combined_df['model_response']
    has_text = responses.astype(str).str.strip().replace(r'^\W*$', '', regex=True) != ''
    combined_df = combined_df[responses.notna() & has_text].reset_index(drop=True)

    combined_tokens = [str(p).split() for p in combined_df['user_prompt']]
    test_tokens = [str(p).split() for p in test_df['user_prompt']]

    model = FastText(
        sentences=combined_tokens,
        vector_size=100,
        window=10,
        min_count=1,
        workers=4,
        epochs=100,
        sg=1
    )

    def get_vec(tokens):
        # Mean of the vectors of accepted tokens; zero vector when there are none.
        vecs = [model.wv[word] for word in tokens if word in model.wv]
        return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

    combined_vecs = np.vstack([get_vec(t) for t in combined_tokens])
    test_vecs = np.vstack([get_vec(t) for t in test_tokens])

    combined_vecs = normalize(combined_vecs)
    test_vecs = normalize(test_vecs)

    nn = NearestNeighbors(n_neighbors=1, algorithm='brute', metric='cosine')
    nn.fit(combined_vecs)
    _, idxs = nn.kneighbors(test_vecs)

    # Single positional take instead of one .iloc row lookup per test prompt.
    matched_ids = combined_df['conversation_id'].iloc[idxs.flatten()].tolist()
    result_df = pd.DataFrame({
        'conversation_id': test_df['conversation_id'],
        'response_id': matched_ids
    })

    result_df.to_csv(output_file, index=False)
    print(f"Saved Track 2 submission to: {output_file}")

In [13]:
# Load the test prompts and write the submission CSV. The output path is left
# at the function's default ('track_2_test.csv').
test_df = pd.read_csv('test_prompts.csv')
generate_track2_submission(train_df=train_df, dev_df=dev_df, test_df=test_df)

Saved Track 2 submission to: track_2_test.csv
