# **Track 3 - Bonus**

In [None]:
## IMPORTS
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

In [None]:
## LOAD THE DATA
dev_df = pd.read_csv('dev_responses.csv')
train_df = pd.read_csv('train_responses.csv')

## REMOVE INVALID RESPONSES
# A response is invalid when it is missing, empty, whitespace-only or made of
# non-word characters only. Missing values are mapped to '' *before* the cast
# to str: otherwise they would be kept (stringified to 'nan', or left as a
# missing value that never compares equal to '') and survive the filter.
train_response_text = (
    train_df['model_response']
    .fillna('')
    .astype(str)
    .str.strip()
    .replace(r'^\W*$', '', regex=True)
)
train_df = train_df[train_response_text != ''].reset_index(drop=True)

## **Sentence Transformer**

In [14]:
# all-mpnet-base-v2 has proved to be the best model for this task
model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')

In [15]:
def get_vec(text):
    """Return the embedding that the notebook-level ``model`` produces for ``text``.

    ``text`` is forwarded unchanged to ``model.encode`` and the result of
    that call is returned as-is.
    """
    embedding = model.encode(text)
    return embedding

In [None]:
# Encode every prompt of a split in one batched call rather than one model
# call per prompt: this is much faster and matches how the submission
# function encodes its inputs. `model.encode` accepts a list of sentences and
# returns one embedding row per sentence.
# NOTE(review): batched inference may differ from per-prompt inference at
# floating-point rounding level — confirm the reported BLEU is unchanged.
train_vecs = np.asarray(get_vec(train_df['user_prompt'].tolist()))
dev_vecs = np.asarray(get_vec(dev_df['user_prompt'].tolist()))

# L2-normalise each embedding row.
train_vecs = normalize(train_vecs)
dev_vecs = normalize(dev_vecs)

In [None]:
## COMPUTE BLEU SCORE
smoother = SmoothingFunction().method3
def evaluate_bleu(dev_df, retrieved_responses):
    """Return the mean sentence-level BLEU of the retrieved responses.

    Each row of ``dev_df`` supplies the reference (``model_response``) and
    the item at the same position in ``retrieved_responses`` is the
    hypothesis. Both are cast to ``str`` and tokenised on whitespace. BLEU
    uses unigram and bigram precision only (weights 0.5 / 0.5) with the
    module-level ``smoother`` smoothing function.

    Parameters
    ----------
    dev_df : pandas.DataFrame
        Frame with a ``model_response`` column; its index may be anything,
        rows are paired with ``retrieved_responses`` by position.
    retrieved_responses : sequence
        One retrieved response per row of ``dev_df``, indexable by position.

    Returns
    -------
    The ``np.mean`` of the per-row BLEU scores.
    """
    # Read references positionally: `dev_df.loc[i, ...]` is label-based and
    # fails (or pairs the wrong rows) when the index is not 0..n-1, e.g.
    # after filtering rows without resetting the index.
    references = dev_df['model_response'].tolist()
    scores = []
    for i, reference in enumerate(references):
        ref = str(reference).split()
        hyp = str(retrieved_responses[i]).split()
        score = sentence_bleu([ref], hyp, weights=(0.5, 0.5, 0, 0), smoothing_function=smoother)
        scores.append(score)
    return np.mean(scores)

In [None]:
## NEAREST NEIGHBORS
# Brute-force 1-nearest-neighbour search with cosine distance, fitted on the
# training embeddings and queried with the dev embeddings.
nn = NearestNeighbors(
    n_neighbors=1,
    algorithm='brute',
    metric='cosine',
).fit(train_vecs)
_, idxs = nn.kneighbors(dev_vecs)

In [None]:
# For every dev prompt, take the response stored at the position of its
# nearest training prompt.
retrieved_responses = train_df['model_response'].iloc[idxs.ravel()].tolist()

bleu = evaluate_bleu(dev_df, retrieved_responses)
print(f"BLEU: {bleu:.4f}")

BLEU: 0.1080


I also tried other models, which gave the following results:
- all-MiniLM-L6-v2 gave BLEU: 0.1024
- all-MiniLM-L12-v2 gave BLEU: 0.1048
- paraphrase-MiniLM-L6-v2 gave BLEU: 0.0967
- paraphrase-mpnet-base-v2 gave BLEU: 0.1025


### **Create Submission CSV**

In [11]:
def generate_track3_submission(train_df, dev_df, test_df, output_file='track_3_test.csv'):
    """Write the Track 3 submission CSV.

    Train and dev rows are pooled, rows without a usable response are
    dropped, and every test prompt is matched to its nearest pooled prompt
    (cosine distance between 'all-mpnet-base-v2' embeddings).

    Parameters
    ----------
    train_df, dev_df : pandas.DataFrame
        Frames with ``conversation_id``, ``user_prompt`` and
        ``model_response`` columns.
    test_df : pandas.DataFrame
        Frame with ``conversation_id`` and ``user_prompt`` columns.
    output_file : str
        Path of the CSV to write. It has two columns: ``conversation_id``
        (from ``test_df``) and ``response_id`` (the ``conversation_id`` of
        the matched pooled row).

    Returns
    -------
    None. The result is written to ``output_file`` and a confirmation line
    is printed.
    """
    combined_df = pd.concat([train_df, dev_df], ignore_index=True)

    # Drop rows whose response is missing, empty, whitespace-only or made of
    # non-word characters only. Missing values become '' before the cast to
    # str; otherwise they would slip through the filter (as 'nan').
    response_text = (
        combined_df['model_response']
        .fillna('')
        .astype(str)
        .str.strip()
        .replace(r'^\W*$', '', regex=True)
    )
    combined_df = combined_df[response_text != ''].reset_index(drop=True)

    encoder = SentenceTransformer('all-mpnet-base-v2')

    combined_vecs = encoder.encode(combined_df['user_prompt'].tolist(), normalize_embeddings=True)
    test_vecs = encoder.encode(test_df['user_prompt'].tolist(), normalize_embeddings=True)

    nn = NearestNeighbors(n_neighbors=1, algorithm='brute', metric='cosine')
    nn.fit(combined_vecs)
    _, idxs = nn.kneighbors(test_vecs)

    # Positional lookup of all matched ids at once (idxs has one column
    # because n_neighbors=1).
    matched_ids = combined_df['conversation_id'].to_numpy()[idxs.ravel()]
    result_df = pd.DataFrame({
        'conversation_id': test_df['conversation_id'].to_numpy(),
        'response_id': matched_ids
    })

    result_df.to_csv(output_file, index=False)
    # The message previously said "Track 2" although this writes Track 3.
    print(f"Saved Track 3 submission to: {output_file}")


In [12]:
# Load the test prompts, then build and save the submission file.
test_df = pd.read_csv('test_prompts.csv')
generate_track3_submission(
    train_df=train_df,
    dev_df=dev_df,
    test_df=test_df,
)

Saved Track 2 submission to: track_3_test.csv
