In [16]:
import pandas as pd
from Functions.PreprocessingFunctions import lemmatization_and_stemming, TFIDF_discretizer, remove_punctuation, remove_stopwords, tokenize, apply_BOW, apply_cosine_similarity, apply_TFIDF, apply_fasttext_model_multiple
from Functions.TestingFunctions import cosine_similarity_score, BOW_exhaustive_search, TFIDF_exhaustive_search, get_track1_predictions, get_track2_predictions, get_track3_predictions
from nltk.corpus import stopwords
import numpy as np
import re
from sentence_transformers import SentenceTransformer
import tqdm
from Functions.PreprocessingFunctions import apply_word2vec_model_multiple
from gensim.models import KeyedVectors

# Data Description

In [4]:
train_df = pd.read_csv('data/train_responses.csv')
dev_df = pd.read_csv('data/dev_responses.csv')
test_df = pd.read_csv('data/test_prompts.csv')

In [5]:
dev_nan_indices = list(dev_df[dev_df.isna().any(axis=1)].index)
train_nan_indices = list(train_df[train_df.isna().any(axis=1)].index)

print("Indices of dev rows with NaN values:")
print(dev_nan_indices)

print("Indices of train rows with NaN values:")
print(train_nan_indices)


Indices of dev rows with NaN values:
[945, 4279]
Indices of train rows with NaN values:
[]


In [6]:
train_df = train_df.dropna().reset_index(drop = True)
dev_df = dev_df.dropna().reset_index(drop = True)

train_and_dev_df = pd.concat([train_df, dev_df], axis=0)
train_and_dev_df.reset_index(drop=True, inplace=True)

In [228]:
train_df.head()

Unnamed: 0,conversation_id,user_prompt,model_response
0,a5411f5a5d6d4cf39b688424b941ff91,Whats the derivative of cos(2x)?,The derivative of cos(2x) is -sin(2x). This is...
1,0f84903b99184bf0a620ac7acfb721a0,can you suggest synonyms for the word diffusion?,"Sure, here are some synonyms for the word ""dif..."
2,4c864c0ac5f74f048b0533d39d622d2d,do you know what Pentax camera is good for?,Pentax is a well-known brand of cameras and le...
3,ad091c8d81254b86b12d051d082333fa,what is FastChat-T5? what's different between ...,FastChat-T5 is a research project developed by...
4,c83dfb6138f24faa961ce0de988a4f25,do you know difference between uploading exten...,"Yes, there are some differences between upload..."


In [229]:
dev_df.head()

Unnamed: 0,conversation_id,user_prompt,model_response
0,c0617586e4da4f96be269b9042de02cd,let back(arg) = gra. what is back(the)?,"I'm sorry, I'm not sure what you mean by ""back..."
1,a60aa04b99834b1d9dcc107526cf499d,puedes resumir un documento pdf?,"Sí, puedo resumir el contenido de un documento..."
2,634847d24efe474fa649197c605ed089,Who s the best team in League of Legends ?,It's difficult to say who the best team in Lea...
3,98d39fd5bd2a4b8aa11f434ca64b02de,hello. do you think the nasdaq index will be u...,"Hello! As an AI language model, I do not have ..."
4,e5f979964a1245e8bc742b693d9bc59b,russia-ukraine crisis: why russia attcks in 2022?,The Russia-Ukraine crisis is a complex and ong...


In [230]:
test_df.head()

Unnamed: 0,conversation_id,user_prompt
0,0cf125095fa74e129f9b7b6054d2993e,"Regarding Stable Diffusion models, should I us..."
1,e6296e2a7a554a3db3152704d065498e,I am thinking of a childrens cartoon where the...
2,ee22ccf57c064f5f955f1fd2f9ed5e90,what's difference between ioremap and devm_ior...
3,f5ef6be6d11746e39ec404496c307ab8,Who is a Multiplan direct competitor in US?
4,1fcea667861046d1834b17e7851dcca4,Can you give me ideas for a dnd oneshot?


As we can notice, training and dev matrices are composed of 3 columns:
- The first column contains a unique **conversation_id** written in hexadecimal form.
- The second column contains the questions that users asked in the past (**user_prompt**)
- The third column contains the desired response of the model
<br>
The objective of this project is to create a chatbot that, when prompted with a question from the dev dataframe, finds the question in the training set of highest semantic similarity, and returns its response. The quality of the response will be evaluated using the BLEU score (implemented in the **TestingFunctions.py** file)

# Data Preprocessing

As a first preprocessing step, I put all words in lowercase and tokenize them in such a way that also parentheses, equal signs and other symbols are separated from the words, in order to increase the quality of the lemmatization and stemming performed later.

In [7]:
tokenized_train = tokenize(train_df['user_prompt'].values)
tokenized_dev = tokenize(dev_df['user_prompt'].values)
tokenized_test = tokenize(test_df['user_prompt'].values)
tokenized_train_and_dev = tokenize(train_and_dev_df['user_prompt'].values)

In [4]:
print(tokenized_train[0])
print(tokenized_train[1])
print(tokenized_train[2])
print(tokenized_train[3])
print(tokenized_train[4])

['whats', 'the', 'derivative', 'of', 'cos', '(', '2x', ')', '?']
['can', 'you', 'suggest', 'synonyms', 'for', 'the', 'word', 'diffusion', '?']
['do', 'you', 'know', 'what', 'pentax', 'camera', 'is', 'good', 'for', '?']
['what', 'is', 'fastchat', '-', 't5', '?', 'what', "'", 's', 'different', 'between', 'fastchat', '-', 't5', 'and', 'vicuna', '-', '13b', '?']
['do', 'you', 'know', 'difference', 'between', 'uploading', 'extension', 'to', 'chrome', 'store', '&', 'edge', 'store', '?']


In [5]:
print(tokenized_dev[0])
print(tokenized_dev[1])
print(tokenized_dev[2])
print(tokenized_dev[3])
print(tokenized_dev[4])

['let', 'back', '(', 'arg', ')', '=', 'gra', '.', 'what', 'is', 'back', '(', 'the', ')', '?']
['puedes', 'resumir', 'un', 'documento', 'pdf', '?']
['who', 's', 'the', 'best', 'team', 'in', 'league', 'of', 'legends', '?']
['hello', '.', 'do', 'you', 'think', 'the', 'nasdaq', 'index', 'will', 'be', 'up', 'or', 'down', 'today', '?']
['russia', '-', 'ukraine', 'crisis', ':', 'why', 'russia', 'attcks', 'in', '2022', '?']


In [6]:
print(tokenized_test[0])
print(tokenized_test[1])
print(tokenized_test[2])
print(tokenized_test[3])
print(tokenized_test[4])

['regarding', 'stable', 'diffusion', 'models', ',', 'should', 'i', 'use', 'fp16', 'or', 'fp32', '?']
['i', 'am', 'thinking', 'of', 'a', 'childrens', 'cartoon', 'where', 'the', 'main', 'characters', 'initials', 'are', 'as', 'follows', ':', 'ts', 'rd', 'pp', 'aj', 'r', 'f', 'what', 'is', 'the', 'show', '?']
['what', "'", 's', 'difference', 'between', 'ioremap', 'and', 'devm_ioremap', '?']
['who', 'is', 'a', 'multiplan', 'direct', 'competitor', 'in', 'us', '?']
['can', 'you', 'give', 'me', 'ideas', 'for', 'a', 'dnd', 'oneshot', '?']


In the next cell, I explore different preprocessing methods, including:
- punctuation removal
- punctuation and stopwords removal
- lemmatization
- stemming
- lemmatization and punctuation removal
- stemming and punctuation removal
- lemmatization and punctuation and stopwords removal
- stemming and punctuation and stopwords removal

In [237]:
stop_words = set(stopwords.words('english'))

# Punctuation Removal
np_basic_train = remove_punctuation(tokenized_train)
np_basic_dev = remove_punctuation(tokenized_dev)
cleaned_basic_train = remove_stopwords(np_basic_train, stop_words)
cleaned_basic_dev = remove_stopwords(np_basic_dev, stop_words)

# Data stemmed and lemmatized #
lemmatized_train, stemmed_train = lemmatization_and_stemming([' '.join(sentence) for sentence in tokenized_train])
lemmatized_dev, stemmed_dev = lemmatization_and_stemming([' '.join(sentence) for sentence in tokenized_dev])

# Data stemmed and lemmatized without punctuation #
np_lemmatized_train = remove_punctuation(lemmatized_train)
np_stemmed_train = remove_punctuation(stemmed_train)
np_lemmatized_dev = remove_punctuation(lemmatized_dev)
np_stemmed_dev = remove_punctuation(stemmed_dev)

# Data stemmed and lemmatized without punctuation and stopwords #

cleaned_lemmatized_train = remove_stopwords(np_lemmatized_train, stop_words)
cleaned_stemmed_train = remove_stopwords(np_stemmed_train, stop_words)
cleaned_lemmatized_dev = remove_stopwords(np_lemmatized_dev, stop_words)
cleaned_stemmed_dev = remove_stopwords(np_stemmed_dev, stop_words)

### Visualization of the Preprocessing Results

In the following cell I collect all the different train/dev pairs in a useful dict for easy access

In [238]:
data_list = {'basic': [tokenized_train, tokenized_dev],
             'np_basic': [np_basic_train, np_basic_dev],
             'sw_basic': [cleaned_basic_train, cleaned_basic_dev],
             'lemmatized': [lemmatized_train, lemmatized_dev],
             'stemmed': [stemmed_train, stemmed_dev],
             'np_lemmatized': [np_lemmatized_train, np_lemmatized_dev],
             'np_stemmed': [np_stemmed_train, np_stemmed_dev],
             'sw_lemmatized': [cleaned_lemmatized_train, cleaned_lemmatized_dev],
             'sw_stemmed': [cleaned_stemmed_train, cleaned_stemmed_dev]
}

In [239]:
print(f'{"basic non tokenized":<20}: {train_df["user_prompt"].values[0]}')
for key in data_list:
    print(f'{key:<20}: {" ".join(data_list[key][0][0])}')

basic non tokenized : Whats the derivative of cos(2x)?
basic               : whats the derivative of cos ( 2x ) ?
np_basic            : whats the derivative of cos 2x
sw_basic            : whats derivative cos 2x
lemmatized          : what s the derivative of cos ( 2x ) ?
stemmed             : what s the deriv of cos ( 2x ) ?
np_lemmatized       : what s the derivative of cos 2x
np_stemmed          : what s the deriv of cos 2x
sw_lemmatized       : derivative cos 2x
sw_stemmed          : deriv cos 2x


# Discrete Representations

## BOW

The first discrete representation method I will use is Bag of Words (BOW). 

In [240]:
BOW_data_list = apply_BOW(data_list)

In [241]:
for key in BOW_data_list:
    print(f'Shape of Train in {key:<15}: {BOW_data_list[key][0].shape}')

Shape of Train in basic          : (14250, 16185)
Shape of Train in np_basic       : (14250, 16185)
Shape of Train in sw_basic       : (14250, 16164)
Shape of Train in lemmatized     : (14250, 13521)
Shape of Train in stemmed        : (14250, 12167)
Shape of Train in np_lemmatized  : (14250, 13521)
Shape of Train in np_stemmed     : (14250, 12167)
Shape of Train in sw_lemmatized  : (14250, 13503)
Shape of Train in sw_stemmed     : (14250, 12149)


As we can notice, the BOW vectors have many dimensions, but lemmatizing and removing stopwords can somewhat mitigate this issue.

### TESTING

In [242]:
BOW_results = apply_cosine_similarity(BOW_data_list, train_df, dev_df)

In [243]:
for key in dict(sorted(BOW_results.items(), key=lambda x: x[1][1], reverse=True)):
    print(f'BLEU Score of {key:<{15}}: {BOW_results[key][1]:.4f}')

BLEU Score of basic          : 0.0785
BLEU Score of np_basic       : 0.0785
BLEU Score of stemmed        : 0.0784
BLEU Score of np_stemmed     : 0.0784
BLEU Score of sw_basic       : 0.0783
BLEU Score of sw_stemmed     : 0.0782
BLEU Score of lemmatized     : 0.0769
BLEU Score of np_lemmatized  : 0.0769
BLEU Score of sw_lemmatized  : 0.0766


From what we can see above, the model that performs best with BOW is the one where we do not lemmatize nor stem nor remove stopwords (punctuation is removed by default by CountVectorizer)

### EXHAUSTIVE SEARCH

In this section, I will try to optimize the parameters of the BOW function on the model that performed best in the previous step, which is the basic model.

In [None]:
n_gram_parameters = [(i, j) for i in range(1, 8) for j in range(i, 8)]
binary = [False, True]
analyzer = ['char']
results_basic = BOW_exhaustive_search(tokenized_train, tokenized_dev, n_gram_parameters, analyzer, binary, train_df, dev_df)

In [245]:
for key in results_basic:
    print(f'Optimal {key:<10}: {results_basic[key]}')

Optimal n_gram    : (2, 5)
Optimal analyzer  : char
Optimal binary    : True
Optimal score     : 0.08956738592670373


By choosing the above parameters in the BOW function, the BLEU score of the model goes up to 0.089, which is pretty good considering how the BLEU score is structured.

## TFIDF

Given that in the previous model the basic models (i.e. without punctuation and stopword removal) performed better than the other model, I will restrict my attention to them.

In [246]:
TFIDF_data = apply_TFIDF(data_list)

In [247]:
TFIDF_scores = apply_cosine_similarity(TFIDF_data, train_df, dev_df)

In [248]:
for key in dict(sorted(TFIDF_scores.items(), key=lambda x: x[1][1], reverse=True)):
    print(f'BLEU Score of {key:<{15}}: {TFIDF_scores[key][1]:.4f}')

BLEU Score of stemmed        : 0.0809
BLEU Score of np_stemmed     : 0.0809
BLEU Score of basic          : 0.0808
BLEU Score of np_basic       : 0.0808
BLEU Score of sw_stemmed     : 0.0808
BLEU Score of sw_basic       : 0.0804
BLEU Score of lemmatized     : 0.0793
BLEU Score of np_lemmatized  : 0.0793
BLEU Score of sw_lemmatized  : 0.0790


Contrary to the BOW Model, where the basic model would perform better, in this case the stemmed model performs slightly better.

In [249]:
n_gram_parameters = [(i, j) for i in range(1, 8) for j in range(i, 8)]
analyzer = ['char']
binary = [True]
smooth = [True]

In [None]:
results_basic = TFIDF_exhaustive_search(tokenized_train, tokenized_dev, n_gram_parameters, train_df, dev_df)
results_stem = TFIDF_exhaustive_search(stemmed_train, stemmed_dev, n_gram_parameters, train_df, dev_df)

In [251]:
for key in results_basic:
    print(f'BASIC Optimal {key:<10}: {results_basic[key]}')
print('')
for key in results_stem:
    print(f'STEM Optimal {key:<10}: {results_stem[key]}')

BASIC Optimal n_gram    : (2, 4)
BASIC Optimal binary    : True
BASIC Optimal smooth    : True
BASIC Optimal analyzer  : char
BASIC Optimal score     : 0.09111316455243584

STEM Optimal n_gram    : (2, 4)
STEM Optimal binary    : True
STEM Optimal smooth    : True
STEM Optimal analyzer  : char
STEM Optimal score     : 0.09070840640157646


By choosing the parameters found above (which are pretty much the same found in the BOW analysis), the score goes up to 0.091.

## Running on TEST Set

Given that the optimal score between train and dev was found using the TFIDF method with the parameters above, I will use them to predict the most similar prompts to the ones in the TEST. 

In [None]:
track1_data = get_track1_predictions(tokenized_train_and_dev, tokenized_test, train_and_dev_df, test_df)

In [9]:
track1_data.head()

Unnamed: 0,test_prompt,training_prompt,retrieved_response,conversation_id,response_id
0,"Regarding Stable Diffusion models, should I us...",What does stable diffusion mean?,Stable diffusion refers to a process in which ...,0cf125095fa74e129f9b7b6054d2993e,5e74391122d64c459554b0a5cebf7027
1,I am thinking of a childrens cartoon where the...,"Traditionally, in a plot arc, what is the poin...",The point where the main character is at their...,e6296e2a7a554a3db3152704d065498e,0bdbc95062f94eb2a0fbc835ef787321
2,what's difference between ioremap and devm_ior...,What's difference between you and chatGPT?,I'm a language model trained by researchers fr...,ee22ccf57c064f5f955f1fd2f9ed5e90,c56f17fa58ed4eafa678ae2aac96de5d
3,Who is a Multiplan direct competitor in US?,what are the best chatgpt competitors?,There are several notable competitors to ChatG...,f5ef6be6d11746e39ec404496c307ab8,ece901472bf84d0cb3b1eeb6b0607b1b
4,Can you give me ideas for a dnd oneshot?,Give me ideas for a mothers day gift?,Here are a few ideas for Mother's Day gifts:\n...,1fcea667861046d1834b17e7851dcca4,cad50072d7874e66b7cc223ba1f91fd8


In [13]:
track1_data[['conversation_id', 'response_id']].to_csv('track1.csv', index=False)

In [14]:
for i in range(3):
    print(f"TEST QUESTION: {track1_data[['test_prompt', 'training_prompt', 'retrieved_response']].iloc[i][0]}")
    print(f"RETRIEVED QUESTION: {track1_data[['test_prompt', 'training_prompt', 'retrieved_response']].iloc[i][1]}")
    print(f"RETRIEVED ANSWER: {track1_data[['test_prompt', 'training_prompt', 'retrieved_response']].iloc[i][2][:200]}...")
    print()

TEST QUESTION: Regarding Stable Diffusion models, should I use fp16 or fp32?
RETRIEVED QUESTION: What does stable diffusion mean?
RETRIEVED ANSWER: Stable diffusion refers to a process in which a substance or particle spreads out evenly across a medium, such as a gas or liquid, over a long period of time. This process is often used to describe th...

TEST QUESTION: I am thinking of a childrens cartoon where the main characters initials are as follows: TS RD PP AJ R F What is the show?
RETRIEVED QUESTION: Traditionally, in a plot arc, what is the point where the main character is at their lowest?
RETRIEVED ANSWER: The point where the main character is at their lowest in a plot arc is often referred to as the "low point" or "all is lost" moment. This is the point in the story where it seems like all hope is lost...

TEST QUESTION: what's difference between ioremap and devm_ioremap?
RETRIEVED QUESTION: What's difference between you and chatGPT?
RETRIEVED ANSWER: I'm a language model train

# Static Representation

In this section, I will try different Static continuous representation methods, to eventually determine the one which works best.

## FastText Pretrained Model

In [276]:
import fasttext.util

In [277]:
ft = fasttext.load_model('extra/cc.en.300.bin')  # THE PATH HAS TO BE SUBTITUTED WITH THE PATH WHERE THE ENGLISH FASTTEXT PRETRAINED MODEL IS STORED



In [278]:
fasttext_data = apply_fasttext_model_multiple(ft, data_list)

In [279]:
fasttext_scores = apply_cosine_similarity(fasttext_data, train_df, dev_df)

In [280]:
for key in dict(sorted(fasttext_scores.items(), key=lambda x: x[1][1], reverse=True)):
    print(f'BLEU Score of {key:<{15}}: {fasttext_scores[key][1]:.4f}')

BLEU Score of sw_basic       : 0.0814
BLEU Score of sw_lemmatized  : 0.0802
BLEU Score of sw_stemmed     : 0.0791
BLEU Score of np_basic       : 0.0723
BLEU Score of np_stemmed     : 0.0721
BLEU Score of np_lemmatized  : 0.0707
BLEU Score of stemmed        : 0.0702
BLEU Score of basic          : 0.0700
BLEU Score of lemmatized     : 0.0696


## Word2Vec Pretrained Model

In [18]:
model_path = '/Users/alessandroardenghi/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz' # TO BE SUBSTITUTED WITH THE PATH TO THE DOWNLOADED WORD2VEC MODEL
model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [282]:
word2vec_data = apply_word2vec_model_multiple(model, data_list)

In [283]:
pretrained_word2vec_scores = apply_cosine_similarity(word2vec_data, train_df, dev_df)

In [284]:
for key in dict(sorted(pretrained_word2vec_scores.items(), key=lambda x: x[1][1], reverse=True)):
    print(f'BLEU Score of {key:<{15}}: {pretrained_word2vec_scores[key][1]:.4f}')

BLEU Score of np_basic       : 0.0892
BLEU Score of basic          : 0.0892
BLEU Score of lemmatized     : 0.0875
BLEU Score of sw_basic       : 0.0874
BLEU Score of np_lemmatized  : 0.0873
BLEU Score of sw_lemmatized  : 0.0854
BLEU Score of np_stemmed     : 0.0837
BLEU Score of stemmed        : 0.0834
BLEU Score of sw_stemmed     : 0.0781


The best model among the ones I tried is the pretrained Word2Vec model, and therefore I will use it to perform the prediction on the test set.

## Prediction on TEST set

### Running on TEST Set

In [19]:
track2_data = get_track2_predictions(model, tokenized_train_and_dev, tokenized_test, train_and_dev_df, test_df)

In [21]:
track2_data.head()

Unnamed: 0,test_prompt,training_prompt,retrieved_response,conversation_id,response_id
0,"Regarding Stable Diffusion models, should I us...",How do I use diffusion model implemented by Te...,TensorFlow provides a high-level API for train...,0cf125095fa74e129f9b7b6054d2993e,58c4df897ad840c2abd459e31f814c01
1,I am thinking of a childrens cartoon where the...,.hi.! i calculated mann-whitney W as 47.do you...,"Yes, in R you can use the `pwcorr()` function ...",e6296e2a7a554a3db3152704d065498e,e4dd44b9934e4b3592abef0254496080
2,what's difference between ioremap and devm_ior...,What's the difference between Kennzeichnungspf...,Kennzeichnungspflicht and Mitteilungspflicht a...,ee22ccf57c064f5f955f1fd2f9ed5e90,2ffe05c573d24ac9b634a5511e71285e
3,Who is a Multiplan direct competitor in US?,What is your position in a race if you pass th...,"If you pass the person who is in second place,...",f5ef6be6d11746e39ec404496c307ab8,8a09f42a72c0419e97c0b65851d421f0
4,Can you give me ideas for a dnd oneshot?,can you gave me a game idea for roblox?,Sure! Here's a game idea for Roblox:\n\nTitle:...,1fcea667861046d1834b17e7851dcca4,4aadfb4fba2b44b8b46bac51ba1b5a70


In [22]:
for i in range(3):
    print(f"TEST QUESTION: {track2_data[['test_prompt', 'training_prompt', 'retrieved_response']].iloc[i][0]}")
    print(f"RETRIEVED QUESTION: {track2_data[['test_prompt', 'training_prompt', 'retrieved_response']].iloc[i][1]}")
    print(f"RETRIEVED ANSWER: {track2_data[['test_prompt', 'training_prompt', 'retrieved_response']].iloc[i][2][:200]}...")
    print()

TEST QUESTION: Regarding Stable Diffusion models, should I use fp16 or fp32?
RETRIEVED QUESTION: How do I use diffusion model implemented by TensorFlow?
RETRIEVED ANSWER: TensorFlow provides a high-level API for training and using diffusion models. Here are the basic steps to get started:

1. Import the required modules and classes:
```python
import tensorflow as tf
fr...

TEST QUESTION: I am thinking of a childrens cartoon where the main characters initials are as follows: TS RD PP AJ R F What is the show?
RETRIEVED QUESTION: .hi.! i calculated mann-whitney W as 47.do you know how to get the corresponding p-value in R language.?
RETRIEVED ANSWER: Yes, in R you can use the `pwcorr()` function from the `pwcorr` package to calculate the Mann-Whitney U statistic and the associated p-value.

Here's an example of how you can use it:
```
# install th...

TEST QUESTION: what's difference between ioremap and devm_ioremap?
RETRIEVED QUESTION: What's the difference between Kennzeichnungspflicht 

In [None]:
track2_data[['conversation_id', 'response_id']].to_csv('track2.csv', index=False)

# Advanced Methods

In this section, I try out some more advanced sentence embedding methods, namely Sentence Transformers. Of course, I will not attempt to train one myself, but rather I will use some pretrained models. Given the long time it takes to some of these models to generate the sentence embeddings, I have carried out all the embedding generation and saved it in a directory called sentence_tranformer_data. <br>
I will however leave the code I used to obtain the embeddings and all the models that I used in a commented block, so that my results can be replicated.
The models I tried are:
- all-MiniLM-L6-v2
- multi-qa-mpnet-base-dot-v1
- all-mpnet-base-v2
- gte-small

In [1]:
#from sentence_transformers import SentenceTransformer
# import tqdm

# model1 = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# model2 = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# model3 = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')
# model4 = SentenceTransformer('thenlper/gte-small')


#encoded_queries = []

#for user_prompt in tqdm.tqdm(train_df['user_prompt'].values, desc="Encoding queries", total=len(train_df)):
#    encoded_query = model.encode(user_prompt)  
#    encoded_queries.append(encoded_query)  

#encoded_queries = np.array(encoded_queries)
#file_path = ''

#np.save(file_path, encoded_queries)

#print(f"Encoded queries saved to '{file_path}'")

In [10]:
sentence_embeddings = {'all-MiniLM-L6-v2': [np.load('sentence_transformer_data/allMini_train_queries.npy'), 
                                            np.load('sentence_transformer_data/allMini_dev_queries.npy')],
                       'multi-qa-mpnet-base-dot-v1': [np.load('sentence_transformer_data/multiqa_train_queries.npy'), 
                                                      np.load('sentence_transformer_data/multiqa_dev_queries.npy')],
                       'all-mpnet-base-v2': [np.load('sentence_transformer_data/mpnet_train_queries.npy'),
                                             np.load('sentence_transformer_data/mpnet_dev_queries.npy')],
                       'gte-small': [np.load('sentence_transformer_data/gte_train_queries.npy'),
                                     np.load('sentence_transformer_data/gte_dev_queries.npy')]   
}

In [89]:
scores = apply_cosine_similarity(sentence_embeddings, train_df, dev_df)

In [10]:
for key in dict(sorted(scores.items(), key=lambda x: x[1][1], reverse=True)):
    print(f'BLEU Score of {key:<{30}}: {scores[key][1]:.4f}')

BLEU Score of all-mpnet-base-v2             : 0.1079
BLEU Score of multi-qa-mpnet-base-dot-v1    : 0.1035
BLEU Score of all-MiniLM-L6-v2              : 0.1024
BLEU Score of gte-small                     : 0.1002


I will therefore use the all-mpnet-base-v2 model to perform the prediction on the test set. In the following cells I will get the sentence embeddings of the train_and_dev_df and the test_df, I will save them and then use them to perform the predictions.

In [12]:
encoded_queries = []
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

for user_prompt in tqdm.tqdm(test_df['user_prompt'].values, desc="Encoding test queries", total=len(test_df)):
    encoded_query = model.encode(user_prompt)  
    encoded_queries.append(encoded_query)  

encoded_queries = np.array(encoded_queries)
file_path = 'sentence_transformer_data/FINAL_test_queries.npy'

np.save(file_path, encoded_queries)

print(f"Encoded queries saved to '{file_path}'")

Encoding test queries: 100%|██████████| 5000/5000 [24:31<00:00,  3.40it/s]

Encoded queries saved to 'sentence_transformer_data/FINAL_test_queries.npy'





In [23]:
train_embeddings = np.load('sentence_transformer_data/multiqa_train_queries.npy')
dev_embeddings = np.load('sentence_transformer_data/multiqa_dev_queries.npy')
dev_embeddings = np.delete(dev_embeddings, dev_nan_indices, axis=0) # Removing from the dev_embeddings array the rows with NAN entries
test_embeddings = np.load('sentence_transformer_data/FINAL_test_queries.npy')

train_and_dev_embeddings = np.vstack((train_embeddings, dev_embeddings))

In [24]:
track3_data = get_track3_predictions(train_and_dev_embeddings, test_embeddings, train_and_dev_df, test_df)

In [25]:
track3_data.head()

Unnamed: 0,test_prompt,training_prompt,retrieved_response,conversation_id,response_id
0,"Regarding Stable Diffusion models, should I us...",What’s the best laptop for stable diffusion?,The best laptop for stable diffusion would dep...,0cf125095fa74e129f9b7b6054d2993e,84b2731d868641318191ef9598dc9c57
1,I am thinking of a childrens cartoon where the...,Do you know that game when each letters is rep...,"Yes, I'm familiar with that type of game! It's...",e6296e2a7a554a3db3152704d065498e,6b4fac11954941f8b3f3f434878e639c
2,what's difference between ioremap and devm_ior...,What is the difference between %i and %d in st...,In Python's string formatting using the `%` op...,ee22ccf57c064f5f955f1fd2f9ed5e90,98b8fee9722a403986efc8bf97014ecb
3,Who is a Multiplan direct competitor in US?,what are the top 3 life insurance companies in...,It is difficult to determine the top life insu...,f5ef6be6d11746e39ec404496c307ab8,852dc1bd8e9841bb8b0d388d08d58cb7
4,Can you give me ideas for a dnd oneshot?,Can you give me some creative ideas for a birt...,Here are a few ideas for a birthday present fo...,1fcea667861046d1834b17e7851dcca4,0b6cd1a34e1c41e6895367461b36fa9f


In [26]:
for i in range(3):
    print(f"TEST QUESTION: {track3_data[['test_prompt', 'training_prompt', 'retrieved_response']].iloc[i][0]}")
    print(f"RETRIEVED QUESTION: {track3_data[['test_prompt', 'training_prompt', 'retrieved_response']].iloc[i][1]}")
    print(f"RETRIEVED ANSWER: {track3_data[['test_prompt', 'training_prompt', 'retrieved_response']].iloc[i][2][:200]}...")
    print()

TEST QUESTION: Regarding Stable Diffusion models, should I use fp16 or fp32?
RETRIEVED QUESTION: What’s the best laptop for stable diffusion?
RETRIEVED ANSWER: The best laptop for stable diffusion would depend on your specific needs and budget. However, here are a few factors to consider when choosing a laptop for diffusion:

1. Processor: A powerful process...

TEST QUESTION: I am thinking of a childrens cartoon where the main characters initials are as follows: TS RD PP AJ R F What is the show?
RETRIEVED QUESTION: Do you know that game when each letters is replaced by another and you need to decrypt ?
RETRIEVED ANSWER: Yes, I'm familiar with that type of game! It's called a "substitution cipher," and it's a classic example of a cryptographic technique that has been used throughout history to hide the meaning of a me...

TEST QUESTION: what's difference between ioremap and devm_ioremap?
RETRIEVED QUESTION: What is the difference between %i and %d in string formatting?
RETRIEVED ANSWER

In [26]:
track3_data[['conversation_id', 'response_id']].to_csv('track3.csv', index=False)