In [9]:
import ast
import hashlib
import numpy as np
import pandas as pd
from sentence_transformers import datasets, evaluation, InputExample, losses, models, SentenceTransformer
from sklearn.model_selection import train_test_split
import torch


  from .autonotebook import tqdm as notebook_tqdm


# Create Datasets from Test Data

In [2]:
## Product Data

df_test = pd.read_csv('raw_data/test.csv', encoding = 'ISO-8859-1')
df_solution = pd.read_csv('raw_data\solution.csv', encoding = 'ISO-8859-1')
df = df_test.merge(df_solution, how = 'left', on = 'id')

df_desc = pd.read_csv('raw_data/product_descriptions.csv', encoding = 'ISO-8859-1')
df_attr = pd.read_csv('raw_data/attributes.csv', encoding = 'ISO-8859-1')

In [3]:
# Products Dataset creation
df_prods = df[['product_uid', 'product_title']].drop_duplicates().reset_index(drop=True)

# Merge in descriptions
df_prods = df_prods.merge(df_desc, how = 'left', on = 'product_uid')

# Collect attributes
attr_ids = []
attr_dicts = []
for id in df_attr['product_uid'].unique():
    attr_ids.append(id)
    attr_tmp = df_attr[df_attr['product_uid']==id]
    if len(attr_tmp)>0:
        attrs = {}
        for index, row in attr_tmp.iterrows():
            attrs[row['name']] = row['value']
        attr_dicts.append(attrs)
    else:
        attr_dicts.append(None)

df_attributes = pd.DataFrame({'product_uid': attr_ids,
                              'product_attributes': attr_dicts})     

# Merge in attributes
df_prods = df_prods.merge(df_attributes, how = 'left', on = 'product_uid')

df_prods.to_csv('processed_data/test_data/df_test_prods.csv', index=False)

In [11]:
def convert_to_dict(string):
    try:
        return ast.literal_eval(string)
    except (SyntaxError, ValueError):
        return None  # Handle cases where the string is not a valid dictionary representation

def dict_to_string(my_dict):
    result_str = ""
    for key, value in my_dict.items():
        result_str += str(key) + ' ' + str(value) + ' '
    return result_str

In [10]:
df_prods = pd.read_csv('processed_data/test_data/df_test_prods.csv')

df_prods['product_attributes'] = df_prods['product_attributes'].apply(convert_to_dict)
df_prods['product_attributes_string'] = [dict_to_string(x) if x is not None else x for x in df_prods['product_attributes']]
df_prods['product_text_string'] = df_prods['product_title'].fillna('') + ' ' + df_prods['product_description'].fillna('') + ' ' + df_prods['product_attributes_string'].fillna('') 

In [12]:
def generate_id(text):
    hash_object = hashlib.sha256(text.encode('utf-8'))  # SHA-256 example
    hex_digest = hash_object.hexdigest()
    return hex_digest

In [13]:
## Query Data
df_queries = pd.DataFrame(df['search_term'].unique()).rename(columns={0: 'search_term'})
df_queries['query_id'] = [generate_id(x) for x in df_queries['search_term']]
df_queries['has_relevant_results'] = df_queries['search_term'].isin(df[df['relevance']>0]['search_term']).astype(int)

df_queries.to_csv('processed_data/test_data/df_test_queries.csv', index=False)

df_queries.head()

Unnamed: 0,search_term,query_id,has_relevant_results
0,90 degree bracket,d9a0dddfafa498b8042cca1c26f061679dcb3a23840079...,1
1,metal l brackets,84c87102bd2789d4d946b3924f319f466923a69c3cc373...,1
2,simpson sku able,f15e1eda53817ef17b26ef45977492190f604db7a12ce4...,1
3,simpson strong ties,a74edca209bbc084e7d72b351203715898da0e5561f11b...,1
4,simpson strong tie hcc668,2107aa7f3726c800f22cf7375517e6a02ed82f772fcb81...,1


In [14]:
## Relevance

df_relevance = df.merge(df_queries, how = 'left', on = 'search_term')
df_relevance = df_relevance[['query_id', 'product_uid', 'relevance']]
df_relevance.to_csv('processed_data/test_data/df_test_relevance.csv', index=False)
df_relevance.head()

Unnamed: 0,query_id,product_uid,relevance
0,d9a0dddfafa498b8042cca1c26f061679dcb3a23840079...,100001,-1.0
1,84c87102bd2789d4d946b3924f319f466923a69c3cc373...,100001,2.33
2,f15e1eda53817ef17b26ef45977492190f604db7a12ce4...,100001,2.33
3,a74edca209bbc084e7d72b351203715898da0e5561f11b...,100001,2.67
4,2107aa7f3726c800f22cf7375517e6a02ed82f772fcb81...,100001,2.0


In [15]:
# Remove any query product pairs that are in our previously embedded set
df_train_relevance = pd.read_csv('processed_data/df_relevance.csv')

identified_duplicates = pd.merge(df_relevance, df_train_relevance[['query_id', 'product_uid']], 
                                  on=['query_id', 'product_uid'], how='inner')

df_relevance = df_relevance[~df_relevance.index.isin(identified_duplicates.index)]

# Fine-Tuning

## Data Preperation

In [16]:
prompt = 'Represent this sentence for searching relevant passages: '

positive_df = df_relevance[df_relevance['relevance']==3].copy()
positive_df = positive_df.merge(df_prods[['product_uid', 'product_text_string']], how = 'left', on = 'product_uid')
positive_df = positive_df.merge(df_queries[['search_term', 'query_id']], how = 'left', on = 'query_id')
positive_df['search_query'] = [prompt + str(x) for x in positive_df['search_term']]

train_data, test_data = train_test_split(positive_df, test_size=0.1, random_state=12)

In [17]:
batch_size = 16

train_examples = [
    InputExample(texts=[t1, t2])
    for t1, t2 in zip(
        train_data['search_query'], train_data['product_text_string']
    )
]

loader = datasets.NoDuplicatesDataLoader(
    train_examples, batch_size=batch_size
)

In [18]:
# Take positive pairs and create random negative pairs from the test set
pos_test = test_data[['product_text_string', 'search_query']].reset_index(drop=True).copy()
pos_test['similarity'] = 1

np.random.seed(12)
pos_test['rand_merge_1'] = np.random.choice(range(len(pos_test)), size=len(pos_test), replace=False)
pos_test['rand_merge_2'] = np.random.choice(range(len(pos_test)), size=len(pos_test), replace=False)

neg_test = pos_test[['product_text_string', 'rand_merge_1']].merge(pos_test[['search_query', 'rand_merge_2']], how = 'left', left_on = 'rand_merge_1', right_on = 'rand_merge_2')
neg_test['similarity'] = 0

pos_neg_test = pd.concat([pos_test, neg_test])
pos_neg_test = pos_neg_test.sample(frac=1)


evaluator = evaluation.EmbeddingSimilarityEvaluator(
    list(pos_neg_test["search_query"]),
    list(pos_neg_test["product_text_string"]),
    [float(x) for x in pos_neg_test["similarity"]],
    batch_size=batch_size
)

## Load Baseline and Fine-Tune Model

In [19]:
import os
os.environ['WANDB_DISABLED'] = 'true'

In [20]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

transformer = models.Transformer('Snowflake/snowflake-arctic-embed-m')

pooler = models.Pooling(
    transformer.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

model = SentenceTransformer(
    modules=[transformer, pooler],
    device=device
)
print(model)

Using cpu device


Some weights of BertModel were not initialized from the model checkpoint at Snowflake/snowflake-arctic-embed-m and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)


In [None]:
pretrain_similarity = evaluator(model)
pretrain_similarity

```{'pearson_cosine': 0.8764524487588595,
 'spearman_cosine': 0.8487036957203093,
 'pearson_manhattan': 0.828606164662169,
 'spearman_manhattan': 0.8285303972567875,
 'pearson_euclidean': 0.8315322713504393,
 'spearman_euclidean': 0.83022571909546,
 'pearson_dot': 0.8748335656385433,
 'spearman_dot': 0.8492356027098478,
 'pearson_max': 0.8764524487588595,
 'spearman_max': 0.8492356027098478}```

In [21]:
loss = losses.MultipleNegativesRankingLoss(model)

epochs = 10
warmup_steps = int(len(loader) * epochs * 0.1)

num_steps_per_epoch = len(loader)
total_steps = (len(loader) / batch_size) * epochs
eval_steps = num_steps_per_epoch // 2

In [None]:
model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    evaluator=evaluator,
    evaluation_steps=eval_steps,
    warmup_steps=warmup_steps,
    show_progress_bar=True,
)

| Step  | Training Loss | Validation Loss | Pearson Cosine | Spearman Cosine | Pearson Manhattan | Spearman Manhattan | Pearson Euclidean | Spearman Euclidean | Pearson Dot | Spearman Dot | Pearson Max | Spearman Max |
|-------|---------------|-----------------|----------------|-----------------|-------------------|--------------------|-------------------|--------------------|-------------|--------------|-------------|--------------|
| 806   | 0.135900      | No log          | 0.924130       | 0.859629        | 0.906357          | 0.858088           | 0.907278          | 0.858170           | 0.916731    | 0.859120     | 0.924130    | 0.859629     |
| 1612  | 0.063700      | No log          | 0.930849       | 0.860865        | 0.915001          | 0.860061           | 0.915904          | 0.860171           | 0.921644    | 0.860218     | 0.930849    | 0.860865     |
| 2418  | 0.049000      | No log          | 0.933863       | 0.861664        | 0.918908          | 0.861364           | 0.919644          | 0.861518           | 0.924173    | 0.860963     | 0.933863    | 0.861664     |
| 3224  | 0.017200      | No log          | 0.935362       | 0.861663        | 0.919747          | 0.861529           | 0.920262          | 0.861535           | 0.926153    | 0.861065     | 0.935362    | 0.861663     |
| 4030  | 0.007400      | No log          | 0.935997       | 0.862507        | 0.919293          | 0.862325           | 0.919973          | 0.862370           | 0.926513    | 0.861935     | 0.935997    | 0.862507     |
| 4836  | 0.006000      | No log          | 0.936650       | 0.862111        | 0.919439          | 0.861762           | 0.920107          | 0.861893           | 0.927554    | 0.861589     | 0.936650    | 0.862111     |
| 5642  | 0.004000      | No log          | 0.937880       | 0.862215        | 0.920892          | 0.862062           | 0.921706          | 0.862196           | 0.929511    | 0.861718     | 0.937880    | 0.862215     |
| 6448  | 0.003300      | No log          | 0.938330       | 0.862047        | 0.921047          | 0.861927           | 0.921739          | 0.862072           | 0.930045    | 0.861569     | 0.938330    | 0.862072     |
| 7254  | 0.002100      | No log          | 0.936754       | 0.862297        | 0.918734          | 0.862152           | 0.919268          | 0.862269           | 0.927864    | 0.861827     | 0.936754    | 0.862297     |
| 8060  | 0.001500      | No log          | 0.938134       | 0.861851        | 0.920651          | 0.861894           | 0.921173          | 0.862000           | 0.928700    | 0.861206     | 0.938134    | 0.862000     |
| 8866  | 0.001500      | No log          | 0.938488       | 0.862100        | 0.920412          | 0.862171           | 0.921088          | 0.862335           | 0.930219    | 0.861655     | 0.938488    | 0.862335     |
| 9672  | 0.001300      | No log          | 0.939361       | 0.861918        | 0.921635          | 0.862023           | 0.922244          | 0.862159           | 0.930565    | 0.861448     | 0.939361    | 0.862159     |
| 10478 | 0.001000      | No log          | 0.939014       | 0.862050        | 0.920826          | 0.862177           | 0.921358          | 0.862278           | 0.930214    | 0.861611     | 0.939014    | 0.862278     |
| 11284 | 0.001500      | No log          | 0.939657       | 0.861916        | 0.921790          | 0.862133           | 0.922281          | 0.862239           | 0.930523    | 0.861401     | 0.939657    | 0.862239     |
| 12090 | 0.000600      | No log          | 0.939410       | 0.862178        | 0.921517          | 0.862399           | 0.922049          | 0.862521           | 0.930494    | 0.861686     | 0.939410    | 0.862521     |
| 12896 | 0.000800      | No log          | 0.938943       | 0.861639        | 0.920460          | 0.862017           | 0.920930          | 0.862122           | 0.930085    | 0.861131     | 0.938943    | 0.862122     |
| 13702 | 0.000400      | No log          | 0.938404       | 0.861748        | 0.919368          | 0.862102           | 0.919783          | 0.862244           | 0.929538    | 0.861267     | 0.938404    | 0.862244     |
| 14508 | 0.000400      | No log          | 0.938728       | 0.861622        | 0.919922          | 0.862018           | 0.920329          | 0.862139           | 0.929563    | 0.861129     | 0.938728    | 0.862139     |
| 15314 | 0.000300      | No log          | 0.938327       | 0.861611        | 0.919238          | 0.862039           | 0.919665          | 0.862166           | 0.929277    | 0.861128     | 0.938327    | 0.862166     |
| 16120 | 0.000300      | No log          | 0.938794       | 0.861668        | 0.919764          | 0.862051           | 0.920197          | 0.862189           | 0.929890    | 0.861168     | 0.938794    | 0.862189     |

In [None]:
finetune_similarity = evaluator(model)
finetune_similarity

```{'pearson_cosine': 0.9387940952734546,
 'spearman_cosine': 0.861667612181049,
 'pearson_manhattan': 0.9197639425972355,
 'spearman_manhattan': 0.8620514864829285,
 'pearson_euclidean': 0.9201973979926452,
 'spearman_euclidean': 0.8621892011430952,
 'pearson_dot': 0.9298899159239588,
 'spearman_dot': 0.861167712275619,
 'pearson_max': 0.9387940952734546,
 'spearman_max': 0.8621892011430952}```