# Exploratory Data Analysis

In [1]:
from sentence_transformers import SentenceTransformer

In [1]:
from datasets import load_dataset

In [4]:
train_dataset = load_dataset("trec-product-search/Product-Search-Triples", split="train")

In [5]:
train_dataset

Dataset({
    features: ['query_id', 'query', 'positive_passages', 'negative_passages'],
    num_rows: 20888
})

Dataset is made such that, for each text query we have multiple positive and negative passages.

Considering that, on average, we have 20 positive passages per anchor and 100 negatives, this means we have 20888*20 = 417760 (anchor, positive) pairs. Each of this pair will have approx 100 negative pairs to be trained with --> **4M rows in total**. 

For the assigment purposes, is not possible to train on the whole dataset, therefore I decided to down-sample it.

**How do we downsample it for training?**
- I take only **one positive and negative passage per query** - the idea behidn is to at least guarantee 100% coverage for all the queries 
- If still doesn't work I will randomly downsample the remaining dataet (spoiler: I will have to do it)

**Which loss function is suited for the dataset?**  
Considering the simplified setup, both MultipleNegativesRankingLoss and TripletLoss would work.

In [6]:
cnt = 0
for passages in train_dataset['positive_passages']:
    for passage in passages:
        cnt += 1
print("train: " + str(cnt/len(train_dataset)))

train: 18.77245308310992


In [7]:
cnt = 0
for passages in train_dataset['negative_passages']:
    for passage in passages:
        cnt += 1
print("train: " + str(cnt/len(train_dataset)))

train: 98.42378399080812


### Investigating query lenght against positive and negative passages

There are three main insights about this:
- in general texts are short and none that exceeded token limit of 512
- query lenght is shorter than title lenght
- test dataset do not contain any passage

Given that:
- I choose a max_len that is 99th percentile to optimise model training and inference (since I'm working on local cpu)
- I choose pre-trained models specialised in asymmetric semantic search retrieval (more info https://sbert.net/examples/applications/semantic-search/README.html#examples)
- For model evaluation I will split dev dataset into two, one for validation and one for testing against the baseline

**What about texts longer than 99th percentile??**  
I am going to truncate the remaining aprt of the text, since 99th percentile is way lower the max model lenght.  

I am not going to consider remaining chunks because it would over complicate things since the tokenizer would create n>1 chunks for each query:
- I would need to decide how to treat them: for example I might want to add some stride among chunks
- I would define some data processing to give each chunk same query
- I would consider custom training: for example a custom loss which is the mean across the  chunks because 


In [10]:
# data processing accordingly to what I've said above to simplify the calculations
def process_dataset(dataset):
    def process_triplet(example):
        has_valid_data = (
            isinstance(example['positive_passages'], list) and 
            isinstance(example['negative_passages'], list) and 
            len(example['positive_passages']) > 0 and 
            len(example['negative_passages']) > 0
        )
        
        if has_valid_data:
            processed =  {
                'query': example['query'],
                'positive_passages': example['positive_passages'][0]['title'],
                'negative_passages': example['negative_passages'][0]['title']
            }
        else:
            # Return a placeholder with is_valid=False instead of None
            processed = {
                'query': '',  # Empty placeholder
                'positive_passages': '',
                'negative_passages': ''
            }
    
        # Add a validation flag
        processed['valid'] = bool(has_valid_data)
        return processed
    
    # Apply the processing function to the dataset
    processed_dataset = dataset.map(process_triplet)
    # Then filter to keep only valid examples
    final_dataset = processed_dataset.filter(lambda x: x['valid'])
    
    # Remove the validation flag since we don't need it anymore
    final_dataset = final_dataset.remove_columns(['query_id','valid'])
    return final_dataset

In [11]:
train_dataset = process_dataset(load_dataset("trec-product-search/Product-Search-Triples", split="train"))
eval_dataset = process_dataset(load_dataset("trec-product-search/Product-Search-Triples", split="dev"))
test_dataset = process_dataset(load_dataset("trec-product-search/Product-Search-Triples", split="test"))

In [14]:
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/msmarco-MiniLM-L-6-v3')
# Function to calculate lengths
def get_length_stats(texts):
    lengths = [len(tokenizer.encode(text)) for text in texts]
    return {
        'mean': np.mean(lengths),
        'std': np.std(lengths),
        'min': np.min(lengths),
        'max': np.max(lengths),
        'distribution': lengths
    }

anchor_stats = get_length_stats(train_dataset['query'])
positive_stats = get_length_stats(train_dataset['positive_passages'])
negative_stats = get_length_stats(train_dataset['negative_passages'])

# Print statistics
print(f"Anchor texts - Mean: {anchor_stats['mean']:.2f}, 99th: {anchor_stats['mean']+anchor_stats['std']*2.3:.2f}, Max: {anchor_stats['max']:.2f}, Min: {anchor_stats['min']:.2f}")
print(f"Positive texts - Mean: {positive_stats['mean']:.2f}, 99th: {positive_stats['mean']+positive_stats['std']*2.3:.2f}, Max: {positive_stats['max']:.2f}, Min: {positive_stats['min']:.2f}")
print(f"Negative texts - Mean: {negative_stats['mean']:.2f}, 99th: {negative_stats['mean']+ negative_stats['std']*2.3:.2f}, Max: {negative_stats['max']:.2f}, Min: {negative_stats['min']:.2f}")


Anchor texts - Mean: 7.10, 99th: 12.67, Max: 38.00, Min: 3.00
Positive texts - Mean: 28.79, 99th: 62.13, Max: 109.00, Min: 2.00
Negative texts - Mean: 28.98, 99th: 61.44, Max: 322.00, Min: 2.00


In [16]:
anchor_stats = get_length_stats(eval_dataset['query'])
positive_stats = get_length_stats(eval_dataset['positive_passages'])
negative_stats = get_length_stats(eval_dataset['negative_passages'])

# Print statistics
print(f"Anchor texts - Mean: {anchor_stats['mean']:.2f}, 99th: {anchor_stats['mean']+anchor_stats['std']*2.3:.2f}, Max: {anchor_stats['max']:.2f}, Min: {anchor_stats['min']:.2f}")
print(f"Positive texts - Mean: {positive_stats['mean']:.2f}, 99th: {positive_stats['mean']+positive_stats['std']*2.3:.2f}, Max: {positive_stats['max']:.2f}, Min: {positive_stats['min']:.2f}")
print(f"Negative texts - Mean: {negative_stats['mean']:.2f}, 99th: {negative_stats['mean']+ negative_stats['std']*2.3:.2f}, Max: {negative_stats['max']:.2f}, Min: {negative_stats['min']:.2f}")


Anchor texts - Mean: 7.09, 99th: 12.60, Max: 36.00, Min: 3.00
Positive texts - Mean: 28.68, 99th: 61.84, Max: 117.00, Min: 2.00
Negative texts - Mean: 28.87, 99th: 61.01, Max: 172.00, Min: 2.00


In [19]:
test_dataset

Dataset({
    features: ['query', 'positive_passages', 'negative_passages'],
    num_rows: 0
})

### Investigate if there are too many out of bag words for our model

There are no infrequent words we need to deal with

In [21]:
import numpy as np
from collections import Counter


def analyze_unknown_tokens(texts, text_type):
    """
    Analyzes the presence of unknown tokens in a collection of texts.
    We'll track both the frequency of unknown tokens and which original words tend to cause them.
    """
    # Counter for unknown tokens
    unknown_token_count = 0
    total_token_count = 0
    
    # Track which words often lead to unknown tokens
    problematic_words = Counter()
    
    # The unknown token ID for this tokenizer
    unknown_token_id = tokenizer.unk_token_id
    
    for text in texts:
        # Get both tokens and their IDs
        tokens = tokenizer.encode(text, add_special_tokens=False)
        words = text.split()
        
        # If we find unknown tokens, let's see which words might have caused them
        if unknown_token_id in tokens:
            # Tokenize each word separately to find problematic ones
            for word in words:
                word_tokens = tokenizer.encode(word, add_special_tokens=False)
                if unknown_token_id in word_tokens:
                    problematic_words[word] += 1
        
        # Count unknowns in this text
        unknown_token_count += tokens.count(unknown_token_id)
        total_token_count += len(tokens)
    
    # Calculate percentage
    unknown_percentage = (unknown_token_count / total_token_count * 100) if total_token_count > 0 else 0
    
    print(f"\nAnalysis for {text_type}:")
    print(f"Total tokens: {total_token_count}")
    print(f"Unknown tokens: {unknown_token_count} ({unknown_percentage:.2f}%)")
    
    
    return unknown_percentage, problematic_words

# Analyze query texts
query_unknown = analyze_unknown_tokens(train_dataset['query'], 'Query texts')

# Analyze positive texts
positive_unknown = analyze_unknown_tokens(train_dataset['positive_passages'], 'Positive texts')

# Analyze negative texts
negative_unknown = analyze_unknown_tokens(train_dataset['negative_passages'], 'Negative texts')


Analysis for Query texts:
Total tokens: 105997
Unknown tokens: 0 (0.00%)

Analysis for Positive texts:
Total tokens: 557099
Unknown tokens: 121 (0.02%)

Analysis for Negative texts:
Total tokens: 560969
Unknown tokens: 64 (0.01%)


### Preparing test dataset for IR evaluator. 

For evaluation purposes I prefer to use the **InformationRetrievalEvaluator** to test model performances on semantic search (which is the final goal of the fine tuning).   
For training I will instead use the **TripletEvaluator** which only measure the cosine similarity accuracy.
To do so I need to have a list of relevant passages for each query: I will use the positive passages from each query.  

Test dataset will be split into:
- queries: Dict[str] containing all the test query_id:text values
- relevant_docs: Dict[List[str]], containing for each query_id, the positive passages docid
- corpus: Dict[str], containing mapping docid: text


In [2]:
# split dev dataset into validation dataset for training, and test dataset for model evaluation
eval_dataset_raw = load_dataset("trec-product-search/Product-Search-Triples", split="train")
split_dataset = eval_dataset_raw.train_test_split(test_size=0.05)
eval_dataset = split_dataset['train']
test_dataset = split_dataset['test']

In [3]:
from collections import defaultdict

def process_test_dataset(dataset):
    
    # Initialize our output dictionaries
    queries: Dict[str, str] = {}
    relevant_docs: Dict[str, List[str]] = defaultdict(list)
    corpus: Dict[str, str] = {}
    
    # Process each split in the dataset
    for sample in dataset:
        query_id = sample['query_id']
        query = sample['query']
        positives = sample['positive_passages']
        negatives = sample['negative_passages']
        
        # Store query
        queries[query_id] = query
        
        # Process positive documents
        for pos_doc in positives:
            doc_id = pos_doc['docid']
            title = pos_doc['title']
            relevant_docs[query_id].append(doc_id)
            corpus[doc_id] = title
            
        # Process negative documents (for corpus only)
        for neg_doc in negatives:
            doc_id = neg_doc['docid']
            title = neg_doc['title']
            corpus[doc_id] = title
    
    return queries, relevant_docs, corpus

In [4]:
queries, relevant_docs, corpus = process_test_dataset(test_dataset)

In [7]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator


# Given queries, a corpus and a mapping with relevant documents, the InformationRetrievalEvaluator computes different IR metrics.
model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L-6-v3')
ir_evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    name="trec-dataset-test",
)
results = ir_evaluator(model)

In [8]:
results

{'trec-dataset-test_cosine_accuracy@1': 0.3946360153256705,
 'trec-dataset-test_cosine_accuracy@3': 0.5670498084291188,
 'trec-dataset-test_cosine_accuracy@5': 0.6388888888888888,
 'trec-dataset-test_cosine_accuracy@10': 0.7375478927203065,
 'trec-dataset-test_cosine_precision@1': 0.3946360153256705,
 'trec-dataset-test_cosine_precision@3': 0.33876117496807157,
 'trec-dataset-test_cosine_precision@5': 0.3065134099616858,
 'trec-dataset-test_cosine_precision@10': 0.2522030651340996,
 'trec-dataset-test_cosine_recall@1': 0.023488977943643224,
 'trec-dataset-test_cosine_recall@3': 0.059354064751735096,
 'trec-dataset-test_cosine_recall@5': 0.08781059630679747,
 'trec-dataset-test_cosine_recall@10': 0.14169269594896383,
 'trec-dataset-test_cosine_ndcg@10': 0.282676791488999,
 'trec-dataset-test_cosine_mrr@10': 0.5003679377242592,
 'trec-dataset-test_cosine_map@100': 0.1683177674005804}