In [None]:
import pandas as pd
import logging
import os
import matplotlib.pyplot as plt
import seaborn as sns
from pipeline import (
    load_config,
    SBERTEmbedder,
    BERTEmbedder,
    DataAccessLayer,
    concatenate_columns,
    divide_dataset,
    dataset_subset,
    vectorize_and_save,
    match_compositions,
    result_dist
)
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from losses import get_loss_function
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator

# Semantic Search PoC

## Utilities

In [2]:
# Set up logging
# INFO level on the root logger so progress messages (e.g. the
# "Matching completed" lines logged further down) show up in cell output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration management
# Every path, model name and sampling parameter used below is read from this
# one config object (keys such as 'data_path', 'qa_path', 'results_path', ...).
CONFIG_PATH = 'config.json'
config = load_config(CONFIG_PATH)

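Optional pandas display settings — uncomment the first cell to widen the output while inspecting results, and the second cell to restore the defaults afterwards: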

In [None]:
# Optional: uncomment to change how DataFrames are rendered while inspecting
# results. Left disabled so the notebook runs with pandas' default display.
#pd.set_option('display.float_format', lambda x: f'{x:.2f}') # Format float display to 2 decimal places
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_colwidth', None)

In [None]:
# Optional: uncomment to restore pandas' default display options after
# enabling the custom float format / row / column-width settings.
#pd.reset_option('display.float_format')
#pd.reset_option('display.max_rows')
#pd.reset_option('display.max_colwidth')

## Dataset

In [3]:
# --- Raw inputs ---------------------------------------------------------
# Main dataset, read through the project's data access layer. `dal` is kept
# around because later cells also use it to load/save embedding vectors.
dal = DataAccessLayer(config['data_path'])
dataset = dal.load_data()

# Separate QA table, used only by the "Quick QA" spot-check cells.
qa = pd.read_csv(config["qa_path"])

# --- Inference views ----------------------------------------------------
# Both column lists include CAN_ID.
# NOTE(review): presumably CAN_ID is the key used to decide whether a match
# is correct — confirm against pipeline.match_compositions.
canonical_cols_inference = [
    'CAN_ID',
    'CAN_Title',
    'CAN_Writers_Formatted',
]
matched_cols_inference = [
    'CAN_ID',
    'MATCHED_Title',
    'MATCHED_Writer_1',
]

canonical_df, matched_df = divide_dataset(
    dataset,
    canonical_cols_inference,
    matched_cols_inference,
)

# Seeded sample of the matched side; every model below is evaluated on this
# same subset so their results are comparable.
matched_subset = dataset_subset(
    matched_df,
    sample_size=config['sample_size'],
    seed=config['random_state'],
)

## Implement Pre-trained BERT

In [None]:
# Embedder built from the pre-trained BERT checkpoint named in the config.
bert_embedder = BERTEmbedder(
    model_name=config['bert_model_name'],
    batch_size=64,  # adjust to the available GPU memory
)

# Canonical vectors are cached on disk: reuse the saved file when present,
# otherwise vectorise all canonical compositions.
# NOTE(review): assumes vectorize_and_save writes to the given path so the
# next run hits the cache — confirm in pipeline.
bert_canonical_vectors_path = config['bert_canonical_vectors_path']
if os.path.exists(bert_canonical_vectors_path):
    canonical_vectors = dal.load_vectors(bert_canonical_vectors_path)
else:
    canonical_vectors = vectorize_and_save(
        canonical_df,
        bert_embedder,
        bert_canonical_vectors_path,
        dal,
    )

# Match the sampled compositions against the canonical vectors and persist
# the outcome. NOTE: every model in this notebook writes to the same
# config['results_path'], so the file always holds the latest run only.
results_df = match_compositions(
    matched_subset,
    canonical_vectors,
    bert_embedder,
    canonical_df,
)
results_df.to_csv(config['results_path'], index=False)
logger.info("Matching completed and results saved.")

## BERT Performance Analysis

### Accuracy

In [None]:
# Reload the saved matching results from disk. The file at
# config['results_path'] is overwritten by each matching cell, so this
# reflects whichever matching cell ran most recently.
results = pd.read_csv(config['results_path'])

# Tally of the Correct_Match flag across the evaluated rows.
correct_match_counts = results['Correct_Match'].value_counts()
correct_match_counts

In [None]:
# Correct matches first, and within each group highest similarity first.
# Rebinding instead of inplace=True follows pandas guidance; the result is
# the same and the cell stays idempotent on re-run.
results = results.sort_values(
    ['Correct_Match', 'Similarity_Score'],
    ascending=[False, False],
)

# Correct but non-exact matches (score below 1) whose canonical title does
# not contain 'pump'.
# na=False makes a missing CAN_Title count as "does not contain". Without it,
# str.contains on object-dtype strings yields NaN for missing titles and the
# `~` negation raises TypeError.
# NOTE(review): the reason for excluding 'pump' titles is not documented here
# — confirm it is still wanted.
title_has_pump = results['CAN_Title'].str.contains('pump', na=False)
match_mask = (
    (results['Correct_Match'] == True)  # noqa: E712 - elementwise comparison
    & (results['Similarity_Score'] < 1)
    & ~title_has_pump
)

results[match_mask]

### Similarity score distribution

In [None]:
# Similarity-score distribution for the results loaded above.
# NOTE(review): presumably renders a plot — confirm in pipeline.result_dist.
result_dist(results)

### Quick QA:

In [None]:
# Spot-check: QA rows whose matched title mentions "pull up"
# (case-insensitive via lower()).
# na=False drops rows with a missing MATCHED_Title from the mask; otherwise
# the NaN propagates into the mask and boolean indexing raises ValueError.
mask = qa['MATCHED_Title'].str.lower().str.contains('pull up', na=False)

# One row per distinct matched composition, ordered by canonical title.
qa[mask].drop_duplicates('MATCHED_Comp').sort_values('CAN_Title')

## Implement Pre-trained SBERT

In [None]:
# Load pre-trained SBERT model
sbert_embedder = SBERTEmbedder(model_name=config['sbert_model_name'], batch_size=config['batch_size'])  # Adjust batch size based on your GPU memory

# Vectorize canonical dataset
s_bert_canonical_vectors_path = config['s_bert_canonical_vectors_path']
if not os.path.exists(s_bert_canonical_vectors_path):
    canonical_vectors = vectorize_and_save(canonical_df, sbert_embedder, s_bert_canonical_vectors_path, dal) # Vectorise all canonical compoistions
else:
    canonical_vectors = dal.load_vectors(s_bert_canonical_vectors_path)

results_df = match_compositions(matched_subset, canonical_vectors, sbert_embedder, canonical_df)
results_df.to_csv(config['results_path'], index=False)
logger.info("Matching completed and results saved.")

## SBERT Performance Analysis

### Accuracy

In [None]:
# Reload the saved matching results from disk. The file at
# config['results_path'] is overwritten by each matching cell, so this
# reflects whichever matching cell ran most recently.
results = pd.read_csv(config['results_path'])

# Tally of the Correct_Match flag across the evaluated rows.
correct_match_counts = results['Correct_Match'].value_counts()
correct_match_counts

In [None]:
# Correct matches first, and within each group highest similarity first.
# Rebinding instead of inplace=True follows pandas guidance; the result is
# the same and the cell stays idempotent on re-run.
results = results.sort_values(
    ['Correct_Match', 'Similarity_Score'],
    ascending=[False, False],
)

# Correct but non-exact matches (score below 1) whose canonical title does
# not contain 'pump'.
# na=False makes a missing CAN_Title count as "does not contain". Without it,
# str.contains on object-dtype strings yields NaN for missing titles and the
# `~` negation raises TypeError.
# NOTE(review): the reason for excluding 'pump' titles is not documented here
# — confirm it is still wanted.
title_has_pump = results['CAN_Title'].str.contains('pump', na=False)
match_mask = (
    (results['Correct_Match'] == True)  # noqa: E712 - elementwise comparison
    & (results['Similarity_Score'] < 1)
    & ~title_has_pump
)

results[match_mask]

### Similarity score distribution

In [None]:
# Similarity-score distribution for the results loaded above.
# NOTE(review): presumably renders a plot — confirm in pipeline.result_dist.
result_dist(results)

### Quick QA:

In [None]:
# Spot-check: QA rows whose matched title mentions "pull up"
# (case-insensitive via lower()).
# na=False drops rows with a missing MATCHED_Title from the mask; otherwise
# the NaN propagates into the mask and boolean indexing raises ValueError.
mask = qa['MATCHED_Title'].str.lower().str.contains('pull up', na=False)

# One row per distinct matched composition, ordered by canonical title.
qa[mask].drop_duplicates('MATCHED_Comp').sort_values('CAN_Title')

## Fine-tune SBERT

In [None]:
# Split dataset into training and evaluation
dataset = dataset_subset(dataset, sample_size=1000, seed=27) #move to params
#train_df, eval_df = split_dataset(dataset, sort_column='CAN_ID', random_state=config['random_state'], fine_tuning_ratio=config['fine_tuning_ratio'])

canonical_cols_training = ['CAN_Title', 'CAN_Writers_Formatted']
matched_cols_training = ['MATCHED_Title', 'MATCHED_Writer_1']

canonical_texts, matched_texts = divide_dataset(dataset, canonical_cols_training, matched_cols_training)

canonical_texts = concatenate_columns(canonical_texts, ['CAN_Title', 'CAN_Writers_Formatted'])
matched_texts = concatenate_columns(matched_texts, ['MATCHED_Title', 'MATCHED_Writer_1'])

training_df = pd.concat([canonical_texts, matched_texts], axis=1).reset_index(drop=True) # SentenceTransformerTrainer seems to require df w/ the default index
training_dataset = Dataset.from_pandas(training_df)

# Load pre-trained SBERT model
model = sbert_embedder.model

# Fine-tune the model
# 4. Define a loss function
loss = get_loss_function(model)

# 5. Specify training arguments
args = SentenceTransformerTrainingArguments(
output_dir=config['training_output_dir']
# Optional training parameters:
)

# 6. Create a trainer & train
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset=training_dataset,
#eval_dataset=eval_dataset,
loss=loss,
#evaluator=dev_evaluator
)
trainer.train()

# 7. Save the trained model
model.save_pretrained(config['fine_tuned_model_name'])

## Test the Fine-tuned Model

In [None]:
# Load the fine-tuned SBERT model
sbert_embedder.model = SentenceTransformer(config['fine_tuned_model_name'])  # Adjust batch size based on your GPU memory

# Vectorize canonical dataset
fine_tuned_s_bert_canonical_vectors_path = config['fine_tuned_s_bert_canonical_vectors_path']
if not os.path.exists(fine_tuned_s_bert_canonical_vectors_path):
    canonical_vectors = vectorize_and_save(canonical_df, sbert_embedder, fine_tuned_s_bert_canonical_vectors_path, dal) # Vectorise all canonical compoistions
else:
    canonical_vectors = dal.load_vectors(fine_tuned_s_bert_canonical_vectors_path)

results_df = match_compositions(matched_subset, canonical_vectors, sbert_embedder, canonical_df)
results_df.to_csv(config['results_path'], index=False)
logger.info("Matching completed and results saved.")

## Fine-tuned SBERT Performance Analysis

### Accuracy

In [None]:
# Reload the saved matching results from disk. The file at
# config['results_path'] is overwritten by each matching cell, so this
# reflects whichever matching cell ran most recently.
results = pd.read_csv(config['results_path'])

# Tally of the Correct_Match flag across the evaluated rows.
correct_match_counts = results['Correct_Match'].value_counts()
correct_match_counts

In [None]:
# Correct matches first, and within each group highest similarity first.
# Rebinding instead of inplace=True follows pandas guidance; the result is
# the same and the cell stays idempotent on re-run.
results = results.sort_values(
    ['Correct_Match', 'Similarity_Score'],
    ascending=[False, False],
)

# Correct but non-exact matches (score below 1) whose canonical title does
# not contain 'pump'.
# na=False makes a missing CAN_Title count as "does not contain". Without it,
# str.contains on object-dtype strings yields NaN for missing titles and the
# `~` negation raises TypeError.
# NOTE(review): the reason for excluding 'pump' titles is not documented here
# — confirm it is still wanted.
title_has_pump = results['CAN_Title'].str.contains('pump', na=False)
match_mask = (
    (results['Correct_Match'] == True)  # noqa: E712 - elementwise comparison
    & (results['Similarity_Score'] < 1)
    & ~title_has_pump
)

results[match_mask]

### Similarity score distribution

In [None]:
# Similarity-score distribution for the results loaded above.
# NOTE(review): presumably renders a plot — confirm in pipeline.result_dist.
result_dist(results)

### Quick QA:

In [None]:
# Spot-check: QA rows whose matched title mentions "pull up"
# (case-insensitive via lower()).
# na=False drops rows with a missing MATCHED_Title from the mask; otherwise
# the NaN propagates into the mask and boolean indexing raises ValueError.
mask = qa['MATCHED_Title'].str.lower().str.contains('pull up', na=False)

# One row per distinct matched composition, ordered by canonical title.
qa[mask].drop_duplicates('MATCHED_Comp').sort_values('CAN_Title')