# Step-1: Import all the dependencies

In [None]:
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!pip install -U sentence-transformers #install sentence transformer models
!pip install torch==1.11.0+cu113 torchvision -f https://download.pytorch.org/whl/torch_stable.html #to install pytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Enabling notebook extension jupyter-js-widgets/extension...
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json
      - Validating: [32mOK[0m
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.1 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 45.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 56.6 MB/s 
[?25hCollecting huggingface-hub>=0.

In [None]:
from torch.utils.data import DataLoader
import torch
import math
import pandas as pd
import numpy as np
import sentence_transformers
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime as dt
import sys
import os
import csv
import gzip
from google.colab import drive 
import sklearn
# import torch.nn as nn
# import torch.nn.functional as F
# import torchvision
# import torchvision.transforms as transforms

In [None]:
# !pip list -v | grep torch #check if torch is installed
# !pip list -v | grep sentence-transformer

In [None]:
# #print pytorch version
# import torch
# print(torch.__version__)

In [None]:
# Report the CUDA setup. The device name is only queried when a GPU is present,
# because torch.cuda.get_device_name(0) raises on a CPU-only runtime.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # Check if a GPU is available
print(torch.cuda.device_count())  # Number of CUDA-capable GPUs attached to the machine
if cuda_available:
    print(torch.cuda.get_device_name(0))  # Name of the first GPU
else:
    print("No CUDA device found; running on CPU")

True
1
Tesla P100-PCIE-16GB


In [None]:
# Emit INFO-level (and above) log records, timestamped, through the
# sentence-transformers LoggingHandler so debug information is printed.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(message)s',
)

# Step-2: Import the dataset

In [None]:
# Mount Google Drive so the workbook path below exists on a fresh runtime.
# The first call asks for authorization; without force_remount a repeated call
# on an already-mounted runtime should simply report that Drive is mounted.
drive.mount('/content/gdrive')

# Read the labelled sentence pairs from the Excel workbook stored on Drive
df = pd.read_excel('/content/gdrive/My Drive/Colab Notebooks/Seniority_classification/GT2 seniority sentences_Nurse_v2_2022-07-26.xlsx', sheet_name='Sen_pair_master2')
print(df.shape)
df.head()

(282, 12)


Unnamed: 0,Pair_ID,ClassSpecID1,ClassSpecID2,Class_Title1,Class_Title2,EmployerID1,EmployerID2,Same_emp_ind,BLS_Detail,Sen_text1,Sen_text2,human seniority similarity score
0,3308-4916,3308,4916,Nurse Practitioner,Family Nurse Practitioner,168,141,0,Nurse Practitioners,A valid Registered Nurse certificate issued by...,"As directed, evaluates nursing services, recom...",-1
1,3308-63345,3308,63345,Nurse Practitioner,Nurse Practitioner I,168,1257,0,Nurse Practitioners,A valid Registered Nurse certificate issued by...,Graduation from an accredited school of nursin...,-1
2,3308-76219,3308,76219,Nurse Practitioner,Nurse Practitioner I,168,421,0,Nurse Practitioners,A valid Registered Nurse certificate issued by...,Candidates will be required to obtain a Drug D...,-1
3,3308-76227,3308,76227,Nurse Practitioner,Nurse Practitioner II,168,421,0,Nurse Practitioners,A valid Registered Nurse certificate issued by...,Candidates will be required to obtain a Drug D...,-1
4,3308-716754,3308,716754,Nurse Practitioner,Nurse Practitioner II,168,1597,0,Nurse Practitioners,A valid Registered Nurse certificate issued by...,An employee in this class is responsible for n...,-1


In [None]:
# Display the column labels of the loaded sheet.
df.columns

Index(['Pair_ID', 'ClassSpecID1', 'ClassSpecID2', 'Class_Title1',
       'Class_Title2', 'EmployerID1', 'EmployerID2', 'Same_emp_ind',
       'BLS_Detail', 'Sen_text1', 'Sen_text2',
       'human seniority similarity score'],
      dtype='object')

In [None]:
#Check if dataset exist. If not, download and extract it
# sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

# if not os.path.exists(sts_dataset_path):
#     util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

# Step-3: Specify the pretrained sentence transformer model name
The list of available pretrained models is here: https://huggingface.co/sentence-transformers

In [None]:
# Any Hugging Face / sentence-transformers checkpoint name can go here,
# e.g. bert-base-uncased, roberta-base, xlm-roberta-base.
model_name = 'nli-distilroberta-base-v2'
train_batch_size, num_epochs = 16, 10

# Each run is saved under its own timestamped directory.
run_timestamp = dt.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = f'output/training_stsbenchmark_{model_name}-{run_timestamp}'

# Step-4: Specify the word embedding and pooling models

In [None]:
# Load the checkpoint directly as a sentence-embedding model.
# SentenceTransformer(model_name) already yields a complete pipeline (token
# embeddings followed by pooling into one fixed-size sentence vector), so it is
# used as-is. The previous code wrapped this full model inside a second Pooling
# layer and a second SentenceTransformer, which nested one model in another
# and pooled the token embeddings twice.
# NOTE(review): for a plain Hugging Face checkpoint (e.g. bert-base-uncased) the
# library is expected to add mean pooling itself; confirm for the chosen model.
model = SentenceTransformer(model_name)

Downloading:   0%|          | 0.00/736 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/679 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

# Step-5: Convert the dataset to a DataLoader ready for training

In [None]:
logging.info("Read STSbenchmark train dataset")

# Give the text and label columns the generic names used in the rest of the notebook
df = df.rename(columns={'Sen_text1':'sentence1', 'Sen_text2':'sentence2','human seniority similarity score':'score'})
df['score'] = df['score'].astype(float)

from sklearn.model_selection import train_test_split

# 60/20/20 train/dev/test split: hold out 20% for test, then 25% of the
# remaining 80% for dev. A fixed random_state keeps the split reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=1)
train, dev = train_test_split(train, test_size=0.25, random_state=1)
assert len(train) + len(dev) + len(test) == len(df), "split sizes do not add up"

# Tag every row with its split name. assign() returns a new frame, so nothing
# is written into a slice of `df` (the earlier `.loc[:, 'split'] = ...`
# assignments can raise SettingWithCopyWarning).
train = train.assign(split='train')
dev = dev.assign(split='dev')
test = test.assign(split='test')

# Single frame with all rows (train, then dev, then test) for later scoring
df1 = pd.concat([train, dev, test], axis=0)


def _to_input_examples(frame):
    """Turn each row of `frame` into an InputExample (guid, sentence pair, score label)."""
    return [
        InputExample(guid=pair_id, texts=[first_text, second_text], label=score)
        for pair_id, first_text, second_text, score in zip(
            frame['Pair_ID'], frame['sentence1'], frame['sentence2'], frame['score'])
    ]


train_samples = _to_input_examples(train)
dev_samples = _to_input_examples(dev)
test_samples = _to_input_examples(test)

# Shuffled mini-batches of training pairs, plus the loss used for fine-tuning
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

In [None]:
# Show the attribute dictionaries of the first training and first dev example.
for first_sample in (train_samples[0], dev_samples[0]):
    print(vars(first_sample))

{'guid': '885-76219', 'texts': ["A Bachelor's degree from a recognized college in Nursing, Public Health, Health Sciences or a closely related field or a Masters Degree from a recognized college in Nursing, Public Health, Health Sciences or closely related field or a Current certification as a Nurse Practitioner in a field of specialty by a recognized National Board is required. Certification has to be maintained to continue in this level for those who do not have either a Bachelors or Masters degree. Current certification as a Nurse Practitioner in a field of specialty by a recognized National Board is required in this level for those who do not have either a Bachelors or Masters degree in appropriate field as described above under Education. Incumbents of positions in this series are registered nurses with additional preparation and skills in physical diagnosis. Must possess and maintain a current valid license to practice as a Registered Nurse in the State of California. Possession 

In [None]:
# Sanity-check the training split: schema, summary statistics and null counts.
# info() prints its report itself and returns None, so it is not wrapped in
# print() (that produced a stray "None" line in the output).
train.info()
print(train.describe())
train.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168 entries, 72 to 93
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Pair_ID       168 non-null    object 
 1   ClassSpecID1  168 non-null    int64  
 2   ClassSpecID2  168 non-null    int64  
 3   Class_Title1  168 non-null    object 
 4   Class_Title2  168 non-null    object 
 5   EmployerID1   168 non-null    int64  
 6   EmployerID2   168 non-null    int64  
 7   Same_emp_ind  168 non-null    int64  
 8   BLS_Detail    168 non-null    object 
 9   sentence1     168 non-null    object 
 10  sentence2     168 non-null    object 
 11  score         168 non-null    float64
 12  split         168 non-null    object 
dtypes: float64(1), int64(5), object(7)
memory usage: 18.4+ KB
None
       ClassSpecID1  ClassSpecID2  EmployerID1  EmployerID2  Same_emp_ind  \
count  1.680000e+02  1.680000e+02   168.000000   168.000000    168.000000   
mean   4.677776e+05  5.33775

Pair_ID         0
ClassSpecID1    0
ClassSpecID2    0
Class_Title1    0
Class_Title2    0
EmployerID1     0
EmployerID2     0
Same_emp_ind    0
BLS_Detail      0
sentence1       0
sentence2       0
score           0
split           0
dtype: int64

# Step-6: Compute the similarity between two job descriptions before finetuning SBERT:

In [None]:
def cosine_similarity_SBERT(row, encoder=None):
  """Return the cosine similarity of a row's two sentences, rounded to 2 decimals.

  Args:
    row: Object exposing `sentence1` and `sentence2` attributes, e.g. a
      DataFrame row supplied by `df.apply(..., axis=1)`.
    encoder: Optional model exposing `encode(list_of_str)`. When omitted, the
      notebook-level `model` as bound at call time is used (the original
      behavior). Passing it explicitly makes the result independent of which
      cells have rebound `model`.

  Returns:
    float: Cosine similarity between the two sentence embeddings.
  """
  if encoder is None:
    encoder = model
  embedding1 = encoder.encode([row.sentence1])
  embedding2 = encoder.encode([row.sentence2])

  # One sentence on each side, so the single entry [0][0] is the pair's score
  return round(float(util.cos_sim(embedding1, embedding2)[0][0]), 2)

In [None]:
# Score every sentence pair row by row and store the result next to the labels.
baseline_similarity = df1.apply(cosine_similarity_SBERT, axis=1)
df1['SBert_sim'] = baseline_similarity
df1.head()

Unnamed: 0,Pair_ID,ClassSpecID1,ClassSpecID2,Class_Title1,Class_Title2,EmployerID1,EmployerID2,Same_emp_ind,BLS_Detail,sentence1,sentence2,score,split,SBert_sim
72,885-76219,885,76219,NURSE PRACTITIONER III,Nurse Practitioner I,9,421,0,Nurse Practitioners,A Bachelor's degree from a recognized college ...,Candidates will be required to obtain a Drug D...,-1.0,train,0.68
180,716754-772074,716754,772074,Nurse Practitioner II,Psychiatric Nurse Practitioner,1597,1470,0,Nurse Practitioners,An employee in this class is responsible for n...,Completion of an accredited program required t...,-1.0,train,0.67
6,3308-1149611,3308,1149611,Nurse Practitioner,Supervising Nurse Practitioner,168,385,0,Nurse Practitioners,A valid Registered Nurse certificate issued by...,A typical way to qualify is three years of pra...,-1.0,train,0.62
219,915521-885,915521,885,Nurse Practitioner - OB,NURSE PRACTITIONER III,168,9,0,Nurse Practitioners,A valid Registered Nurse license issued by the...,A Bachelor's degree from a recognized college ...,-1.0,train,0.79
37,76227-4916,76227,4916,Nurse Practitioner II,Family Nurse Practitioner,421,141,0,Nurse Practitioners,Candidates will be required to obtain a Drug D...,"As directed, evaluates nursing services, recom...",-1.0,train,0.58


# Step-7: Finetune the model

In [None]:
# Dev-set evaluator: measures correlation between cosine score and gold labels
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Warm-up length: 10% of all training batches (batches per epoch x epochs)
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

# Fine-tune the model; the dev evaluator is handed to fit() and the output is
# written under model_save_path.
fit_options = dict(
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)
model.fit(train_objectives=[(train_dataloader, train_loss)], **fit_options)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11 [00:00<?, ?it/s]

Iteration:   0%|          | 0/11 [00:00<?, ?it/s]

In [None]:
# Exploratory: print the signature and docstring of the evaluator factory method.
help(EmbeddingSimilarityEvaluator.from_input_examples)

Help on method from_input_examples in module sentence_transformers.evaluation.EmbeddingSimilarityEvaluator:

from_input_examples(examples: List[sentence_transformers.readers.InputExample.InputExample], **kwargs) method of builtins.type instance



# Step-8: Load the saved fine-tuned model and evaluate its performance on the held-out test split

In [None]:
# Reload the model from the directory passed to fit() as output_path.
# This rebinds the notebook-level `model`, so every helper that reads `model`
# at call time uses the reloaded model from here on.
model = SentenceTransformer(model_save_path)
# Other constructor arguments noted during exploration: modules, device, cache_folder, use_auth_token.

In [None]:
# Display the directory the model was saved to and reloaded from.
model_save_path

'output/training_stsbenchmark_nli-distilroberta-base-v2-2022-07-28_18-14-13'

In [None]:
# Build an evaluator from the test pairs and run it on the current `model`;
# the value returned by the call is shown as the cell output.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

0.5440981125653807

In [None]:
# Build an evaluator over the training pairs. It is only constructed here;
# the call that would actually run it is left commented out below.
train_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(train_samples, name='sts-train')
# train_evaluator(model, output_path=model_save_path)
# print(type(train_evaluator))
# print(model.evaluate(train_evaluator))

In [None]:
# Exploratory: print the constructor signature and parameter documentation of the evaluator.
help(EmbeddingSimilarityEvaluator.__init__)
# print(EmbeddingSimilarityEvaluator.__dict__)

Help on function __init__ in module sentence_transformers.evaluation.EmbeddingSimilarityEvaluator:

__init__(self, sentences1: List[str], sentences2: List[str], scores: List[float], batch_size: int = 16, main_similarity: sentence_transformers.evaluation.SimilarityFunction.SimilarityFunction = None, name: str = '', show_progress_bar: bool = False, write_csv: bool = True)
    Constructs an evaluator based for the dataset
    
    The labels need to indicate the similarity between the sentences.
    
    :param sentences1:  List with the first sentence in a pair
    :param sentences2: List with the second sentence in a pair
    :param scores: Similarity score between sentences1[i] and sentences2[i]
    :param write_csv: Write results to a CSV file



In [None]:
# help(model)
# help(EmbeddingSimilarityEvaluator)|

In [None]:
# NOTE(review): this builds another evaluator from train_samples and the cell
# only displays that object; it does not score the model. If the intent was to
# evaluate on the training split, the call would presumably be
# train_evaluator(model, output_path=model_save_path) - confirm.
train_evaluator.from_input_examples(train_samples)

<sentence_transformers.evaluation.EmbeddingSimilarityEvaluator.EmbeddingSimilarityEvaluator at 0x7f041427bd50>

# Step-9: Compute the similarity between two job descriptions based on finetuned SBERT:

In [None]:
def cosine_similarity_finetuned_SBERT(row):
  """Cosine similarity of `row.sentence1` and `row.sentence2`, rounded to 2 decimals.

  Both sentences are encoded with the notebook-level `model` as it is bound
  when this function is called.
  """
  first_vec, second_vec = (
      model.encode([text]) for text in (row.sentence1, row.sentence2)
  )

  # Each side holds one sentence, so entry [0][0] is the pair's similarity
  pair_score = util.cos_sim(first_vec, second_vec)[0][0]
  return round(float(pair_score), 2)

In [None]:
# Score every pair with the current model, store the scores, and export the
# full table (labels plus both similarity columns) to CSV.
finetuned_similarity = df1.apply(cosine_similarity_finetuned_SBERT, axis=1)
df1['SBert_ft_sim'] = finetuned_similarity
df1.to_csv('final_GT.csv', index=False)
df1.head()

Unnamed: 0,Pair_ID,ClassSpecID1,ClassSpecID2,Class_Title1,Class_Title2,EmployerID1,EmployerID2,Same_emp_ind,BLS_Detail,sentence1,sentence2,score,split,SBert_sim,SBert_ft_sim
72,885-76219,885,76219,NURSE PRACTITIONER III,Nurse Practitioner I,9,421,0,Nurse Practitioners,A Bachelor's degree from a recognized college ...,Candidates will be required to obtain a Drug D...,-1.0,train,0.68,-0.25
180,716754-772074,716754,772074,Nurse Practitioner II,Psychiatric Nurse Practitioner,1597,1470,0,Nurse Practitioners,An employee in this class is responsible for n...,Completion of an accredited program required t...,-1.0,train,0.67,-0.28
6,3308-1149611,3308,1149611,Nurse Practitioner,Supervising Nurse Practitioner,168,385,0,Nurse Practitioners,A valid Registered Nurse certificate issued by...,A typical way to qualify is three years of pra...,-1.0,train,0.62,-0.25
219,915521-885,915521,885,Nurse Practitioner - OB,NURSE PRACTITIONER III,168,9,0,Nurse Practitioners,A valid Registered Nurse license issued by the...,A Bachelor's degree from a recognized college ...,-1.0,train,0.79,-0.31
37,76227-4916,76227,4916,Nurse Practitioner II,Family Nurse Practitioner,421,141,0,Nurse Practitioners,Candidates will be required to obtain a Drug D...,"As directed, evaluates nursing services, recom...",-1.0,train,0.58,-0.1


In [None]:
# `import sklearn` alone does not load the metrics submodule; import it
# explicitly so this cell does not rely on another import having loaded it.
import sklearn.metrics

# NOTE(review): log_loss is a classification metric that expects y_pred to be
# probabilities, while SBert_ft_sim holds cosine similarities (negative values
# appear in the table above) and the score is computed over all splits,
# training rows included - confirm this is the intended metric.
log_loss = sklearn.metrics.log_loss(y_true = df1['score'], y_pred = df1['SBert_ft_sim'])
print(log_loss)

0.7218464098075241
