## Bias detection system using contextual embedding + similarity

In [1]:
pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [10]:
# Example bias terms and job description, should be replaced
bias_terms = ["dominant", "aggressive", "rockstar", "ninja", "youthful", "digital native", "natural leader"]
job_descriptions = [
    "We're looking for a digital native to join our fast-paced environment.",
    "The ideal candidate will be a strong leader with excellent communication skills.", 
    "We want a youthful person",
    "A person who is kind and easy going and a female",
    "A person who is aggressive in their leadership style",
    "We are looking for a simple developer who can work independently.",
]

In [4]:
# Importing pre-trained model to turn the sentences into numeric vectors (embeddings), 
# in order to capture the semantic meaning of the sentences
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Import model that splits paragraphs into sentences
import nltk
#nltk.download('punkt')
#nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

In [26]:
# Embed bias words/phrases
bias_embeddings = model.encode(bias_terms, convert_to_tensor=True)

# Loop through job descriptions and embed all sentences
all_sentences = []
for jd in job_descriptions:
    all_sentences.extend(sent_tokenize(jd))
print(all_sentences)

sentence_embeddings = model.encode(all_sentences, convert_to_tensor=True)

["We're looking for a digital native to join our fast-paced environment.", 'The ideal candidate will be a strong leader with excellent communication skills.']


In [27]:
# Compute cosine similarity in a matrix form with shape (n_sentences, n_bias_terms)
# Each value is the cosine similarity between a sentence and a bias term.
# Closer to 1 means more similar; closer to 0 means not similar.
cosine_scores = util.cos_sim(sentence_embeddings, bias_embeddings)

In [30]:
import torch

# Set a similarity threshold (tune this based on validation or manual inspection)
threshold = 0.4

# Find matching sentences
for i, sentence in enumerate(all_sentences):
    max_score = torch.max(cosine_scores[i])
    if max_score > threshold:
        # Find which bias term(s) matched
        matched_indices = (cosine_scores[i] > threshold).nonzero(as_tuple=True)[0]
        matched_terms = [bias_terms[j] for j in matched_indices]
        print(f"\n Potential bias in: \"{sentence}\"")
        print(f"→ Matched terms: {matched_terms} (score: {max_score:.2f})")


 Potential bias in: "We're looking for a digital native to join our fast-paced environment."
→ Matched terms: ['digital native'] (score: 0.68)

 Potential bias in: "The ideal candidate will be a strong leader with excellent communication skills."
→ Matched terms: ['natural leader'] (score: 0.48)


### How to extend/improve model:
* Expand or paraphrase bias terms list
* Use sentence embeddings of whole topics (create representative phrases or short sentences for each bias cluster)
* Lower similarity threshold (increases risk of false positives)

## Sentence-level Bias Classifier
Now that we have found the sentences in job descriptions that are labeled as biased based on the bias-terms generated from the topic modelling, it is now relevant to build a classifier that can recognize biased language use in job descriptions, even if specific bias-terms or synonyms of these do not appear in the text directly. 

### Note:
We need a large dataset in order to use the classifier. 

In [11]:
import pandas as pd
# Step 1: Sentence tokenize
sentences = []
labels = []

for jd in job_descriptions:
    for sentence in sent_tokenize(jd):
        sentences.append(sentence)
        # Weak labeling: check if any bias term is in the sentence
        if any(bias_word in sentence.lower() for bias_word in bias_terms):
            labels.append(1)
        else:
            labels.append(0)

df = pd.DataFrame({'sentence': sentences, 'label': labels})
print(df)

                                            sentence  label
0  We're looking for a digital native to join our...      1
1  The ideal candidate will be a strong leader wi...      0
2                          We want a youthful person      1
3   A person who is kind and easy going and a female      0
4  A person who is aggressive in their leadership...      1
5  We are looking for a simple developer who can ...      0


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
X = vectorizer.fit_transform(df['sentence'])
y = df['label']

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
#Cross-validation of the classifier
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X, y, cv=5, scoring='f1')
print("F1 scores:", scores)
print("Mean F1:", scores.mean())