In [99]:
import genderdecoder3
import re
import pandas as pd
from collections import Counter
from genderdecoder3 import assess
from sklearn.feature_extraction.text import TfidfVectorizer
from deep_translator import GoogleTranslator
import matplotlib.pyplot as plt
import re

In [100]:
df = pd.read_csv("labelled_cleaned.csv")
df.head()

Unnamed: 0,company_name,job_description
0,ministerie van financi n: belastingdienst,tmap next foundation ervaring met projecten in...
1,belastingdienst,"- gecertificeerd scrum master, minimaal psm ii..."
2,duo,3 jaar ervaring in het volgende eisenpakket:\n...
3,dienst uitvoering onderwijs (duo),5 jaar ervaring met de volgende eisen:\n relev...
4,asml netherlands b.v.,location: veldhoven\nnone\n company info: \n ...


In [101]:
feminine_coded_stems_en = [
	"agree", "affectionate", "child", "cheer", "collab", "commit", "communal",
	"compassion", "connect", "considerate", "cooperat", "co-operat", "depend",
	"emotiona", "empath", "feel", "flatterable", "gentle", "honest",
	"interpersonal", "interdependen", "interpersona", "inter-personal", "inter-dependen",
	"inter-persona", "kind", "kinship", "loyal", "modesty", "nag", "nurtur",
	"pleasant", "polite", "quiet", "respon", "sensitiv", "submissive", "support",
	"sympath", "tender", "together", "trust", "understand", "warm", "whin",
	"enthusias", "inclusive", "yield", "share", "sharin"
]
masculine_coded_stems_en = [
	"active", "adventurous", "aggress", "ambitio", "analy", "assert", "athlet",
	"autonom", "battle", "boast", "challeng", "champion", "compet", "confident",
	"courag", "decid", "decision", "decisive", "defend", "determin", "domina",
	"dominant", "driven", "fearless", "fight", "force", "greedy", "head-strong",
	"headstrong", "hierarch", "hostil", "impulsive", "independen", "individual",
	"intellect", "lead", "logic", "objective", "opinion", "outspoken", "persist",
	"principle", "reckless", "self-confiden", "self-relian", "self-sufficien",
	"selfconfiden", "selfrelian", "selfsufficien", "stubborn", "superior", "unreasonab"
]

In [102]:
# Translate to Dutch
def translate_stems(stems, source="en", target="nl"):
	translator = GoogleTranslator(source=source, target=target)
	translations = []
	for stem in stems:
		try:
			translations.append(translator.translate(stem))
		except Exception:
			translations.append(stem)
	return translations

feminine_coded_stems_nl = translate_stems(feminine_coded_stems_en)
masculine_coded_stems_nl = translate_stems(masculine_coded_stems_en)

feminine_coded_stems = feminine_coded_stems_en + feminine_coded_stems_nl
masculine_coded_stems = masculine_coded_stems_en + masculine_coded_stems_nl

for stem in feminine_coded_stems:
    print(stem)


agree
affectionate
child
cheer
collab
commit
communal
compassion
connect
considerate
cooperat
co-operat
depend
emotiona
empath
feel
flatterable
gentle
honest
interpersonal
interdependen
interpersona
inter-personal
inter-dependen
inter-persona
kind
kinship
loyal
modesty
nag
nurtur
pleasant
polite
quiet
respon
sensitiv
submissive
support
sympath
tender
together
trust
understand
warm
whin
enthusias
inclusive
yield
share
sharin
mee eens zijn
geliefd
kind
aanmoedigen
samenwerken
verbinden
gemeenschappelijk
medeleven
verbinden
attent
Cooperat
coöperat
afhankelijk zijn
Emota
empathie
gevoel
platterbaar
teder
eerlijk
interpersoonlijk
onderlinge afhankelijke
interpersona
interpersoonlijk
tussenaf
inter-Persona
vriendelijk
verwantschap
loyaal
bescheidenheid
zeuren
koesteren
prettig
beleefd
rustig
respon
gevoelig
onderdanig
steun
sympathie
teder
samen
vertrouwen
begrijpen
warm
jochie
enthousiasme
inclusief
opbrengst
deel
Sharin


In [103]:
def count_partial_matches(tokens, stem_list):
	return sum(any(stem in token for stem in stem_list) for token in tokens)

In [104]:
def analyze_bias(df, text_column="job_description"):
	fem_counts, masc_counts, bias_labels, bias_scores = [], [], [], []
	for text in df[text_column].astype(str):
		tokens = re.findall(r'\b\w+\b', text.lower())
		fem_count = count_partial_matches(tokens, feminine_coded_stems)
		masc_count = count_partial_matches(tokens, masculine_coded_stems)
		bias_score = masc_count - fem_count

		if bias_score > 0:
			label = "Masculine biased"
		elif bias_score < 0:
			label = "Feminine biased"
		else:
			label = "Neutral"

		fem_counts.append(fem_count)
		masc_counts.append(masc_count)
		bias_scores.append(bias_score)
		bias_labels.append(label)

	df["feminine_word_count"] = fem_counts
	df["masculine_word_count"] = masc_counts
	df["bias_score"] = bias_scores
	df["bias_label"] = bias_labels
	return df

df = analyze_bias(df)

In [105]:
# TF-IDF bias score calculation
def tfidf_bias_score_sparse(tfidf_vector, feature_names, masc_stems, fem_stems):
	indices = tfidf_vector.nonzero()[1]
	masc_score, fem_score = 0, 0
	for idx in indices:
		word = feature_names[idx]
		score = tfidf_vector[0, idx]
		if any(stem in word for stem in masc_stems):
			masc_score += score
		if any(stem in word for stem in fem_stems):
			fem_score += score
	return masc_score - fem_score, masc_score, fem_score

In [106]:
# Calculate TF-IDF scores
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['job_description'].astype(str))
feature_names = vectorizer.get_feature_names_out()

scores = [tfidf_bias_score_sparse(tfidf_matrix[i], feature_names,
								masculine_coded_stems, feminine_coded_stems)
		for i in range(tfidf_matrix.shape[0])]

df[['tfidf_bias_score', 'tfidf_masculine_score', 'tfidf_feminine_score']] = scores

In [None]:
# Decode per job description (genderdecoder3)
results = [assess(text) for text in df['job_description'].astype(str)]
df['decoder_bias_label'] = [res['result'] for res in results]

In [113]:
print(results[2])

{'result': 'strongly masculine-coded', 'explanation': 'This job ad uses more words that are stereotypically masculine than words that are stereotypically feminine. It risks putting women off applying, but will probably encourage men to apply.', 'masculine_coded_words': ['competenties'], 'feminine_coded_words': []}


In [108]:
df.head()

Unnamed: 0,company_name,job_description,feminine_word_count,masculine_word_count,bias_score,bias_label,tfidf_bias_score,tfidf_masculine_score,tfidf_feminine_score,decoder_bias_label
0,ministerie van financi n: belastingdienst,tmap next foundation ervaring met projecten in...,1,1,0,Neutral,-0.043281,0.045057,0.088338,neutral
1,belastingdienst,"- gecertificeerd scrum master, minimaal psm ii...",3,1,-2,Feminine biased,-0.118691,0.043158,0.161849,neutral
2,duo,3 jaar ervaring in het volgende eisenpakket:\n...,1,1,0,Neutral,-0.068705,0.054869,0.123574,strongly masculine-coded
3,dienst uitvoering onderwijs (duo),5 jaar ervaring met de volgende eisen:\n relev...,3,4,1,Masculine biased,0.019232,0.108772,0.08954,strongly masculine-coded
4,asml netherlands b.v.,location: veldhoven\nnone\n company info: \n ...,0,0,0,Neutral,0.0,0.0,0.0,neutral


In [114]:
print(results[0])

{'result': 'neutral', 'explanation': "This job ad doesn't use any words that are stereotypically masculine and stereotypically feminine. It probably won't be off-putting to men or women applicants.", 'masculine_coded_words': [], 'feminine_coded_words': []}
