In [1]:
#pip install genderdecoder3

In [2]:
import genderdecoder3
import re
import pandas as pd
from collections import Counter
from genderdecoder3 import assess
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
real_df = pd.read_csv("labelled_cleaned.csv")

In [4]:
real_df.head()

Unnamed: 0,company_name,job_description
0,ministerie van financi n: belastingdienst,tmap next foundation ervaring met projecten in...
1,belastingdienst,"- gecertificeerd scrum master, minimaal psm ii..."
2,duo,3 jaar ervaring in het volgende eisenpakket:\n...
3,dienst uitvoering onderwijs (duo),5 jaar ervaring met de volgende eisen:\n relev...
4,asml netherlands b.v.,location: veldhoven\nnone\n company info: \n ...


In [5]:
feminine_coded_words = [
    "agree", "affectionate", "child", "cheer", "collab", "commit", "communal",
    "compassion", "connect", "considerate", "cooperat", "co-operat", "depend",
    "emotiona", "empath", "feel", "flatterable", "gentle", "honest",
    "interpersonal", "interdependen", "interpersona", "inter-personal", "inter-dependen",
    "inter-persona", "kind", "kinship", "loyal", "modesty", "nag", "nurtur",
    "pleasant", "polite", "quiet", "respon", "sensitiv", "submissive", "support",
    "sympath", "tender", "together", "trust", "understand", "warm", "whin",
    "enthusias", "inclusive", "yield", "share", "sharin"
]

masculine_coded_words = [
    "active", "adventurous", "aggress", "ambitio", "analy", "assert", "athlet",
    "autonom", "battle", "boast", "challeng", "champion", "compet", "confident",
    "courag", "decid", "decision", "decisive", "defend", "determin", "domina",
    "dominant", "driven", "fearless", "fight", "force", "greedy", "head-strong",
    "headstrong", "hierarch", "hostil", "impulsive", "independen", "individual",
    "intellect", "lead", "logic", "objective", "opinion", "outspoken", "persist",
    "principle", "reckless", "self-confiden", "self-relian", "self-sufficien",
    "selfconfiden", "selfrelian", "selfsufficien", "stubborn", "superior", "unreasonab"
]

In [6]:
from deep_translator import GoogleTranslator
import re

def translate_stems(stems):
	translator = GoogleTranslator(source="en", target="nl")
	translated = []
	for stem in stems:
		try:
			translated.append(translator.translate(stem))
		except:
			translated.append(stem)
	return translated

translated_feminine = translate_stems(feminine_coded_words)
translated_masculine = translate_stems(masculine_coded_words)

In [7]:
masculine_coded_words.extend(translated_masculine)
feminine_coded_words.extend(translated_feminine)

In [8]:
for word in feminine_coded_words:
	print(f'{word}\n')

agree

affectionate

child

cheer

collab

commit

communal

compassion

connect

considerate

cooperat

co-operat

depend

emotiona

empath

feel

flatterable

gentle

honest

interpersonal

interdependen

interpersona

inter-personal

inter-dependen

inter-persona

kind

kinship

loyal

modesty

nag

nurtur

pleasant

polite

quiet

respon

sensitiv

submissive

support

sympath

tender

together

trust

understand

warm

whin

enthusias

inclusive

yield

share

sharin

mee eens zijn

geliefd

kind

aanmoedigen

samenwerken

verbinden

gemeenschappelijk

medeleven

verbinden

attent

Cooperat

coöperat

afhankelijk zijn

Emota

empathie

gevoel

platterbaar

teder

eerlijk

interpersoonlijk

onderlinge afhankelijke

interpersona

interpersoonlijk

tussenaf

inter-Persona

vriendelijk

verwantschap

loyaal

bescheidenheid

zeuren

koesteren

prettig

beleefd

rustig

respon

gevoelig

onderdanig

steun

sympathie

teder

samen

vertrouwen

begrijpen

warm

jochie

enthousiasme

inclu

In [9]:
job_descriptions = real_df['job_description'].astype(str).tolist()
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(job_descriptions)
feature_names = vectorizer.get_feature_names_out()

def tfidf_bias_score(tfidf_vector, feature_names, masc_words, fem_words):
	tfidf_scores = dict(zip(feature_names, tfidf_vector.toarray()[0]))
	masc_score = sum(tfidf_scores.get(word, 0) for word in masc_words)
	fem_score = sum(tfidf_scores.get(word, 0) for word in fem_words)
	return masc_score - fem_score

bias_scores = []
masc_words = []
fem_words = []

def collect_partial_matches(tokens, stem_list):
	return [token for token in tokens for stem in stem_list if stem in token]

def classify_coding(masc_words, fem_words):
	if len(masc_words) > len(fem_words):
		return "masculine-coded"
	elif len(fem_words) > len(masc_words):
		return "feminine-coded"
	else:
		return "neutral"

for idx, row in real_df.iterrows():
	job_description_nl = str(row["job_description"])
	tokens = re.findall(r'\b\w+\b', job_description_nl.lower())

	fem_words = collect_partial_matches(tokens, feminine_coded_words)
	masc_words = collect_partial_matches(tokens, masculine_coded_words)
	bias_score = tfidf_bias_score(tfidf_matrix[idx], feature_names, masc_words, fem_words)

	real_df.at[idx, "feminine_word_count"] = len(fem_words)
	real_df.at[idx, "masculine_word_count"] = len(masc_words)
	real_df.at[idx, "score"] = bias_score
	real_df.at[idx, "gender_label"] = classify_coding(masc_words, fem_words)


In [10]:
real_df.head()

Unnamed: 0,company_name,job_description,feminine_word_count,masculine_word_count,score,gender_label
0,ministerie van financi n: belastingdienst,tmap next foundation ervaring met projecten in...,1.0,1.0,-0.043281,neutral
1,belastingdienst,"- gecertificeerd scrum master, minimaal psm ii...",3.0,1.0,-0.118691,feminine-coded
2,duo,3 jaar ervaring in het volgende eisenpakket:\n...,1.0,1.0,-0.068705,neutral
3,dienst uitvoering onderwijs (duo),5 jaar ervaring met de volgende eisen:\n relev...,4.0,5.0,0.067618,masculine-coded
4,asml netherlands b.v.,location: veldhoven\nnone\n company info: \n ...,0.0,0.0,0.0,neutral
