-
Notifications
You must be signed in to change notification settings - Fork 3
/
gender_associations_hypotest.py
executable file
·161 lines (136 loc) · 7.8 KB
/
gender_associations_hypotest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python
"""Conducts hypothesis tests based on cluster-based gender associations (from gender_associations.py)
For each cluster where there are n_f female-associated words and n_m male-associated words,
take n = min(n_f, n_m) most female- and male-associated words (or a maximum of nwords specified by user) and those become the X and Y target word lists.
The n most female- and male-associated words are those that score highest in the Caliskan et al (2017) association score (means difference)
on female and male attribute lists, respectively.
A WEAT hypothesis test is conducted based on the attribute words and the p-value is reported for the cluster.
Produces a file with a p-value for each cluster. It's a tab-separated file with the following contents:
CLUSTER MAJ_ASSOC CLUSTER_DESC M_WORDS F_WORDS P-VALUE COHENS_D
where:
CLUSTER is cluster number.
MAJ_ASSOC majority associated gender in cluster (as per gender_associations.py) -- ie the gender with most associated words in gender_associations.py
CLUSTER_DESC is a 'description' of the cluster based on the 10 words nearest to the cluster centroid.
M_WORDS the n male-associated target words selected for the test
F_WORDS the n female-associated target words selected for the test
P-VALUE is the hypothesis-test p-value (probability of observing a difference at least as large under Ho -- the null hypothesis that there is no difference between M_WORDS and F_WORDS in their bias towards male or female terms)
COHENS_D is the Cohen's d statistic, i.e. the effect size (Caliskan et al: difference between two means divided by the standard deviation. "Conventional small, medium and large values of d are 0.2, 0.5 and 0.8, respectively".)
"""
import argparse
import gensim
import numpy as np
import gender_associations as ga
import random
import math
def parse_args():
    """Define and parse this script's command-line options."""
    argp = argparse.ArgumentParser(description=__doc__)
    add = argp.add_argument
    add('-v', '--vectors', type=str, help="File containing Word Embeddings")
    add('-t', '--type', choices=['w2v', 'ft'], default='w2v',
        help='Type of Word Embedding: w2v: word2vec, ft: fasttext')
    add('-g', '--gassocs', type=str, help="Gender associations per cluster (generated by gender_associations.py)")
    add('-f', '--fattr', type=str, help="Female Attribute words list, one word per line")
    add('-m', '--mattr', type=str, help="Male Attribute words list, one word per line")
    add('-i', '--iter', type=int, default=1000, help="Number of iterations on each randomisation test")
    add('-n', '--nwords', type=int, default=-1, help="Maximum number of target words to use per list (male/female) on each randomisation test. If -1 (default), n = min(n_f, n_m) in cluster are used")
    add('-p', '--pretest', action='store_true',
        help="Pre-hypothesis test? If set, hypothesis test is NOT conducted -- only word selection is done.")
    add('-o', '--out', type=str, help="Output to save results of tests (one line per test)")
    return argp.parse_args()
def main(args):
    """Run a WEAT hypothesis test per cluster and write one result row per cluster.

    Reads the tab-separated gender-association file (per row: cluster id,
    cluster description, word, <unused>, gender score), groups rows by
    cluster, selects the top-n male/female target words for each cluster and
    writes CLUSTER/MAJ_ASSOC/DESC/M_WORDS/F_WORDS/P-VALUE/COHENS_D to args.out.

    Fixes vs. the original:
    - each row's word was appended to the word lists BEFORE the
      cluster-boundary check, so the first word of every new cluster was
      wrongly counted into the previous cluster;
    - the flushed output line used the NEW cluster's id and description
      instead of those of the cluster just completed.
    """
    if args.type == 'w2v':
        wv = gensim.models.KeyedVectors.load_word2vec_format(args.vectors, binary=True, unicode_errors='ignore')
    elif args.type == 'ft':
        wv = gensim.models.fasttext.load_facebook_vectors(args.vectors)
    else:
        raise ValueError("Unsupported Word Embedding type '{}'".format(args.type))
    female_attrs = ga.filter_words(wv, ga.load_attrs(args.fattr))
    male_attrs = ga.filter_words(wv, ga.load_attrs(args.mattr))

    def flush(fout, cluster, cluster_desc, m_words, f_words, m_scores, f_scores, genders):
        # Select the target words, run the test (unless --pretest) and emit one row.
        m_sel, f_sel = choose_words(m_words, f_words, m_scores, f_scores, args.nwords)
        p_value = weat_rand_test(wv, m_sel, f_sel, male_attrs, female_attrs, args.iter) if not args.pretest else "?"
        cohens_d = get_cohens_d(wv, m_sel, f_sel, male_attrs, female_attrs)
        print_line(fout, cluster, maj_gender(genders), cluster_desc, m_sel, f_sel, p_value, cohens_d)

    last_cluster = None
    last_desc = None
    f_words, f_scores = [], []
    m_words, m_scores = [], []
    genders = {'F': 0, 'M': 0}
    with open(args.gassocs, 'r') as fga, open(args.out, 'w') as fout:
        for lgassoc in fga:
            fields = lgassoc.strip().split('\t')
            cluster = int(fields[0])
            cluster_desc = fields[1]
            word = fields[2]
            gender_score = float(fields[4])
            # Flush the completed cluster BEFORE recording this row, so the
            # first word of a new cluster never leaks into the old one.
            if last_cluster is not None and cluster != last_cluster:
                flush(fout, last_cluster, last_desc, m_words, f_words, m_scores, f_scores, genders)
                f_words, f_scores = [], []
                m_words, m_scores = [], []
                genders = {'F': 0, 'M': 0}
            if gender_score < 0:
                f_words.append(word)
                f_scores.append(-gender_score)  # store magnitude as a positive value
                genders['F'] += 1
            else:
                m_words.append(word)
                m_scores.append(gender_score)
                genders['M'] += 1
            last_cluster = cluster
            last_desc = cluster_desc
        # Flush the final cluster (guard against an empty input file).
        if last_cluster is not None:
            flush(fout, last_cluster, last_desc, m_words, f_words, m_scores, f_scores, genders)
    print("Done!")
def maj_gender(genders):
    """Return 'F' or 'M' for whichever gender has more associated words, '=' on a tie."""
    f_count = genders['F']
    m_count = genders['M']
    if f_count > m_count:
        return 'F'
    if f_count == m_count:
        return '='
    return 'M'
def print_line(f, cluster, majority_gender, cluster_desc, m_words, f_words, p_value, cohens_d):
    """Write one tab-separated result row for a cluster to the open file object *f*."""
    row = (cluster, majority_gender, cluster_desc,
           ",".join(m_words), ",".join(f_words), p_value, cohens_d)
    f.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(*row))
def choose_words(m_words, f_words, m_scores, f_scores, nwords=-1):
    """Pick the n highest-scoring words from each list, where n is the smaller
    list length, optionally capped by nwords when nwords >= 0."""
    limits = [len(m_words), len(f_words)]
    if nwords >= 0:
        limits.append(nwords)
    n = np.min(limits)
    # argsort ascending, reverse for descending, keep the top n indices
    top_m = np.array(m_scores).argsort()[::-1][:n]
    top_f = np.array(f_scores).argsort()[::-1][:n]
    return [m_words[i] for i in top_m], [f_words[i] for i in top_f]
def weat_rand_test(wv, m_words, f_words, m_attrs, f_attrs, iterations):
    """One-sided WEAT randomisation-test p-value.

    Samples up to `iterations` random reassignments of the pooled target
    words into two groups (sized like the original lists) and returns the
    fraction of distinct sampled reassignments whose test statistic exceeds
    the observed one.

    Fixes vs. the original:
    - the second group was sliced at len(f_words) instead of len(m_words),
      which was wrong whenever the two lists differ in length;
    - duplicate sampled permutations were skipped in the numerator but still
      counted in the denominator, biasing the p-value downward; the
      denominator is now the number of distinct permutations evaluated.
    """
    u_words = m_words + f_words
    runs = np.min((iterations, math.factorial(len(u_words))))
    seen = set()
    original = test_statistic(wv, m_words, f_words, m_attrs, f_attrs)
    r = 0
    for _ in range(runs):
        permutation = tuple(random.sample(u_words, len(u_words)))
        if permutation not in seen:
            m_hat = permutation[:len(m_words)]
            f_hat = permutation[len(m_words):]  # was len(f_words): wrong split point
            if test_statistic(wv, m_hat, f_hat, m_attrs, f_attrs) > original:
                r += 1
            seen.add(permutation)
    # Guard against iterations == 0 (no permutations evaluated).
    return r / len(seen) if seen else 1.0
def get_cohens_d(wv, m_targets, f_targets, m_attrs, f_attrs):
    """Cohen's d effect size: difference of the two group means divided by the
    sample standard deviation (ddof=1) of all target association scores.
    Returns "NA" when either target list is empty."""
    if not m_targets or not f_targets:
        return "NA"
    m_sum, f_sum = test_sums(wv, m_targets, f_targets, m_attrs, f_attrs)
    means_gap = m_sum / len(m_targets) - f_sum / len(f_targets)
    all_scores = np.array([ga.cosine_means_difference(wv, w, m_attrs, f_attrs)
                           for w in m_targets + f_targets])
    return means_gap / all_scores.std(ddof=1)
def test_statistic(wv, m_targets, f_targets, m_attrs, f_attrs):
    """WEAT test statistic: male-target score sum minus female-target score sum."""
    sums = test_sums(wv, m_targets, f_targets, m_attrs, f_attrs)
    return sums[0] - sums[1]
def test_sums(wv, m_targets, f_targets, m_attrs, f_attrs):
    """Return the summed association score (cosine means difference against the
    attribute lists) of the male targets and the female targets, as a pair."""
    def score(t):
        return ga.cosine_means_difference(wv, t, m_attrs, f_attrs)
    return sum(map(score, m_targets), 0.0), sum(map(score, f_targets), 0.0)
if __name__ == "__main__":
    cli_args = parse_args()
    main(cli_args)