-
Notifications
You must be signed in to change notification settings - Fork 3
/
gender_associations_hypotest.py
executable file
·161 lines (136 loc) · 7.8 KB
/
gender_associations_hypotest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python
"""Conducts hypothesis tests based on cluster-based gender associations (from gender_associations.py)
For each cluster where there are n_f female-associated words and n_m male-associated words,
take n = min(n_f, n_m) most female- and male-associated words (or a maximum of nwords specified by user) and those become the X and Y target word lists.
The n most female- and male-associated words are those that score highest in the Caliskan et al (2017) association score (means difference)
on female and male attribute lists, respectively.
A WEAT hypothesis test is conducted based on the attribute words and the p-value is reported for the cluster.
Produces a file with a p-value for each cluster. It's a tab-separated file with the following contents:
CLUSTER MAJ_ASSOC CLUSTER_DESC M_WORDS F_WORDS P-VALUE COHENS_D
where:
CLUSTER is cluster number.
MAJ_ASSOC majority associated gender in cluster (as per gender_associations.py) -- ie the gender with most associated words in gender_associations.py
CLUSTER_DESC is a 'description' of the cluster based on the 10 words nearest to the cluster centroid.
M_WORDS the n male-associated target words selected for the test
F_WORDS the n female-associated target words selected for the test
P-VALUE is the hypothesis-test p-value (probability of observing a difference at least as large under Ho -- the null hypothesis that there is no difference between M_WORDS and F_WORDS in their bias towards male or female terms)
COHENS_D is the Cohen's d statistic, i.e. the effect size (Caliskan et al: difference between two means divided by the standard deviation. "Conventional small, medium and large values of d are 0.2, 0.5 and 0.8, respectively".)
"""
import argparse
import gensim
import numpy as np
import gender_associations as ga
import random
import math
def parse_args():
    """Define and parse this script's command-line options."""
    argp = argparse.ArgumentParser(description=__doc__)
    add = argp.add_argument
    add('-v', '--vectors', type=str, help="File containing Word Embeddings")
    add('-t', '--type', choices=['w2v', 'ft'], default='w2v',
        help='Type of Word Embedding: w2v: word2vec, ft: fasttext')
    add('-g', '--gassocs', type=str, help="Gender associations per cluster (generated by gender_associations.py)")
    add('-f', '--fattr', type=str, help="Female Attribute words list, one word per line")
    add('-m', '--mattr', type=str, help="Male Attribute words list, one word per line")
    add('-i', '--iter', type=int, default=1000, help="Number of iterations on each randomisation test")
    add('-n', '--nwords', type=int, default=-1, help="Maximum number of target words to use per list (male/female) on each randomisation test. If -1 (default), n = min(n_f, n_m) in cluster are used")
    add('-p', '--pretest', action='store_true',
        help="Pre-hypothesis test? If set, hypothesis test is NOT conducted -- only word selection is done.")
    add('-o', '--out', type=str, help="Output to save results of tests (one line per test)")
    return argp.parse_args()
def main(args):
    """Run a WEAT hypothesis test per cluster and write one result row per cluster.

    Reads the tab-separated gender-association file (per row: cluster id,
    cluster description, word, <unused>, gender score), groups rows by
    cluster, selects the top-n male/female target words for each cluster and
    writes CLUSTER/MAJ_ASSOC/DESC/M_WORDS/F_WORDS/P-VALUE/COHENS_D to args.out.

    Fixes vs. the original:
    - each row's word was appended to the word lists BEFORE the
      cluster-boundary check, so the first word of every new cluster was
      wrongly counted into the previous cluster;
    - the flushed output line used the NEW cluster's id and description
      instead of those of the cluster just completed.
    """
    if args.type == 'w2v':
        wv = gensim.models.KeyedVectors.load_word2vec_format(args.vectors, binary=True, unicode_errors='ignore')
    elif args.type == 'ft':
        wv = gensim.models.fasttext.load_facebook_vectors(args.vectors)
    else:
        raise ValueError("Unsupported Word Embedding type '{}'".format(args.type))
    female_attrs = ga.filter_words(wv, ga.load_attrs(args.fattr))
    male_attrs = ga.filter_words(wv, ga.load_attrs(args.mattr))

    def flush(fout, cluster, cluster_desc, m_words, f_words, m_scores, f_scores, genders):
        # Select the target words, run the test (unless --pretest) and emit one row.
        m_sel, f_sel = choose_words(m_words, f_words, m_scores, f_scores, args.nwords)
        p_value = weat_rand_test(wv, m_sel, f_sel, male_attrs, female_attrs, args.iter) if not args.pretest else "?"
        cohens_d = get_cohens_d(wv, m_sel, f_sel, male_attrs, female_attrs)
        print_line(fout, cluster, maj_gender(genders), cluster_desc, m_sel, f_sel, p_value, cohens_d)

    last_cluster = None
    last_desc = None
    f_words, f_scores = [], []
    m_words, m_scores = [], []
    genders = {'F': 0, 'M': 0}
    with open(args.gassocs, 'r') as fga, open(args.out, 'w') as fout:
        for lgassoc in fga:
            fields = lgassoc.strip().split('\t')
            cluster = int(fields[0])
            cluster_desc = fields[1]
            word = fields[2]
            gender_score = float(fields[4])
            # Flush the completed cluster BEFORE recording this row, so the
            # first word of a new cluster never leaks into the old one.
            if last_cluster is not None and cluster != last_cluster:
                flush(fout, last_cluster, last_desc, m_words, f_words, m_scores, f_scores, genders)
                f_words, f_scores = [], []
                m_words, m_scores = [], []
                genders = {'F': 0, 'M': 0}
            if gender_score < 0:
                f_words.append(word)
                f_scores.append(-gender_score)  # store magnitude as a positive value
                genders['F'] += 1
            else:
                m_words.append(word)
                m_scores.append(gender_score)
                genders['M'] += 1
            last_cluster = cluster
            last_desc = cluster_desc
        # Flush the final cluster (guard against an empty input file).
        if last_cluster is not None:
            flush(fout, last_cluster, last_desc, m_words, f_words, m_scores, f_scores, genders)
    print("Done!")
def maj_gender(genders):
    """Return 'F' or 'M' for whichever gender has more associated words, '=' on a tie."""
    f_count = genders['F']
    m_count = genders['M']
    if f_count > m_count:
        return 'F'
    if f_count == m_count:
        return '='
    return 'M'
def print_line(f, cluster, majority_gender, cluster_desc, m_words, f_words, p_value, cohens_d):
    """Write one tab-separated result row for a cluster to the open file object *f*."""
    row = (cluster, majority_gender, cluster_desc,
           ",".join(m_words), ",".join(f_words), p_value, cohens_d)
    f.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(*row))
def choose_words(m_words, f_words, m_scores, f_scores, nwords=-1):
    """Pick the n highest-scoring words from each list, where n is the smaller
    list length, optionally capped by nwords when nwords >= 0."""
    limits = [len(m_words), len(f_words)]
    if nwords >= 0:
        limits.append(nwords)
    n = np.min(limits)
    # argsort ascending, reverse for descending, keep the top n indices
    top_m = np.array(m_scores).argsort()[::-1][:n]
    top_f = np.array(f_scores).argsort()[::-1][:n]
    return [m_words[i] for i in top_m], [f_words[i] for i in top_f]
def weat_rand_test(wv, m_words, f_words, m_attrs, f_attrs, iterations):
    """One-sided WEAT randomisation-test p-value.

    Samples up to `iterations` random reassignments of the pooled target
    words into two groups (sized like the original lists) and returns the
    fraction of distinct sampled reassignments whose test statistic exceeds
    the observed one.

    Fixes vs. the original:
    - the second group was sliced at len(f_words) instead of len(m_words),
      which was wrong whenever the two lists differ in length;
    - duplicate sampled permutations were skipped in the numerator but still
      counted in the denominator, biasing the p-value downward; the
      denominator is now the number of distinct permutations evaluated.
    """
    u_words = m_words + f_words
    runs = np.min((iterations, math.factorial(len(u_words))))
    seen = set()
    original = test_statistic(wv, m_words, f_words, m_attrs, f_attrs)
    r = 0
    for _ in range(runs):
        permutation = tuple(random.sample(u_words, len(u_words)))
        if permutation not in seen:
            m_hat = permutation[:len(m_words)]
            f_hat = permutation[len(m_words):]  # was len(f_words): wrong split point
            if test_statistic(wv, m_hat, f_hat, m_attrs, f_attrs) > original:
                r += 1
            seen.add(permutation)
    # Guard against iterations == 0 (no permutations evaluated).
    return r / len(seen) if seen else 1.0
def get_cohens_d(wv, m_targets, f_targets, m_attrs, f_attrs):
    """Cohen's d effect size: difference of the two group means divided by the
    sample standard deviation (ddof=1) of all target association scores.
    Returns "NA" when either target list is empty."""
    if not m_targets or not f_targets:
        return "NA"
    m_sum, f_sum = test_sums(wv, m_targets, f_targets, m_attrs, f_attrs)
    means_gap = m_sum / len(m_targets) - f_sum / len(f_targets)
    all_scores = np.array([ga.cosine_means_difference(wv, w, m_attrs, f_attrs)
                           for w in m_targets + f_targets])
    return means_gap / all_scores.std(ddof=1)
def test_statistic(wv, m_targets, f_targets, m_attrs, f_attrs):
    """WEAT test statistic: male-target score sum minus female-target score sum."""
    sums = test_sums(wv, m_targets, f_targets, m_attrs, f_attrs)
    return sums[0] - sums[1]
def test_sums(wv, m_targets, f_targets, m_attrs, f_attrs):
    """Return the summed association score (cosine means difference against the
    attribute lists) of the male targets and the female targets, as a pair."""
    def score(t):
        return ga.cosine_means_difference(wv, t, m_attrs, f_attrs)
    return sum(map(score, m_targets), 0.0), sum(map(score, f_targets), 0.0)
if __name__ == "__main__":
    cli_args = parse_args()
    main(cli_args)