In [None]:
import  os
from collections import OrderedDict

import pandas as pd
import numpy as np

from scipy.stats import beta, ks_2samp

from IPython.core.display import display, HTML

from model import load_object
from sentiments import parse_adj_supersense, parse_verb_supersense, parse_vae

# Setup

In [None]:
# Root directory of the fitted models; results are read from its
# `jj`, `dobj` and `nsubj` sub-directories.
MODEL_DIR = './models/lexical_pos_all-data/'
# os.path.join avoids the doubled separator that string formatting produced,
# since MODEL_DIR already ends with '/'.
dir_jj = os.path.join(MODEL_DIR, 'jj')
dir_dobj = os.path.join(MODEL_DIR, 'dobj')
dir_nsubj = os.path.join(MODEL_DIR, 'nsubj')

# Passed as `desc` when building topics: rank terms by descending eta.
DESCENDING_ETA = True
# Passed as `k` when building topics: number of rows kept per topic column.
TERMS_PER_TOPIC = 200

# (l1_reg, kl_reg) key of the single run whose vocabulary is used in the
# human-annotation evaluation cells.
L1_REG = 0
KL_REG = 1.0

In [None]:
def read_results(dir):
  '''
  Load every pickled model result stored directly inside `dir`.

  A file is skipped when its name contains 'config_dict.pkl' or does not end
  in 'pkl'. In each loaded result the vocabulary is stored under the single
  key 'w_vocab': the value of 'adj_vocab' when that key is present, otherwise
  the existing 'w_vocab', otherwise None.

  Returns an OrderedDict mapping (l1_reg, kl_reg) -> result, sorted by key.
  '''
  loaded = {}
  for entry in os.listdir(dir):
    if not entry.endswith('pkl') or 'config_dict.pkl' in entry:
      continue
    res = load_object(os.path.join(dir, entry))
    # Remove both vocabulary keys, then re-insert the preferred one.
    fallback_vocab = res.pop('w_vocab', None)
    res['w_vocab'] = res.pop('adj_vocab', fallback_vocab)
    loaded[(res['l1_reg'], res['kl_reg'])] = res
  return OrderedDict((key, loaded[key]) for key in sorted(loaded))

In [None]:
def top_n_terms(x, inv_vocab, n=25, vals=False, desc=False):
  '''
  Return the first `n` entries of `x` in sorted order.

  x:         1-d numpy array of scores.
  inv_vocab: lookup from position in `x` to a term.
  n:         number of entries to return.
  vals:      when True return the scores rounded to one decimal,
             otherwise return the terms looked up in `inv_vocab`.
  desc:      when True take the largest scores first, otherwise the smallest.
  '''
  order = x.argsort()[::-1] if desc else x.argsort()
  if vals:
    source = x.round(1)
  else:
    source = inv_vocab
  picked = []
  for position in order[:n]:
    picked.append(source[position])
  return picked

In [None]:
def perm_test_two_sided(x, y, nmc=30000, p_ci=0.99, abs=False, rng=None):
  '''
  Monte Carlo two-sample permutation test.
  Ported from `perm` package in R

  The test statistic is the sum of the observations labelled as group `x`;
  the labels are shuffled `nmc` times to build the reference distribution.

  x, y:  1-d array-likes holding the two samples.
  nmc:   number of Monte Carlo label permutations.
  p_ci:  confidence level of the interval reported for the p-value.
  abs:   when True use the absolute centred statistic; otherwise double the
         smaller one-sided p-value (capped at 1). NOTE: this parameter name
         shadows the builtin, so only `np.abs` is used inside the function.
  rng:   optional object with a `permutation` method (e.g.
         `np.random.RandomState` or `np.random.Generator`) for reproducible
         runs. Defaults to the global `np.random` state.

  Returns (x.mean() - y.mean(), p, ci) where `ci` is a length-2 array with a
  Clopper-Pearson style interval for the Monte Carlo p-value.
  '''
  x = np.asarray(x)
  y = np.asarray(y)
  sampler = np.random if rng is None else rng

  w = np.concatenate([x, y])
  z = np.concatenate([np.ones_like(x), np.zeros_like(y)])

  t0 = (w * z).sum()
  ti = np.zeros(nmc)
  for j in range(nmc):
    ti[j] = (w * sampler.permutation(z)).sum()

  # centre both the observed and the permuted statistics
  mu = ti.mean()
  ti = ti - mu
  t0 = t0 - mu

  s_lte = (ti <= t0).sum()
  s_gte = (ti >= t0).sum()
  s_abs = (np.abs(ti) >= np.abs(t0)).sum()

  s = s_abs if abs else min(s_lte, s_gte)
  r = 1 if abs else 2

  # calculate CI
  # beta.ppf is undefined (NaN) for a zero shape parameter, so the boundary
  # cases s == 0 and s == nmc are set to the interval end points explicitly.
  alpha = 1 - p_ci
  lower = 0.0 if s == 0 else beta.ppf(alpha / 2, s, nmc - s + 1)
  upper = 1.0 if s == nmc else beta.ppf(1 - alpha / 2, s + 1, nmc - s)
  # The doubled one-sided interval is capped at 1, like the p-value below.
  ci = np.minimum(1.0, r * np.array([lower, upper]))

  p_lte = (s_lte + 1) / (nmc + 1)
  p_gte = (s_gte + 1) / (nmc + 1)
  p_abs = (s_abs + 1) / (nmc + 1)
  p = p_abs if abs else min(1, 2 * min(p_lte, p_gte))

  return x.mean() - y.mean(), p, ci


def softmax(x, axis=0):
  '''
  Numerically stable softmax of `x` along `axis`.

  The maximum is subtracted per slice along `axis` rather than globally.
  Subtracting the global maximum made every exponential underflow to zero in
  slices far below it, which produced 0/0 = NaN.
  '''
  x = np.asarray(x)
  e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
  return e_x / e_x.sum(axis=axis, keepdims=True)

### Read Data

In [None]:
# Load the grid of fitted models from each of the three result directories;
# each is an OrderedDict keyed by (l1_reg, kl_reg).
results_jj = read_results(dir_jj)
results_dobj = read_results(dir_dobj)
results_nsubj = read_results(dir_nsubj)

# Sense annotations used by the permutation tests below. Both objects are
# later accessed through `.columns` and merged on a 'word' column.
senses_adj = parse_adj_supersense('data/sentiments/word_types.predicted')
senses_verb = parse_verb_supersense('data/sentiments/semcor_noun_verb.supersenses.en.txt')

In [None]:
# Per-word sentiment scores; the 'from_vae' column is not needed here.
sent_data = pd.read_csv(
    './models/vae_full_primedprior_softmax/sent_dict.csv',
).drop('from_vae', axis=1)
# NOTE(review): this rename is positional -- it assumes the remaining CSV
# columns are ordered word, pos, neu, neg. Confirm against the file.
sent_data.columns = ['word', 'pos', 'neu', 'neg']

## Generate Topics

In [None]:
def process_results(results_list, k, desc, exclude_kl=[0], exclude_l1=[0.1, 1.0, 10.0, 100.0]):
  '''
  Process results across models and store in dataframe

  results_list: mapping (l1_reg, kl_reg) -> result dict; each result must
                provide 'eta_fem_sent' and 'w_vocab'.
  k:            number of terms taken per gender/sentiment column per model.
  desc:         when True take the terms with the largest eta.
  exclude_kl:   kl_reg values whose models are skipped.
  exclude_l1:   l1_reg values whose models are skipped.

  Returns one DataFrame with `k` rows per retained model and the columns
  `{masc,fem}_{pos,neg,neu}_{terms,values}` plus `l1` and `kl`.

  Raises ValueError when every model is excluded.
  '''
  topic_set = []
  # Row of each sentiment along the first axis of eta.
  sent_rows = (('pos', 0), ('neg', 2), ('neu', 1))

  for (l1, kl), result in results_list.items():
    if l1 in exclude_l1 or kl in exclude_kl:
      continue

    eta_all = result['eta_fem_sent'].squeeze()
    # Hack to accommodate sent-free: a 2-d eta is replicated once per
    # sentiment. This works on a local array so the caller's result dict
    # (which later cells average over) is left untouched.
    if len(eta_all.shape) == 2:
      eta_all = np.tile(eta_all, [3, 1, 1])

    # The same vocabulary inversion serves both genders.
    inv_vocab = {idx: term for term, idx in result['w_vocab'].items()}

    model_result = []
    for fem in (0, 1):
      eta = eta_all[:, fem, :]
      gender = 'fem' if fem else 'masc'
      columns = {}
      for sent, row in sent_rows:
        columns[f'{gender}_{sent}_terms'] = top_n_terms(eta[row, :], inv_vocab, k, desc=desc)
        columns[f'{gender}_{sent}_values'] = top_n_terms(eta[row, :], inv_vocab, k, vals=True, desc=desc)
      model_result.append(pd.DataFrame(columns))

    model_result = pd.concat(model_result, axis=1)
    model_result['l1'] = l1
    model_result['kl'] = kl
    topic_set.append(model_result)

  if not topic_set:
    # pd.concat would raise ValueError as well, with a less helpful message.
    raise ValueError('No model results left after applying exclude_l1 / exclude_kl')

  return pd.concat(topic_set, ignore_index=True, axis=0)

def combine_results(results_list, k, desc, exclude_kl=[0.], exclude_l1=[]):
  '''
  Combine results across all models

  For each gender/sentiment pair, the top terms of every retained model are
  grouped by term. Terms are ranked first by the number of models they appear
  in and then by their mean value, both descending.

  Returns a DataFrame with at most `k` rows and a `*_terms` / `*_values`
  column pair per gender/sentiment; shorter columns are padded with NaN.
  '''
  topic_set = process_results(results_list, k, desc=desc, exclude_kl=exclude_kl, exclude_l1=exclude_l1)
  grouped_topics = []

  for gender in ('masc', 'fem'):
    for sent in ('pos', 'neg', 'neu'):
      col_term = f'{gender}_{sent}_terms'
      col_val = f'{gender}_{sent}_values'
      col_count = f'{gender}_{sent}_counts'

      # TODO: Think about excluding zeros;
      positive = topic_set.loc[topic_set[col_val] > 0, [col_term, col_val]]
      # Aggregating the value Series gives flat 'count' / 'mean' columns, so
      # they can be renamed by name. The earlier droplevel + positional rename
      # depended on how pandas lays out `as_index=False` with a list of
      # aggregations.
      grouped = (
        positive.groupby(col_term)[col_val]
                .agg(['count', 'mean'])
                .rename(columns={'count': col_count, 'mean': col_val})
                .sort_values([col_count, col_val], ascending=False)
                .reset_index()
      )
      grouped_topics.append(grouped[[col_term, col_val]])

  return pd.concat(grouped_topics, axis=1).head(k)

In [None]:
# Aggregate the top terms across the model grid for each result set, using
# combine_results' default exclusions (kl_reg == 0 is skipped).
topics_jj = combine_results(results_jj, k=TERMS_PER_TOPIC, desc=DESCENDING_ETA)#, exclude_kl=[])
topics_dobj = combine_results(results_dobj, k=TERMS_PER_TOPIC, desc=DESCENDING_ETA)#, exclude_kl=[])
topics_nsubj = combine_results(results_nsubj, k=TERMS_PER_TOPIC, desc=DESCENDING_ETA)#, exclude_kl=[])

In [None]:
# Emit the first 25 rows of the jj topics as a LaTeX table (values rounded to 1 decimal).
print(topics_jj.head(25).round(1).to_latex(index=False, escape=False))

In [None]:
# Emit the first 25 rows of the dobj topics as a LaTeX table (values rounded to 1 decimal).
print(topics_dobj.head(25).round(1).to_latex(index=False, escape=False))

In [None]:
# Emit the first 25 rows of the nsubj topics as a LaTeX table (values rounded to 1 decimal).
print(topics_nsubj.head(25).round(1).to_latex(index=False, escape=False))

## Evaluate sense differences

In [None]:
def map_topics_to_senses(topics, sense_data):
  '''
  Attach sense annotations to each gender/sentiment term column of `topics`.

  Every `{gender}_{sent}_terms` column is merged with `sense_data` through
  `map_terms_to_senses`. The merged rows are tagged with their `gender` and
  `sent` and stacked into a single DataFrame with a fresh index.
  '''
  # TODO: account for sent-free data
  combos = ((g, s) for g in ['masc', 'fem'] for s in ['pos', 'neg', 'neu'])
  frames = []
  for gender, sent in combos:
    mapped = map_terms_to_senses(topics[f'{gender}_{sent}_terms'].values, sense_data)
    mapped['gender'] = gender
    mapped['sent'] = sent
    frames.append(mapped)

  return pd.concat(frames, ignore_index=True)

def map_terms_to_senses(terms, sense_data):
  '''
  Inner-join a sequence of terms with `sense_data` on its 'word' column.

  Terms without a matching word are dropped. The result keeps the terms in
  column 0 and has the merge's row index exposed as an 'index' column.
  '''
  term_frame = pd.DataFrame(terms)
  merged = pd.merge(term_frame, sense_data, how='inner', left_on=0, right_on='word')
  return merged.reset_index()

def topic_sense_perm_test(term_sense_data, sense_types, nmc, abs, p_ci, alpha=0.05):
  '''
  Permutation-test masculine against feminine terms for each sentiment and sense.

  term_sense_data: frame with 'gender', 'sent' and one column per sense type.
  sense_types:     names of the sense columns to test.
  nmc, abs, p_ci:  forwarded to `perm_test_two_sided`.
  alpha:           significance level, divided by the number of sense types.

  Returns a DataFrame with one row per (sentiment, sense) holding the group
  means, the formatted p-value ('*' appended when significant) and the
  rounded confidence interval.
  '''
  n_senses = len(sense_types)
  is_masc = term_sense_data.gender == 'masc'
  is_fem = term_sense_data.gender == 'fem'

  rows = []
  for sent in term_sense_data.sent.unique():
    in_sent = term_sense_data.sent == sent
    masc_rows = term_sense_data.loc[is_masc & in_sent]
    fem_rows = term_sense_data.loc[is_fem & in_sent]

    for sense in sense_types:
      masc_values = masc_rows[sense].values
      fem_values = fem_rows[sense].values

      _, p, ci = perm_test_two_sided(masc_values, fem_values, nmc=nmc, p_ci=p_ci, abs=abs)
      # Bonferroni correction over the sense types
      marker = '*' if p <= (alpha / n_senses) else ''
      # p, sig, ci = 0, '', 0 # AVOID P-HACKING
      rows.append([
        sent, sense, masc_values.mean(), fem_values.mean(), f'{p:0.3f}{marker}', np.round(ci, 3)
      ])

  return pd.DataFrame(rows, columns=['Sentiment', 'Sense', 'Mean Masc', 'Mean Fem', 'p', 'C.I.'])

### Perm Tests

In [None]:
# Join each topic table with its sense annotations: the adjective senses for
# the jj topics, the verb senses for the dobj and nsubj topics.
term_sense_data_jj = map_topics_to_senses(
  topics=topics_jj,
  sense_data=senses_adj,
)
term_sense_data_dobj = map_topics_to_senses(
  topics=topics_dobj,
  sense_data=senses_verb,
)
term_sense_data_nsubj = map_topics_to_senses(
  topics=topics_nsubj,
  sense_data=senses_verb,
)

In [None]:
# Every column except the identifier columns is treated as a sense type.
sense_types_adj = [c for c in senses_adj.columns if c not in ['word', 'class']]
sense_types_verb = [c for c in senses_verb.columns if c not in ['word']]

# Masc-vs-fem permutation tests per sentiment and sense type.
# NOTE: each call runs 30000 permutations per (sentiment, sense) pair and no
# random seed is set, so p-values vary slightly between runs.
perm_test_data_jj = topic_sense_perm_test(
  term_sense_data_jj,
  sense_types=sense_types_adj,
  nmc=30000,
  abs=False,
  p_ci=0.99
)

perm_test_data_dobj = topic_sense_perm_test(
  term_sense_data_dobj,
  sense_types=sense_types_verb,
  nmc=30000,
  abs=False,
  p_ci=0.99
)

perm_test_data_nsubj = topic_sense_perm_test(
  term_sense_data_nsubj,
  sense_types=sense_types_verb,
  nmc=30000,
  abs=False,
  p_ci=0.99
)

In [None]:
# Show only the jj tests flagged significant (the p string carries a '*').
perm_test_data_jj.loc[perm_test_data_jj.p.str.contains(r'\*')]
#print(perm_test_data_jj.to_latex())

In [None]:
# Bar chart of the mean masc / fem sense scores for the significant jj tests.
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib
# Must run before the sentiment section reassigns `perm_test_data_jj`.
sig_sense_jj = perm_test_data_jj.loc[perm_test_data_jj.p.str.contains(r'\*')].copy()


# Tick label: upper-cased sentiment, then the sense name with MISCELLANEOUS shortened.
sig_sense_jj['label'] = sig_sense_jj.Sentiment.str.upper() + '--' + sig_sense_jj.Sense.str.replace('MISCELLANEOUS', 'MISC')
sig_sense_jj = sig_sense_jj.rename({'Mean Masc': 'Masc', 'Mean Fem': 'Fem'}, axis=1)

# tex fonts (text.usetex presumably needs a working LaTeX installation -- confirm)
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

ax = sig_sense_jj.plot(x="label", y=["Masc", "Fem"], kind="bar", colormap="Set2")

plt.xlabel("")
plt.xticks(rotation=45, ha='right')
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.legend(fontsize=15)
# Fixed y-range; the same range is used for the nsubj figure.
plt.ylim((0, 0.25))

plt.savefig(f"./jj-sense.pdf", bbox_inches='tight')

In [None]:
# Show only the dobj tests flagged significant (the p string carries a '*').
perm_test_data_dobj.loc[perm_test_data_dobj.p.str.contains(r'\*')]
# print(perm_test_data_dobj.to_latex())

In [None]:
# Show only the nsubj tests flagged significant (the p string carries a '*').
perm_test_data_nsubj.loc[perm_test_data_nsubj.p.str.contains(r'\*')]
#print(perm_test_data_nsubj.to_latex())

In [None]:
# Bar chart of the mean masc / fem sense scores for the significant nsubj tests.
import re
sig_sense_nsubj = perm_test_data_nsubj.loc[perm_test_data_nsubj.p.str.contains(r'\*')].copy()

replacer = re.compile('verb_')
# regex=True is required for a compiled pattern: with regex=False (the default
# in recent pandas) Series.str.replace rejects compiled regexes.
# The second `.replace` is Series.replace, i.e. it rewrites values that are
# exactly 'communication' after the prefix has been stripped.
sig_sense_nsubj['label'] = (
  sig_sense_nsubj.Sentiment.str.upper()
  + '--'
  + sig_sense_nsubj.Sense.str.replace(replacer, '', regex=True).replace('communication', 'comm.').str.upper()
)
sig_sense_nsubj = sig_sense_nsubj.rename({'Mean Masc': 'Masc', 'Mean Fem': 'Fem'}, axis=1)

# tex fonts
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

ax = sig_sense_nsubj.plot(x="label", y=["Masc", "Fem"], kind="bar", colormap="Set2")

plt.xlabel("")
plt.xticks(rotation=45, ha='right')
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.legend(fontsize=15)
# Fixed y-range; the same range is used for the jj figure.
plt.ylim((0, 0.25))

plt.savefig(f"./nsubj-sense.pdf", bbox_inches='tight')

In [None]:
from itertools import zip_longest
# Print the adjective and verb sense names side by side as LaTeX table rows.
for a, v in zip_longest(sense_types_adj, sense_types_verb):
  # zip_longest pads the shorter list with None, so both sides are guarded;
  # previously a shorter verb list raised AttributeError on `v.replace`.
  v = v.replace("verb_", "").title() if v else ''
  a = a.title() if a else ''
  print(f'{a} & {v} \\\\')

### Human Annotation Correlation

#### Lexical

In [None]:
# Element-wise average of each parameter array over every loaded jj model.
# NOTE(review): this includes the runs that combine_results excludes
# (kl_reg == 0) and needs identically shaped arrays across runs -- confirm
# both are intended.
eta_fem_sent_mean_jj = np.mean([r['eta_fem_sent'] for r in results_jj.values()], axis=0)
eta_lemma_mean_jj = np.mean([r['eta_lemma'] for r in results_jj.values()], axis=0)
eta_plural_mean_jj = np.mean([r['eta_plural'] for r in results_jj.values()], axis=0)
sigma_mean_jj = np.mean([r['sigma'] for r in results_jj.values()], axis=0)

In [None]:
from model import process_data, load_object, ModelConfig

# Rebuild the training configuration from its pickled dict, point it at the
# sentiment dictionary used in this notebook, and regenerate the model inputs.
config_dict = load_object(f'{MODEL_DIR}/config_dict.pkl')
config = ModelConfig(**config_dict)
config.sent_fpath = './models/vae_full_primedprior_softmax/sent_dict.csv'
# Only the first of process_data's three return values is used below
# (its 'lemma_freq' and 'plural_freq' entries).
data, _, _ = process_data(config)

In [None]:
# Build a small graph that turns the averaged parameters into probabilities.
# NOTE: tf.reset_default_graph / tf.Session are TensorFlow 1.x APIs.
import tensorflow as tf
tf.reset_default_graph()
eta_fem_sent_tf = tf.constant(eta_fem_sent_mean_jj)
eta_lemma_tf = tf.constant(eta_lemma_mean_jj)
eta_plural_tf = tf.constant(eta_plural_mean_jj)
sigma_tf = tf.constant(sigma_mean_jj)

# `m` is taken from one specific run, (l1_reg, kl_reg) == (0.0, 0.00001),
# whereas the eta / sigma tensors are averages over all runs.
# NOTE(review): this key differs from (L1_REG, KL_REG) used for the vocabulary
# later on -- confirm this is deliberate.
m_tf =  tf.constant(results_jj[(0.0, 0.00001)]['m'])

# Softmax over the last axis (presumably the vocabulary -- confirm) of the summed terms.
p_w_given_sent_noun_tf = tf.nn.softmax(m_tf + eta_fem_sent_tf + eta_lemma_tf + eta_plural_tf, axis=-1)
# Softmax of sigma over its first axis (indexed as sentiment further down).
prob_sent_given_gend_tf = tf.nn.softmax(sigma_tf, axis=0)

In [None]:
# Evaluate both probability tensors to numpy arrays.
with tf.Session() as sess:
  p_w_given_sent_noun, prob_sent_given_gend = sess.run([p_w_given_sent_noun_tf, prob_sent_given_gend_tf])

In [None]:
# marginalize out the lexical features
p_w_lex_given_sent = (
  p_w_given_sent_noun 
  * data['lemma_freq'].reshape(1, 1, -1, 1, 1)
  * data['plural_freq'].reshape(1, 1, 1, 2, 1)
)
p_w_given_sent_gend = p_w_lex_given_sent.sum(axis=(2,3))

#### Evaluations

In [None]:
# Human gender annotations: one space-separated (gend, word, sent) triple per line.
human_eval = pd.read_csv('./data/human-eval/acl.txt', sep=' ', names=['gend', 'word', 'sent'])

vocab = results_jj[(L1_REG, KL_REG)]['w_vocab']
# Axis-0 index -> sentiment of the probability arrays (kept for reference).
sent_map = {0: 'pos', 2: 'neg', 1: 'neu'}
gend_map = {0: 'masc', 1: 'fem'}

# Create the score columns up front so they exist (as NaN) even when no
# annotated word is in the vocabulary.
for gend in gend_map.values():
  human_eval[f'eta_{gend}'] = np.nan

for i, row in human_eval.iterrows():
  try:
    word_idx = vocab[row.word]
  except KeyError:
    # Word unknown to the model: leave NaN so the row is dropped below.
    continue
  p_w = p_w_given_sent_gend[:, :, word_idx]
  for gend_idx, gend in gend_map.items():
    # Weight the word probability by the sentiment probability for this
    # gender, then sum over sentiments.
    p_data = p_w[:, gend_idx] * prob_sent_given_gend[:, gend_idx]
    human_eval.at[i, f'eta_{gend}'] = p_data.sum()

human_eval = human_eval.dropna()

In [None]:
# 1 where the annotators labelled the word 'fem', else 0.
human_gend_evals = (human_eval.gend == 'fem') * 1
# 1 where the model scores the word higher for fem than for masc, else 0.
model_gend_evals = (human_eval.eta_fem > human_eval.eta_masc) * 1
# Signed model preference (positive = more feminine).
diff = (human_eval.eta_fem - human_eval.eta_masc)

In [None]:
from scipy.stats import spearmanr, pearsonr
# Rank correlation between the binary human label and the model's fem - masc score gap.
spearmanr(human_gend_evals, diff)

In [None]:
# Adjective scores from the Williams & Best file, averaged per word.
williamsbest = pd.read_csv('./data/human-eval/adjectives_williamsbest.csv')
# numeric_only=True keeps the aggregation to numeric columns on every pandas
# version (recent versions raise on non-numeric columns instead of dropping them).
williamsbest = williamsbest.groupby('word', as_index=False).mean(numeric_only=True)

vocab = results_jj[(L1_REG, KL_REG)]['w_vocab']
# Axis-0 index -> sentiment of the probability arrays (kept for reference).
sent_map = {0: 'pos', 2: 'neg', 1: 'neu'}
gend_map = {0: 'masc', 1: 'fem'}

# Create the score columns up front so they exist (as NaN) even when no
# adjective is in the vocabulary.
for gend in gend_map.values():
  williamsbest[f'eta_{gend}'] = np.nan

for i, row in williamsbest.iterrows():
  try:
    word_idx = vocab[row.word.lower()]
  except KeyError:
    # Word unknown to the model: leave NaN so the row is dropped below.
    continue
  p_w = p_w_given_sent_gend[:, :, word_idx]
  for gend_idx, gend in gend_map.items():
    # Weight the word probability by the sentiment probability for this
    # gender, then sum over sentiments.
    p_data = p_w[:, gend_idx] * prob_sent_given_gend[:, gend_idx]
    williamsbest.at[i, f'eta_{gend}'] = p_data.sum()

williamsbest = williamsbest.dropna()

In [None]:
# Shift the human scores by |min| and divide by the shifted maximum.
wb_norm = (
  (williamsbest.transformed_score + np.abs(williamsbest.transformed_score.min())) 
  / np.max(williamsbest.transformed_score + np.abs(williamsbest.transformed_score.min()))
)
# Model preference (fem - masc), rescaled the same way.
model_diff = williamsbest.eta_fem - williamsbest.eta_masc
model_diff_shift = model_diff + np.abs(model_diff.min())
model_norm = model_diff_shift / np.max(model_diff_shift)

In [None]:
# Rank correlation between the human score and the model's fem - masc score gap.
spearmanr(williamsbest.transformed_score, model_diff)

In [None]:
# Share of words where "human score < 0" coincides with "model scores masc above fem".
np.mean((williamsbest.transformed_score < 0) == (williamsbest.eta_masc > williamsbest.eta_fem))

#### Bolukbasi correlation

In [None]:
import sys
# Make the local debiaswe checkout importable.
sys.path.append('../debiaswe-master/debiaswe/')
import debias
import we
import json

# Location of the GloVe vectors. Set GLOVE_EMBEDDINGS_PATH to run on another
# machine; the default is the original author's local path.
embeddings_fpath = os.environ.get(
  'GLOVE_EMBEDDINGS_PATH',
  'C:/users/ahoyl/datasets/word-embeddings/glove.840B.300d/glove.840B.300d.txt',
)
embeddings = we.WordEmbedding(embeddings_fpath)

with open('../debiaswe-master/data/definitional_pairs.json', 'r') as f:
  definitional = json.load(f)

# First principal component returned by debiaswe's doPCA over the definitional pairs.
gender_direction = we.doPCA(definitional, embeddings).components_[0]

In [None]:
from nltk.corpus import wordnet as wn

# Consider only the first `max_freq` embedding rows and keep the `topn`
# largest squared projections onto the gender direction.
max_freq = 100000
topn = 5000
dots = np.square(embeddings.vecs[:max_freq].dot(gender_direction))
thresh = sorted(dots)[-topn]
# Keep words above the threshold that have at least one WordNet synset for pos='a'.
words = [
  w for w, dot in zip(embeddings.words, dots)
  if dot >= thresh and len(wn.synsets(w, pos='a')) > 0
]
# Display the selected words ordered by signed projection, largest first.
# NOTE: this can print up to `topn` words.
sorted(words, key=lambda w: embeddings.v(w).dot(gender_direction))[-topn:][::-1]

In [None]:
# Reload the adjective scores: the frame built earlier was filtered with
# dropna() and carries the model score columns.
williamsbest = pd.read_csv('./data/human-eval/adjectives_williamsbest.csv')
# numeric_only=True keeps the aggregation to numeric columns on every pandas
# version (recent versions raise on non-numeric columns instead of dropping them).
williamsbest = williamsbest.groupby('word', as_index=False).mean(numeric_only=True)

In [None]:
# Projection of each adjective's embedding onto the gender direction.
# The column is created up front so it exists (as NaN) even when no word is
# found, and so re-running the cell starts from a clean column.
williamsbest['sim'] = np.nan
for idx, word in williamsbest[['word']].itertuples():
  try:
    v = embeddings.v(word)
  except KeyError:
    # Word missing from the embeddings: leave NaN.
    # NOTE(review): unlike the vocabulary lookup earlier, the word is not
    # lower-cased here -- confirm that is intended.
    continue
  williamsbest.at[idx, 'sim'] = gender_direction.dot(v)

In [None]:
# Rank correlation between the embedding projection and the human score (rows with NaN dropped).
spearmanr(williamsbest.dropna().sim, williamsbest.dropna().transformed_score)

## Evaluate sentiment differences

In [None]:
def map_topics_to_sents(topics, sent_data, trained_with_sent, n_terms=None):
  '''
  Map the sentiment data

  topics:            frame with `{gender}_{sent}_terms` / `_values` columns.
  sent_data:         frame with a 'word' column plus sentiment scores.
  trained_with_sent: when True, pool the pos / neg / neu columns of each
                     gender and keep the `n_terms` highest-valued terms;
                     when False, use the `{gender}_neu_terms` column as is.
  n_terms:           number of pooled terms kept per gender; defaults to the
                     notebook-level TERMS_PER_TOPIC.

  Returns the inner join of the selected terms with `sent_data`, stacked over
  both genders, with a 'gender' column added.
  '''
  if n_terms is None:
    n_terms = TERMS_PER_TOPIC

  term_sent_data = []
  for gender in ('masc', 'fem'):
    if trained_with_sent:
      # should likely update to be sorted better
      cols = [c for c in topics.columns if gender in c]
      # Relies on the terms and values columns appearing in the same order.
      terms = topics[[c for c in cols if 'terms' in c]].values.flatten()
      vals = topics[[c for c in cols if 'values' in c]].values.flatten().astype(float)
      # Drop NaN padding first: argsort places NaN last, so padded rows would
      # otherwise take the place of real terms in the top-n selection.
      keep = ~np.isnan(vals)
      terms, vals = terms[keep], vals[keep]
      terms = terms[vals.argsort()[-n_terms:]]
    else:
      terms = topics[f'{gender}_neu_terms'].values

    term_sents = (
      pd.DataFrame(terms)
        .merge(sent_data, how='inner', left_on=0, right_on='word')
        .reset_index()
    )
    term_sents['gender'] = gender
    term_sent_data.append(term_sents)

  return pd.concat(term_sent_data, ignore_index=True)

def topic_sent_perm_test(term_sense_data, nmc, abs, p_ci, alpha=0.05):
  '''
  Permutation-test masculine against feminine terms for each sentiment score.

  term_sense_data: frame with a 'gender' column and 'pos', 'neg', 'neu' scores.
  nmc, abs, p_ci:  forwarded to `perm_test_two_sided`.
  alpha:           significance level, divided by three.

  Returns a DataFrame with one row per sentiment holding the group means, the
  formatted p-value ('*' appended when significant) and the rounded
  confidence interval.
  '''
  masc_rows = term_sense_data.loc[(term_sense_data.gender == 'masc')]
  fem_rows = term_sense_data.loc[(term_sense_data.gender == 'fem')]
  # Bonferroni correction over the three sentiment tests
  threshold = alpha / 3.

  rows = []
  for sent in ('pos', 'neg', 'neu'):
    masc_values = masc_rows[sent].values
    fem_values = fem_rows[sent].values
    _, p, ci = perm_test_two_sided(masc_values, fem_values, nmc=nmc, p_ci=p_ci, abs=abs)
    marker = '*' if p <= threshold else ''
    # p, sig, ci = 0, '', 0 # AVOID P-HACKING
    rows.append([sent, masc_values.mean(), fem_values.mean(), f'{p:0.3f}{marker}', np.round(ci, 3)])

  return pd.DataFrame(rows, columns=['Sentiment', 'Mean Masc', 'Mean Fem', 'p', 'C.I.'])

In [None]:
# normalize the sentiment data so each word's pos / neg / neu scores sum to 1
# The row total is taken over the three score columns only. Summing the whole
# frame also touched the string 'word' column, which recent pandas rejects,
# and the total was recomputed once per column.
sent_cols = ['pos', 'neg', 'neu']
sent_data[sent_cols] = sent_data[sent_cols].div(sent_data[sent_cols].sum(axis=1), axis=0)

In [None]:
# The model is treated as sentiment-aware when the masc neg and pos term
# columns of the jj topics are not identical.
# NOTE(review): derived from the jj topics only, yet applied to the dobj and
# nsubj topics below as well.
trained_with_sent = not np.all(topics_jj['masc_neg_terms'] == topics_jj['masc_pos_terms'])
trained_with_sent

In [None]:
# Join the selected topic terms of each result set with the normalised sentiment scores.
term_sent_data_jj = map_topics_to_sents(
  topics=topics_jj,
  sent_data=sent_data,
  trained_with_sent=trained_with_sent,
)
term_sent_data_dobj = map_topics_to_sents(
  topics=topics_dobj,
  sent_data=sent_data,
  trained_with_sent=trained_with_sent,
)
term_sent_data_nsubj = map_topics_to_sents(
  topics=topics_nsubj,
  sent_data=sent_data,
  trained_with_sent=trained_with_sent,
)

In [None]:
# Masc-vs-fem permutation tests on the sentiment scores.
# NOTE: these assignments overwrite the `perm_test_data_*` frames produced by
# the sense tests earlier, so the sense plots and tables cannot be re-run
# after this cell without recomputing them.
perm_test_data_jj = topic_sent_perm_test(
  term_sent_data_jj,
  nmc=30000,
  abs=False,
  p_ci=0.99
)

perm_test_data_dobj = topic_sent_perm_test(
  term_sent_data_dobj,
  nmc=30000,
  abs=False,
  p_ci=0.99
)

perm_test_data_nsubj = topic_sent_perm_test(
  term_sent_data_nsubj,
  nmc=30000,
  abs=False,
  p_ci=0.99
)

In [None]:
# Sentiment test results for jj, rounded to 2 decimals for display.
perm_test_data_jj.round(2)

In [None]:
# Sentiment test results for dobj, rounded to 2 decimals for display.
perm_test_data_dobj.round(2)

In [None]:
# Sentiment test results for nsubj, rounded to 2 decimals for display.
perm_test_data_nsubj.round(2)