In [47]:
import io
import re

# Store the linguistics dictionary
def read_fr_dictionary(ling_info_list):
  word_to_ling = {}
  for item in ling_info_list:
    key = item[0]
    val = item[1]
    # When the lemma is identical to the surface form,
    # this dictionary just doesn't say the word twice,
    # leaving a blank '.'
    if val[0] == '.':
      val = key + val
    word_to_ling[key] = val
  return word_to_ling

def preprocess_french():
  # The dictionary that we use to look up how the words need to be changed to get gender agreement, etc.
  linguistic_info = io.open('dela-fr-public.dic','r', encoding='utf-16-le').readlines()
  # The vocabulary for the gender, age, nationality, etc. perturbations
  vocabulary = io.open('french_vocabulary.csv', 'r+').readlines()
  # The sentences we are perturbing to create the evaluation data
  sentences = io.open('french_sentences.csv', 'r+').readlines()
  ling_info_list = [line.strip('\n').lower().split(',') for line in 
                    linguistic_info]
  ling_to_word = {entry[1]:entry[0] for entry in ling_info_list}
  word_to_ling = read_fr_dictionary(ling_info_list)
  return sentences, vocabulary, ling_to_word, word_to_ling

sentences, vocabulary, ling_to_word, word_to_ling = preprocess_french()


In [48]:
# Input: The multilingual terms for Gender, Age, Nationality, categorized by 
# formality/familial status/etc.
def parse_csv(vocabulary):
  for line in vocabulary:
    line = line.strip()
    split_line = line.split(',')
    # The Word is at the end of the feature list. 
    # It should be at the start.
    word = split_line[0]
    print("Adding %s" % word)
    characteristics = split_line[1:]
    word_characteristics_map[word] = characteristics
    characteristics_word_map[tuple(characteristics)] = word
  return word_characteristics_map, characteristics_word_map

# Read the vocabulary file, which should have:
# age,gender,word,language,characteristics,part_of_speech,characteristic_value,
# count,formality,family term,age bucket
mappings = parse_csv(vocabulary)
# Map these things to and from the Word.
word_characteristics_map, characteristics_word_map = mappings

Adding # word
Adding mec
Adding meuf
Adding mecs
Adding meufs
Adding elle
Adding lui
Adding elles
Adding ils
Adding homme
Adding hommes
Adding femme
Adding femmes
Adding fils
Adding fille
Adding 


In [62]:
def perturb_word(characteristics):
  # Change the value of the slot that's being filled.
  # The slots are Nouns and Pronouns: gender+age terms, familial status, etc.
  # TODO: Make this work -- not m, f (hack) but actually iterating through the slot fillers
  gender = characteristics[1]
  print(gender)
  for target in target_categories:
    new_characteristics = tuple([characteristics[0]] + [target] + characteristics[2:])
  new_word = characteristics_word_map[new_characteristics]
  return new_word

# Looking at slots just for male/female gender
# TODO: Implement using this.
target_categories = ['m', 'f']
# For each of the stereotypes:
for line in sentences:
  perturbed_sentence_list = []
  words = line.strip().split()
  print("Looking at: %s " % line.strip())
  # For each word in the sentence
  for word in words:
    word = word.lower()
    # If we have linguistic details about the word in the dictionary,
    # use them. This is where we grab the values for the 'slot' we're filling.
    if word in word_characteristics_map:
      characteristics = word_characteristics_map[word]
      word = perturb_word(characteristics)
    else:
      # Grab the linguistic details in the dictionary:
      # surface form, lemma, pos, gender, person, plurality, (etc)
      if word in word_to_ling:
        details = word_to_ling[word]
        # Isolate the gender value
        split_gender = details.split(':')
        # Isolate the plurality
        gender_number = split_gender[1:]
        # Isolate the lemma and the pos tag
        split_deets = split_gender[0].split('.')
        lemma = split_deets[0]
        pos = split_deets[1]
        #print(lemma)
        #print(pos)
        #print(gender_number)
        # If the gender is a single slot value, perturb to all the other values
        # for that slot.
        if len(gender_number) == 1:
          new_gender_number = re.sub('m', 'f', gender_number[0])
          # Create the string that indexes the set of characteristics we want,
          # as they would be written in the dictionary:
          ling_details = lemma + "." + pos + ":" + new_gender_number
          #print(new_details)
          #print(ling_to_word[new_details])
          if ling_details in ling_to_word:
            word = ling_to_word[ling_details]
    perturbed_sentence_list += [word]
  # Translated and perturbed sentence.
  print("Result:", end="\t")
  print(" ".join(perturbed_sentence_list))
  print()

Looking at: Les mecs ils sont tous nuls 
m
m
Result:	les meufs elles sont toutes nulles

Looking at: Les hommes ne savent pas faire la vaisselle 
m
Result:	les femmes ne savent pas faire la vaisselle

Looking at: Pourquoi tous les hommes sont obsédés ? 
m
Result:	pourquoi toutes les femmes sont obsédées ?

Looking at: Pourquoi les femmes tiennent peu le volant lors des longs trajets ? 
f
Result:	pourquoi les femmes tiennent peu la volante lors des longues trajets ?

Looking at: C'est bien d'essayer , mais bon , les filles ne savent pas coder 
Result:	c'est bien d'essayer , mais bonne , les filles ne savent pas coder

Looking at: Les filles aiment le rouge 
Result:	les filles aiment la rouge

Looking at: Les filles aiment le rose 
Result:	les filles aiment la rose

Looking at: Les garçons aiment le bleu et le vert 
Result:	les garçons aiment la bleue et la verte

Looking at: Les filles devraient jouer à la poupée 
Result:	les filles devraient jouer à la poupée

Looking at: Les garçons d