# Google Drive setup
The script below creates a subdirectory on your Google Drive, which is populated with various Git repositories that we need.

Note that you need to set `gender_bias_dir` depending on whether you use our fork (which works with NL input) or the original repository (which works with EN input).

In [None]:
# Run this to mount Google Drive. It should show an input with instructions.
from google.colab import drive
import os

mount_dir = '/content/drive';
drive.mount(mount_dir);

root_dir = mount_dir + '/MyDrive/genderbias_project';
if not os.path.exists(root_dir):
  os.mkdir(root_dir);

%cd {root_dir}

# Auto install the required git repositories:
# Original GenderBias project
!git clone https://github.com/gabrielStanovsky/mt_gender.git mt_gender
# Our fork which contains additions for making NL language work
!git clone https://github.com/bert-w/mt_gender.git mt_gender_fork
# fast_align module required in the evaluations
!git clone https://github.com/clab/fast_align.git


gender_bias_dir = root_dir + '/mt_gender_fork';
# gender_bias_dir = root_dir + '/mt_gender';
fast_align_dir = root_dir + '/fast_align';

In [None]:
# List the contents of both projects to see if it worked.
# Prints the two directory paths first, then a long listing (including hidden files) of each.
print(gender_bias_dir, fast_align_dir)
!ls {gender_bias_dir} -al
!ls {fast_align_dir} -al

# Functions

In [None]:
# Load a csv file from the human annotations.
import pandas as pd
# Use the fully qualified option name: pandas resolves abbreviated names by
# pattern matching, which breaks if another option ever matches 'max_colwidth'.
pd.set_option('display.max_colwidth', 500)

def read_human_annotations(file):
  """Read a human-annotation csv file into a DataFrame.

  :param str file: Path of the csv file; it is appended as-is to
    `gender_bias_dir`, so it should start with a slash.
  """
  csv_path = gender_bias_dir + file
  return pd.read_csv(csv_path)

# df = read_human_annotations('/data/human_annotations/es.in - es.in.csv')
# display(df)

In [None]:
def read_aggregate(file):
  """Read a tab-separated aggregate file into a DataFrame.

  The first line is treated as data (no header row); the columns are named
  gender, s_index, sentence and subject.

  Available files:
  - en.txt (mixed?)
  - en_anti.txt (anti-stereotypical like "female mechanic")
  - en_pro.txt (pro-stereotypical like "male construction worker")

  :param str file: Path of the file; it is appended as-is to
    `gender_bias_dir`, so it should start with a slash.
  """
  column_names = ['gender', 's_index', 'sentence', 'subject']
  aggregate_path = gender_bias_dir + file
  return pd.read_csv(aggregate_path, sep='\t', names=column_names)

# df = read_aggregate('/data/aggregates/en.txt');
# display(df)

In [None]:
def read_translation(file):
  """Read a translation file whose sentence pairs are separated by ' ||| '.

  :param str file: Path of the file; it is appended as-is to
    `gender_bias_dir`, so it should start with a slash.
  :return: DataFrame with the columns 'en' and 'nl', one row per line.
  """
  # Raw string: '\ ' and '\|' are invalid escape sequences in a normal string
  # literal and trigger a warning on recent Python versions. The resulting
  # pattern is unchanged. A regex separator requires the python parsing engine.
  return pd.read_csv(gender_bias_dir + file, sep=r'\ \|\|\|\ ', names=['en', 'nl'], engine='python')

In [None]:
import os
import time

def install_fast_align():
  """ Install fast_align from a directory in your Google Drive.
  """

  dir = fast_align_dir;

  if os.path.isfile(dir + '/build/fast_align'):
    return dir;

  %cd {dir}
  !echo "Current dir: $(pwd)"

  # Install required libs.
  !apt-get install libgoogle-perftools-dev libsparsehash-dev

  # Build instructions from https://github.com/clab/fast_align
  %mkdir -p build
  %cd build
  !cmake -S ..
  !make

  return dir;

def install_python_dependencies():
  %cd {gender_bias_dir}
  !chmod 755 -R {gender_bias_dir}
  # Run their install script for all required Python modules.
  !./install.sh
  pass

def evaluate_language(source='en', destination='es', translator='google'):
  """ Run a bash evaluation script from the repo to determine gender bias in the source file compared to the destination.

  For example evaluating 'en to 'es' with translator 'google' looks like:
  1. It translates the input file first if it does not exist yet, which creates /translations/google/en-es.txt.
  2. It then tries to align the translated file using fast_align, which creates /src/forward/en-es.align. I believe the alignment process
  is about mapping words from one language to another.
  3. The alignments are evaluated, and the output is stored in /output/en-es.txt.

  :param str source: The input, which is a language file from the /data/aggregates folder, e.g. use "en" for the "en.txt" file.
  :param str destination: A language file from the /data/aggregates folder, e.g. use "en" for the "en.txt" file.
  :param str translator: One of "google", "bing", "aws", "sota", "systran".
  """

  # Call the install_fast_align function which returns the path immediately if it's installed correctly.
  fast_align_path = install_fast_align()
  %env FAST_ALIGN_BASE={fast_align_path}
  %cd {gender_bias_dir}
  %mkdir -p output

  # Run some unix commands so the .sh files can be executed.
  !chmod 755 -R {root_dir}
  # Move to src folder because paths are relative (and it's required by the repository).
  %cd {gender_bias_dir}/src
  !echo "Current dir: $(pwd)"

  timestamp = time.strftime("%Y%m%d-%H%M%S");

  # Define the source file to be a txt file in the aggregates folder.
  source_file = '/data/aggregates/' + source + '.txt';
  # Define the output file to be in the "output" directory in the root of the GenderBias project.
  output_file = '../output/' + translator + '-' + source + '-' + destination + '.' + timestamp + '.txt';

  # Execute script from the repo.
  !../scripts/evaluate_language.sh {gender_bias_dir}{source_file} {destination} {translator} > {output_file}

  print('Output file: ' + gender_bias_dir + '/src/' + output_file);
  pass

# Preparation
Generate the `/content/drive/MyDrive/genderbias_project/nl_en_translation_check.csv` file, which is used to prepare the Dutch ground truth.

In [None]:
import pandas as pd

def generate_translation_to_aggregate(source, aggregate):
  """Convert a translation into the column layout of an aggregate file.

  The 'nl' sentences of the translation take the place of the aggregate's
  sentences, while the gender, s_index and subject labels of the aggregate
  are re-used. Rows are matched on their index.

  :param source: A translation DataFrame (needs an 'nl' column), or a file
    path that is loaded with read_translation.
  :param aggregate: An aggregate DataFrame (needs the columns 'gender',
    's_index' and 'subject'), or a file path that is loaded with read_aggregate.
  :return: DataFrame with the columns gender, s_index, sentence, subject.
  """
  # isinstance (rather than an exact type comparison) also accepts DataFrame
  # subclasses instead of mistaking them for a file path.
  # Read source which determines the translation output.
  df_t = source if isinstance(source, pd.DataFrame) else read_translation(source)
  # Read original aggregate to re-use the existing labels.
  df_a = aggregate if isinstance(aggregate, pd.DataFrame) else read_aggregate(aggregate)

  # Build complete dataframe by joining the 2 above into the proper format.
  df = pd.concat([df_t['nl'], df_a[['gender', 's_index', 'subject']]], axis=1)
  return df[['gender', 's_index', 'nl', 'subject']].rename(columns={'nl': 'sentence'})

def prepare_translation_file():
  """Build a side-by-side table for checking the NL translations manually.

  The NL columns end up on the left (suffix '_nl') and the original EN
  columns on the right (suffix '_en').
  """
  english = read_aggregate('/data/aggregates/en.txt')
  dutch = generate_translation_to_aggregate(
    source='/translations/google/en-nl.txt',
    aggregate=english,
  )
  return dutch.join(english, lsuffix='_nl', rsuffix='_en')
  
# dfx = prepare_translation_file()
# dfx.to_csv(root_dir + '/nl_en_translation_check.csv')

# Installation & Evaluation

In [None]:
# Installation (only required once)
# Builds fast_align first (skipped when its binary already exists), then runs
# the install script of the GenderBias checkout.
install_fast_align()
install_python_dependencies()

## Setup API keys
If you are going to evaluate a language pair that has no translations yet (like `/translations/google/en-es.txt` etc), you need to add an API key for the given service. 

After a translation task, the output is automatically saved in the project folder, so it does not need to be run again (unless you remove it).

In [None]:
# Fill in the value after each `=` below before running this cell.

# Insert Google API key so we can use the translation API.
# NOTE(review): Google client libraries normally expect this variable to hold the
# path to a service-account JSON key file rather than the key itself — confirm.
%env GOOGLE_APPLICATION_CREDENTIALS=

# Set default region for AWS services (London in this case).
%env AWS_DEFAULT_REGION=eu-west-2
# Set AWS user (restricted to using AWS Translate only).
%env AWS_ACCESS_KEY_ID=
%env AWS_SECRET_ACCESS_KEY=

# Set Bing/Microsoft Translator credentials. Make sure that these credentials have a paid plan, because the free plan has a request limit.
%env BING_TRANSLATOR_TEXT_KEY=
%env BING_TRANSLATOR_REGION_KEY=westeurope

## Evaluation

In [None]:
# Each call redirects the script output to a new timestamped file in the
# "output" directory of the GenderBias project and prints that file's path.

# Evaluate en-es (requires original GenderBias project, see very first code block).
# evaluate_language(source='en', destination='es', translator='google')

# Evaluate nl-es (requires our GenderBias fork here since it requires changes to support NL).
evaluate_language(source='nl', destination='es', translator='google')
# evaluate_language(source='nl', destination='ar', translator='aws')
# evaluate_language(source='nl', destination='ar', translator='bing')
# NOTE(review): 'deepl' is not among the translators listed in the docstring of
# evaluate_language — confirm the fork supports it before enabling this line.
# evaluate_language(source='nl', destination='es', translator='deepl')
