# *Gender Bias in Virtual Assistants Project*

### Identificando Preconceitos de Gênero em Assistentes Virtuais

![From Google](https://media-assets-01.thedrum.com/cache/images/thedrum-prod/s3-news-tmp-145694-untitled_design_70--default--1280.png)

### Equipe formada por
- Bárbara Stéphanie Neves Oliveira, 507526
- Lucas Benjamim Cunha Bandeira, 507533
- Samir Braga Chaves, 513788
- Vinicius Bernardo Gabriel, 475210

# BERT *Classifier for MDGender Dataset* 

<center>
  <img width="600" src="https://drive.google.com/uc?id=11KlguWCUIDUzA8bN5dtzzab4tiY_2pz4"/>
</center>


---

### Classificação Multiclasse

<center>
  <img width="400" src="https://lena-voita.github.io/resources/lectures/text_clf/intro/example_document-min.png"/>
</center>

---

### *Transfer-Learning*

<center>
  <img width="700" src="https://lena-voita.github.io/resources/lectures/transfer/intro/idea-min.png"/>
</center>

# Warning

**Para fins de reprodutibilidade, a execução deste *script* deve ser feita no Google Colab, para uso da TPU.**

**Caso não queira usar a TPU, sugerimos comentar toda a configuração da TPU no módulo `model.py`.**

# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.metrics import AUC
from tensorflow_addons.metrics import F1Score

from sklearn.metrics import classification_report

# Root folder of the project ('...' is a placeholder to be replaced by the user).
BASE_PATH = '...'
# Folder holding the datasets, derived from BASE_PATH.
DATASETS_PATH = f"{BASE_PATH}/resources/datasets"

In [None]:
# Make the project's custom modules importable from BASE_PATH/modules.
import sys
import warnings

# Silence every warning from this point on (including those raised while
# importing the custom modules below).
warnings.filterwarnings('ignore')

sys.path.append(f"{BASE_PATH}/modules")

# Project-local modules; they must be imported after the path is extended.
import utils
import text
import model
from importlib import reload

# Leitura dos Dados

In [None]:
# Read the MDGender "new_data" CSV into a DataFrame and preview its first rows.
new_data_csv = f"{DATASETS_PATH}/MDGender/md_gender_bias/new_data/new_data.csv"
df = pd.read_csv(new_data_csv)

df.head()

In [None]:
# Class names; used below to size the classifier output (labels_size) and as
# target_names in the classification report.
labels = ["female", "male"]

# Treinamento do BERT *Classifier*

<center>
  <img width="700" src="https://lena-voita.github.io/resources/lectures/transfer/bert/intro-min.png"/>
</center>

## Representação Textual

<center>
  <img width="700" src="https://lena-voita.github.io/resources/lectures/transfer/bert/bert_input.gif"/>
</center>

In [None]:
# Convert the cleaned texts into two padded arrays (token ids and masks),
# using the maximum length configured in the `text` module.
# NOTE(review): presumably these are BERT input ids and attention masks — confirm in text.py.
clean_texts = df['clean_text'].to_numpy()
padded_tokens_ids, padded_masked_ids = text.padding_sequences(
    clean_texts,
    max_length=text.MAX_LENGTH,
)

In [None]:
# Display the shapes of the two arrays returned by text.padding_sequences.
padded_tokens_ids.shape, padded_masked_ids.shape

## Divisão Estratificada

In [None]:
# Row-index -> row lookup tables for the padded token ids and masks; they are
# used afterwards to recover the masks that belong to each split.
dict_padded_tokens_ids = dict(enumerate(padded_tokens_ids))
dict_padded_masked_ids = dict(enumerate(padded_masked_ids))

# Target matrix built from the two label columns of the dataset.
label_columns = ['label_pos_0', 'label_pos_1']
y = df[label_columns].to_numpy()

# Split token ids and targets into train / test / dev partitions.
X_train_ids, y_train, X_test_ids, y_test, X_dev_ids, y_dev = text.data_split(
    padded_tokens_ids,
    y,
    test_size=0.2,
    dev_size=0.1,
    random_seed=42,
)

# Display the (inputs, targets) shapes of every partition.
(X_train_ids.shape, y_train.shape), (X_test_ids.shape, y_test.shape), (X_dev_ids.shape, y_dev.shape)

In [None]:
def _masks_for_split(split_ids):
    """Return the mask rows matching the token-id rows of one split.

    The keys are obtained from utils.get_keys_by_values on the token-id lookup
    table and then used to index the mask lookup table.
    NOTE(review): presumably the keys come back in the same order as
    `split_ids` — confirm in utils.py.
    """
    matching_keys = utils.get_keys_by_values(dict_padded_tokens_ids, split_ids)
    return np.array([dict_padded_masked_ids[key] for key in matching_keys])


X_train_masks = _masks_for_split(X_train_ids)
X_test_masks = _masks_for_split(X_test_ids)
X_dev_masks = _masks_for_split(X_dev_ids)

# Display the shape of the mask array of every partition.
X_train_masks.shape, X_test_masks.shape, X_dev_masks.shape

In [None]:
from collections import Counter
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix


def _combination_counts(label_matrix):
    """Count the order-1 label combinations found in `label_matrix`."""
    combination_rows = get_combination_wise_output_matrix(label_matrix, order=1)
    return Counter(
        str(combination)
        for row in combination_rows
        for combination in row
    )


# One row per partition (after the transpose), one column per label
# combination; combinations missing from a partition are shown as 0.0.
partitions = {'data': y, 'train': y_train, 'test': y_test, 'dev': y_dev}
result_comb_matrix = pd.DataFrame(
    {name: _combination_counts(targets) for name, targets in partitions.items()}
).T.fillna(0.0)

result_comb_matrix.style.background_gradient(cmap=plt.get_cmap('Pastel2'))

In [None]:
# Persist the padded test partition (token ids, masks and targets) as JSON
# records so it can be reused outside this notebook.
#
# The frame is built in one go from a list of records instead of calling
# DataFrame.append inside a loop: append copied the whole frame on every
# iteration, was deprecated in pandas 1.4 and removed in pandas 2.0.
# The loop variable is named `target` so that the global label matrix `y`
# is no longer overwritten by the last test row.
test_records = [
    {'tokens_ids': ids, 'masked_ids': masks, 'y': target}
    for ids, masks, target in zip(X_test_ids, X_test_masks, y_test)
]
test_data = pd.DataFrame(test_records, columns=['tokens_ids', 'masked_ids', 'y'])

test_data.to_json(DATASETS_PATH + '/MDGender/md_gender_bias/new_data/padded_test_data.json', orient='records')

## Treino do Modelo

In [None]:
# Fix the NumPy and TensorFlow seeds before training.
np.random.seed(42)
tf.random.set_seed(42)

# Evaluation metric handed to the training helper.
auc_score = AUC(multi_label=True)
# Alternative metric, currently disabled:
# f1_score = F1Score(num_classes=y.shape[1], threshold=0.5, average='weighted')

# File passed to the training helper as `model_path`; the weights are loaded
# back from it at evaluation time.
# NOTE(review): this path uses '/Resources/Models' while DATASETS_PATH uses
# lowercase '/resources' — confirm the folder name on case-sensitive filesystems.
model_path = BASE_PATH + '/Resources/Models/best_bert_linear_classifier2_md_gender.h5'

train_inputs = [X_train_ids, X_train_masks]
dev_inputs = [X_dev_ids, X_dev_masks]

training_options = dict(
    batch_size=32,
    max_epochs=50,
    patience=20,
    eval_metric=auc_score,
    monitor='val_auc',
    source_length=text.MAX_LENGTH,
    hidden_units=128,
    labels_size=len(labels),
    model_path=model_path,
)

# Train model.bert_linear_classifier2 on the train partition, validating on dev.
model_meta = model.run_text_classifier(
    model.bert_linear_classifier2,
    train_inputs, y_train,
    dev_inputs, y_dev,
    **training_options,
)

In [None]:
# Display the 'run_time', 'start' and 'final' entries of the metadata returned
# by model.run_text_classifier.
model_meta['run_time'], model_meta['start'], model_meta['final']

In [None]:
# Plot the training history stored in model_meta, using 'auc' as the score name.
model.plot_model_loss_score(model_meta['history'], score_name='auc')

In [None]:
# Reload the weights saved at `model_path` into the trained model.
trained_model = model_meta['model']
trained_model.load_weights(model_path)

# Predict on the test partition and round each output to the nearest integer.
# NOTE(review): assumes the model outputs lie in [0, 1], so rounding yields 0/1 labels — confirm.
test_outputs = trained_model.predict([X_test_ids, X_test_masks])
y_pred = np.rint(test_outputs)

# Per-class precision / recall / F1 on the test partition.
report = classification_report(y_test, y_pred, target_names=labels)
print(report)