# EXIST 2021 - BERT predicts <a class="anchor" id="bert-preds"></a>

    ÁLVARO FAUBEL SANCHIS
    CLARA MARTÍ TORREGROSA

#####  Table of contents :
- [Requiered functions](#functions)
 * [Required libraries and configuration](#libraries)
 * [Cleaning](#cleaning)
 * [BERT](#bert)
     
- [EXIST Task](#exist)
 * [Data load](#data-load)
 * [Predictions](#preds)
     * [Spanish](#sp)
         - [Task 1](#sp-t1)
         - [Task 2](#sp-t2)
     * [English](#en)
         - [Task 1](#en-t1)
         - [Task 2](#en-t2)
 * [Submission results](#submission)


---

### Required functions <a class="anchor" id="functions"></a>

#### Required libraries and configuration <a class="anchor" id="libraries"></a>

In [None]:
# Data & visualization
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
import matplotlib.pyplot as plt

# Pytoch 
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence

# BERT Hugging Face
import transformers
from transformers import BertModel, BertTokenizer, AutoTokenizer, AutoModel
from transformers import  AdamW, get_linear_schedule_with_warmup

# Sklearn: tecnics & methods 
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, classification_report, f1_score

# Configuración: seed & display
RANDOM_SEED = 45
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#### Cleaning (optional) <a class="anchor" id="cleaning"></a>

In [2]:
import preprocessor as p
import re

emoji_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"  
                            u"\U0001F300-\U0001F5FF"  
                            u"\U0001F680-\U0001F6FF"  
                            u"\U0001F1E0-\U0001F1FF"  
                            "]+", flags=re.UNICODE)

def clean_tweet(tweet, special_tokens = True, emoji_pattern = emoji_pattern):
    if special_tokens:
        text_clean = p.tokenize(tweet)
    else:
        text_clean = p.clean(tweet)
    text_clean_emoji = emoji_pattern.sub(r'', text_clean)
    return text_clean_emoji

#### BERT <a class="anchor" id="bert"></a>

* Class for structure the data

In [3]:
class TweetsDataset(Dataset):
    def __init__(self, tweets, tokenizer, max_len):
        self.tweets = tweets
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.tweets)
    
    def __getitem__(self, item):
        tweet = str(self.tweets[item])
        encoding = self.tokenizer.encode_plus(tweet,
                                              add_special_tokens=True,
                                              max_length=self.max_len,
                                              return_token_type_ids=False,
                                              pad_to_max_length=True,
                                              return_attention_mask=True,
                                              return_tensors='pt')
    
        return {'tweet_text': tweet,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten()}

* Creation of data loaders objects for the segmentation in packets (batch)

In [4]:
def create_data_loader(df, column, tokenizer, max_len, batch_size):
    ds = TweetsDataset(tweets=df[column].to_numpy(),
                       tokenizer=tokenizer,
                       max_len=max_len)
    return DataLoader(ds, batch_size=batch_size, num_workers=0)

* Custom classifier from pre-trained BERT model

In [5]:
class BERTClassifier(nn.Module):
    def __init__(self, n_classes, model_name):
        super(BERTClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name) 
        self.dropout = nn.Dropout(0.1)
        self.relu =  nn.ReLU()
        self.fc1 = nn.Linear(768,512)
        self.fc2 = nn.Linear(512,n_classes)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input_ids, attention_mask):
        _,cls_hs = self.bert(input_ids, attention_mask=attention_mask, return_dict=False)   
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

* Get the predictions

In [6]:
def get_predictions(model, data_loader):
    model = model.eval()
    
    tweets_texts = []
    predictions = []
    prediction_probs = []
  
    with torch.no_grad():
        for d in data_loader:
            
            texts = d["tweet_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            #probabilities = torch.nn.functional.softmax(outputs, dim=1)
            #preds = (probabilities[:,1] > 0.7).float()
            
            _, preds = torch.max(outputs, dim=1)
            
            tweets_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(outputs)
    
    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    return predictions, prediction_probs

---

# EXIST Task <a class="anchor" id="exist"></a>

## Data load <a class="anchor" id="data-load"></a>

#### Test data

In [7]:
df = pd.read_csv('../EXIST2021_dataset/test/EXIST2021_test.tsv', sep='\t')

df.head()

Unnamed: 0,test_case,id,source,language,text
0,EXIST2021,6978,gab,en,Pennsylvania State Rep horrifies with opening ...
1,EXIST2021,6979,twitter,en,"@iilovegrapes He sounds like as ass, and very ..."
2,EXIST2021,6980,twitter,en,"@averyangryskel1 @4ARealistParty LOL! ""This be..."
3,EXIST2021,6981,twitter,en,@WanderOrange @stalliontwink Rights?I mean yea...
4,EXIST2021,6982,twitter,en,the jack manifold appreciation i’m seeing is o...


* Split the dataset into Spanish and English

In [8]:
df_es = df[df['language'] == 'es']
df_en = df[df['language'] == 'en']

---

## Predicctions <a class="anchor" id="preds"></a>

### Spanish <a class="anchor" id="sp"></a>

#### Task1 <a class="anchor" id="sp-t1"></a>

* Cleaning

In [9]:
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.SMILEY, p.OPT.RESERVED) #, p.OPT.HASHTAG p.OPT.NUMBER)

df_es['text_clean'] = [clean_tweet(tw, special_tokens=True) for tw in df_es.text]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_es['text_clean'] = [clean_tweet(tw, special_tokens=True) for tw in df_es.text]


* Load the trained and tuned model and tokenizer, and set its parameters

In [10]:
MAX_LEN_ES1 = 155
BATCH_SIZE_ES1 = 16

PRE_TRAINED_MODEL_NAME_ES1 = 'dccuchile/bert-base-spanish-wwm-cased'
tokenizer_es1 = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME_ES1)

model_es1 = BERTClassifier(2, PRE_TRAINED_MODEL_NAME_ES1)
model_es1.load_state_dict(torch.load('../Models/task1_spanish.bin'))
model_es1 = model_es1.to(device)

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


* Create the input data-loader

In [11]:
data_loader_es1 = create_data_loader(df_es, 'text_clean', tokenizer_es1, MAX_LEN_ES1, BATCH_SIZE_ES1)

* We get the predictions

In [12]:
y_pred_es1, y_pred_probs_es1 = get_predictions(model_es1, data_loader_es1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [13]:
df_es['task1_encoding'] = y_pred_es1.numpy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_es['task1_encoding'] = y_pred_es1.numpy()


---

#### Task2 <a class="anchor" id="sp-t2"></a>

* Separate the examples classified as non-sexist

In [14]:
df_es2 = df_es.drop(df_es.loc[df_es['task1_encoding']==0].index)

* Cleaning

In [15]:
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.SMILEY, p.OPT.RESERVED) #, p.OPT.HASHTAG p.OPT.NUMBER)

df_es2['text_clean'] = [clean_tweet(tw, special_tokens=True) for tw in df_es2.text]

* Load the trained and tuned model and tokenizer, and set its parameters

In [16]:
MAX_LEN_ES2 = 120
BATCH_SIZE_ES2 = 16

PRE_TRAINED_MODEL_NAME_ES2 = 'dccuchile/bert-base-spanish-wwm-cased'
tokenizer_es2 = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME_ES2)

model_es2 = BERTClassifier(5, PRE_TRAINED_MODEL_NAME_ES2)
model_es2.load_state_dict(torch.load('../Models/task2_spanish.bin'))
model_es2 = model_es2.to(device)

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


* Create the input data-loader

In [17]:
data_loader_es2 = create_data_loader(df_es2, 'text_clean', tokenizer_es2, MAX_LEN_ES2, BATCH_SIZE_ES2)

* We get the predictions

In [18]:
y_pred_es2, y_pred_probs_es2 = get_predictions(model_es2, data_loader_es2)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [19]:
df_es2['task2_encoding'] = y_pred_es2.numpy()

---

### English <a class="anchor" id="en"></a>

#### Task1 <a class="anchor" id="en-t1"></a>

* Cleaning

In [20]:
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.SMILEY, p.OPT.RESERVED, p.OPT.HASHTAG, p.OPT.NUMBER)

df_en['text_clean'] = [clean_tweet(tw, special_tokens=True) for tw in df_en.text]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_en['text_clean'] = [clean_tweet(tw, special_tokens=True) for tw in df_en.text]


* Load the trained and tuned model and tokenizer, and set its parameters

In [21]:
MAX_LEN_EN1 = 170
BATCH_SIZE_EN1 = 16

PRE_TRAINED_MODEL_NAME_EN1 = 'cardiffnlp/twitter-roberta-base'
tokenizer_en1 = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME_EN1)

model_en1 = BERTClassifier(2, PRE_TRAINED_MODEL_NAME_EN1)
model_en1.load_state_dict(torch.load('../Models/task1_english.bin'))
model_en1 = model_en1.to(device)

* Create the input data-loader

In [22]:
data_loader_en1 = create_data_loader(df_en, 'text_clean', tokenizer_en1, MAX_LEN_EN1, BATCH_SIZE_EN1)

* We get the predictions

In [23]:
y_pred_en1, y_pred_probs_en1 = get_predictions(model_en1, data_loader_en1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [24]:
df_en['task1_encoding'] = y_pred_en1.numpy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_en['task1_encoding'] = y_pred_en1.numpy()


---

#### Task2 <a class="anchor" id="en-t2"></a>

* Separate the examples classified as non-sexist

In [25]:
df_en2 = df_en.drop(df_en.loc[df_en['task1_encoding']==0].index)

* Cleaning

In [26]:
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.SMILEY, p.OPT.RESERVED, p.OPT.HASHTAG, p.OPT.NUMBER)

df_en2['text_clean'] = [clean_tweet(tw, special_tokens=True) for tw in df_en2.text]

* Load the trained and tuned model and tokenizer, and set its parameters

In [27]:
MAX_LEN_EN2 = 170
BATCH_SIZE_EN2 = 16

PRE_TRAINED_MODEL_NAME_EN2 = 'cardiffnlp/twitter-roberta-base'
tokenizer_en2 = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME_EN2)

model_en2 = BERTClassifier(5, PRE_TRAINED_MODEL_NAME_EN2)
model_en2.load_state_dict(torch.load('../Models/task2_english.bin'))
model_en2 = model_en2.to(device)

* Create the input data-loader

In [28]:
data_loader_en2 = create_data_loader(df_en2, 'text_clean', tokenizer_en2, MAX_LEN_EN2, BATCH_SIZE_EN2)

* we get the predictions

In [29]:
y_pred_en2, y_pred_probs_en2 = get_predictions(model_en2, data_loader_en2)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [30]:
df_en2['task2_encoding'] = y_pred_en2.numpy()

---

## Submission results <a class="anchor" id="submission"></a>

* Concatenate the dataframes obtained

In [31]:
df_task1 = pd.concat([df_en, df_es])
df_task2 = pd.concat([df_en2, df_es2])
df_both = pd.merge(df_task1, df_task2[['id', 'task2_encoding']], left_on='id',  right_on='id', how='outer')

df_both['task2_encoding'] = df_both['task2_encoding'].fillna('non-sexist')
df_both['task1'] = df_both['task1_encoding'].replace({0: 'non-sexist',
                                               1: 'sexist'})

df_both['task2'] = df_both['task2_encoding'].replace({0: 'ideological-inequality',
                                                        1: 'stereotyping-dominance',
                                                        2: 'misogyny-non-sexual-violence',
                                                        3: 'sexual-violence', 
                                                        4: 'objectification'})
df_both.head()

Unnamed: 0,test_case,id,source,language,text,text_clean,task1_encoding,task2_encoding,task1,task2
0,EXIST2021,6978,gab,en,Pennsylvania State Rep horrifies with opening ...,Pennsylvania State Rep horrifies with opening ...,0,non-sexist,non-sexist,non-sexist
1,EXIST2021,6979,twitter,en,"@iilovegrapes He sounds like as ass, and very ...","$MENTION$ He sounds like as ass, and very cond...",0,non-sexist,non-sexist,non-sexist
2,EXIST2021,6980,twitter,en,"@averyangryskel1 @4ARealistParty LOL! ""This be...","$MENTION$ $MENTION$ LOL! ""This behavior of not...",1,0.0,sexist,ideological-inequality
3,EXIST2021,6981,twitter,en,@WanderOrange @stalliontwink Rights?I mean yea...,$MENTION$ $MENTION$ Rights?I mean yeah most wo...,1,0.0,sexist,ideological-inequality
4,EXIST2021,6982,twitter,en,the jack manifold appreciation i’m seeing is o...,the jack manifold appreciation i’m seeing is o...,0,non-sexist,non-sexist,non-sexist


* Format the results for submission

In [32]:
df_final1 = pd.DataFrame({'test_case': df_both['test_case'],
                        'id': df_both['id'].apply(lambda x: str(x).zfill(6)),
                        'task1': df_both['task1']})
df_final1.head()

Unnamed: 0,test_case,id,task1
0,EXIST2021,6978,non-sexist
1,EXIST2021,6979,non-sexist
2,EXIST2021,6980,sexist
3,EXIST2021,6981,sexist
4,EXIST2021,6982,non-sexist


In [33]:
df_final1.to_csv('../Submission/exist2021_Alclatos/task1_Alclatos_1.tsv', sep='\t', header=None, index=False)

In [34]:
df_final2 = pd.DataFrame({'test_case': df_both['test_case'],
                        'id': df_both['id'].apply(lambda x: str(x).zfill(6)),
                        'task2': df_both['task2']})
df_final2.head()

Unnamed: 0,test_case,id,task2
0,EXIST2021,6978,non-sexist
1,EXIST2021,6979,non-sexist
2,EXIST2021,6980,ideological-inequality
3,EXIST2021,6981,ideological-inequality
4,EXIST2021,6982,non-sexist


In [35]:
df_final2.to_csv('../Submission/exist2021_Alclatos/task2_Alclatos_1.tsv', sep='\t', header=None, index=False)

---