In [None]:
!pip install --no-cache-dir transformers sentencepiece
!pip install wget

In [None]:
import tensorflow as tf
from transformers import AutoModel, AutoTokenizer
import numpy as np
import pandas as pd
import torch
import wget
import os

In [None]:
# Ask TensorFlow for the name of the GPU device it can see.
device_name = tf.test.gpu_device_name()

# Anything other than the first GPU's canonical name is treated as
# "no GPU": stop the notebook right here.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))

In [None]:
# Choose the PyTorch device that the rest of the notebook refers to as `device`.
if not torch.cuda.is_available():
    # No CUDA device visible to PyTorch: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is usable: select it and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Load the tokenizer for the AlBERTo checkpoint: lower-casing enabled,
# fast tokenizer implementation disabled.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0",
    do_lower_case=True,
    use_fast=False,
)

# Load the model weights from the same checkpoint.
model = AutoModel.from_pretrained(
    "m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"
)

In [None]:
# Sample tweet used to eyeball the tokenizer's output. It is written once and
# encoded once, so the ID tensor and the token list below are guaranteed to
# describe exactly the same text.
sample_text = "@user @user @user @user @user @user @user @user @user @user @user @user @user @user @user @user @user @user @user @user @user @user Infatti io per questo ho votato lega 💚 sentii le sue promesse e priorità contro i clandestini e l' Islam 💚 "
sample_ids = tokenizer.encode(sample_text)

# Token IDs as a tensor with an extra leading dimension added by unsqueeze(0).
input_ids = torch.tensor(sample_ids).unsqueeze(0)
# The same IDs mapped back to their token strings.
token_list = tokenizer.convert_ids_to_tokens(sample_ids)

In [None]:
# Show the sample's token-ID tensor, then its token strings, one per line.
print(input_ids, token_list, sep='\n')

In [None]:
# Location of the HaSpeeDe 2 development set (tasks A and B), a TSV file.
url = 'https://raw.githubusercontent.com/alessandrocuda/SaRaH/main/dataset/haspeede2_dev/haspeede2_dev_taskAB.tsv'

# Read the file with no header row, splitting on the pattern r'\t' with the
# Python parser engine, then name the four columns explicitly.
df = pd.read_csv(url, sep=r'\t', header=None, engine='python')
df.columns = ['id', 'sentences', 'hs', 'stereotype']

# Remove the row labelled 0 (presumably the file's own header line, which
# header=None reads in as data -- verify against the file).
df = df.drop(index=0)

# Report how many sentences remain.
print('Number of training sentences: {:,}\n'.format(len(df)))

# Display the dataframe.
df

In [None]:
# Raw text of every row.
sentences = df['sentences'].values
# The `hs` column converted to a float NumPy array, in the same row order.
labels = np.array(df['hs'].values, dtype=float)

In [None]:
# Spot-check a single sentence by position (the index 6215 is hard-coded).
print(sentences[6215])

In [None]:
# Length statistics over `sentences`:
#   * the sentence with the most tokens (and that token count),
#   * the sentence with the most characters (and that character count),
#   * `ids`: the positions of all sentences longer than 280 characters.
# When several sentences tie for a maximum, the last one encountered is kept.
max_len_token_sent = 0
max_token_sent = 0
max_len_str_sent = 0
max_str_sent = 0
ids = []

# enumerate() supplies each sentence's position; a manual counter named `id`
# is avoided because it would shadow Python's built-in id().
for sent_idx, sent in enumerate(sentences):
    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Longest sentence measured in tokens.
    if len(input_ids) >= max_len_token_sent:
        max_len_token_sent = len(input_ids)
        max_token_sent = sent

    # Longest sentence measured in characters.
    if len(sent) >= max_len_str_sent:
        max_len_str_sent = len(sent)
        max_str_sent = sent

    # Remember over-long sentences (280 is presumably Twitter's character
    # limit -- TODO confirm).
    if len(sent) > 280:
        ids.append(sent_idx)

print('Max Token sentence length: ', max_len_token_sent)
print(max_token_sent)
print('Max string sentence length: ', max_len_str_sent)
print(max_str_sent)
print(len(ids))

In [None]:
# Remove the entries at the positions listed in `ids` from both arrays, so
# sentences and labels stay aligned.
# NOTE: both names are rebound here, so running this cell a second time
# deletes at the same positions again.
sentences, labels = np.delete(sentences, ids), np.delete(labels, ids)

In [None]:
# Find the largest token count among the current sentences, and the sentence
# that produces it (on ties, the last such sentence is kept).
max_len, max_sent = 0, 0

for sent in sentences:
    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # A sentence at least as long as the current record becomes the new record.
    if len(input_ids) >= max_len:
        max_len, max_sent = len(input_ids), sent

print('Max sentence length: ', max_len)
print(max_sent)

In [None]:
# Encode every sentence to fixed-length token IDs plus an attention mask,
# then stack the per-sentence results into tensors.
input_ids = []
attention_masks = []

for sent in sentences:
    # Calling the tokenizer will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`.
    #   (6) Create attention masks for [PAD] tokens.
    # Padding and truncation are requested explicitly through `padding` and
    # `truncation`; the older `pad_to_max_length` flag is deprecated.
    encoded_dict = tokenizer(
        sent,                          # Sentence to encode.
        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
        max_length=90,                 # Target length for every sentence.
        padding='max_length',          # Pad shorter sentences up to max_length.
        truncation=True,               # Cut longer sentences down to max_length.
        return_attention_mask=True,    # Construct attention masks.
        return_tensors='pt',           # Return PyTorch tensors.
    )

    # Collect the encoded sentence...
    input_ids.append(encoded_dict['input_ids'])

    # ...and its attention mask (differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors by concatenating along dimension 0.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

In [None]:
from torch.utils.data import TensorDataset, random_split

# Bundle token IDs, attention masks and labels into a single dataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 90-10 train-validation split: 90% of the samples (rounded down) go to
# training, everything left over goes to validation.
train_size = int(len(dataset) * 0.9)
val_size = len(dataset) - train_size

# Assign samples to the two subsets at random.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(
    '{:>5,} training samples'.format(train_size),
    '{:>5,} validation samples'.format(val_size),
    sep='\n',
)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Batch size shared by both loaders. For fine-tuning BERT on a specific task,
# the authors recommend a batch size of 16 or 32.
batch_size = 32

# Training loader: samples are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset),
)

# Validation loader: order doesn't matter here, so samples are read
# sequentially.
validation_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(val_dataset),
)

In [None]:
# Run the encoder once over the whole encoded dataset, without gradients.
#
# The model and its inputs must be on the same device. `model.to(device)`
# moves the module's parameters in place, whereas `Tensor.to(device)` returns
# a new tensor and leaves the original untouched -- so its result has to be
# used directly; calling it as a bare statement has no effect.
model.to(device)

with torch.no_grad():
    # NOTE(review): every sentence goes through in a single forward pass; if
    # this runs out of memory, feed the inputs in smaller batches instead.
    last_hidden_states = model(
        input_ids.to(device),
        attention_mask=attention_masks.to(device),
    )