In [1]:
# Check that a GPU runtime is attached (e.g. on Google Colab) and select the torch device
import tensorflow as tf
import torch

# TensorFlow reports the first GPU it can see under this exact device name
device_name = tf.test.gpu_device_name()
if not device_name == '/device:GPU:0':
  raise SystemError('GPU device not found')
print('GPU found at: ', device_name)

# torch-side view of the same hardware; `device` is what later cells move tensors to
n_gpu = torch.cuda.device_count()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print("Device connected: ", torch.cuda.get_device_name(0))


GPU found at:  /device:GPU:0
Device connected:  Tesla P4


In [2]:
# Install transformers if you don't have it installed already
!pip install transformers



In [3]:
#Import necessary libs
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
#import sentencepiece as spm
from transformers import XLNetModel,XLNetConfig, XLNetTokenizer, XLNetForSequenceClassification
from transformers import AdamW
import random
import nltk
from gensim.models import Word2Vec,word2vec
from tqdm import tqdm, trange
import pandas as pd
import io
import json
from sklearn.metrics import confusion_matrix, precision_score,recall_score,f1_score
import numpy as np
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
% matplotlib inline

Using TensorFlow backend.


In [4]:
# Mount Google Drive so that a dataset stored there can be read from this notebook
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,  # remount even when the drive is already mounted
)

Mounted at /content/drive


In [5]:
# Read the ';'-separated training file into a pandas DataFrame.
# header=None: the file has no header row, so columns get integer names
# (later cells read the label from column 0 and the text from column 1).
df = pd.read_csv('path/to/train.csv', sep=";", header=None)
df.head()

Unnamed: 0,0,1
0,0,ኣቲ ከልቢ በጃኪ አባ ምሓርና ኣይትሓፍርን ይዀነለይ ከ
1,1,ኣኤባሳኣ ጀኤገናኣ ታኣውዓተ ሕያተኤ ጋሌኤ ኤርትራኣውተ ተዝርበይው ዝላከኤ...
2,1,እንቋዕ ብዳሓን መጻኩም እንዳ ጅሮም ቖቃሕ ንኩሉኩም ተዋሳእቲ ኣብዚኣ ደቂ...
3,1,ዋውውውው ፋሚሊ ጆን ጽቡቕ ስራሕ
4,0,ይሁዳውያን የዕርኩቱ ሸጣዉያን ስርዓቱ ጋኒናት መንፈሶም


In [0]:
# Cast the label column (column 0) to int
df[0] = (df[0]).astype(int)
# Shuffle the rows with a fixed seed (the same value the train/validation split uses)
# so that Restart & Run All reproduces the same row order, and therefore the same split
df = shuffle(df, random_state=2018)
# Column 1 holds the raw comments, column 0 their integer class labels
sentences = df[1].values
labels = df[0].values

In [7]:
# Append the separator / classification markers to the end of every comment.
# NOTE(review): "[SEP]" and "[CLS]" are added as plain text here; confirm that the
# tokenizer's vocabulary really treats these strings as its special tokens.
# Re-running this cell appends the markers a second time, because `sentences` is rebuilt from itself.
sentences = [text + " [SEP] [CLS]" for text in sentences]

# Load the Tigrinya SentencePiece tokenizer
tokenizer = XLNetTokenizer.from_pretrained('path/to/target-Tigrinya/language/tokenizer.model')
# Split every comment into sub-word pieces with that tokenizer
tokenized_texts = list(map(tokenizer.tokenize, sentences))
print("Tokenize the first sentence of Tigrinya comment:")
print(tokenized_texts[0])

Calling XLNetTokenizer.from_pretrained() with the path to a single file or url is deprecated


Tokenize the first sentence of Tigrinya comment:
['▁ኣነ', '▁ወዲ', '▁ስዉእ', '▁እየ', '▁ሓደ', '▁ጻ', 'ን', 'ታን', '▁በዓል', '▁ነርኪ', '▁ብጾት', '▁ኣቦይ', '▁ክብለኩም', '▁ብፍጹም', '▁ኣይ', 'ደፍር', 'ን', '▁ምስ', '▁ደቀይ', 'ን', '▁ሰበይተይ', '▁ኣብ', '▁ስደት', '▁ኣታ', '▁ኮመን', 'ተይ', '▁ተንብብ', '▁ዘለካ', '▁ሃገራዊ', '▁ዓቕምና', '▁ፈጺምና', '▁ኢና', '▁ኣነ', 'ን', '▁ሰበይተይ', 'ን', '▁ደቀይ', '▁ግን', '▁ኣበ', 'ደን', '▁ኣውጺአ', 'ዮም', '▁ኣለኩ', '▁ከምቶም', '▁ሓለፍትን', '▁ደገ', '▁ዘለኩም', 'ን', '▁ደቅ', 'ኩም', '▁ሓቢ', 'እኩም', '▁ዘለኩም', '▁ሃገር', '▁ን', 'በይ', 'ን', 'ና', '▁ኣይት', 'ምልከተ', 'ናን', '▁እያ', '▁ኣይት', 'ዕ', 'ምጽ', 'ን', '▁ህግደፍ', '▁ውጉኣት', '▁ኣቦታትና', '▁ኮንኩም', '▁ኣይትፈልጡን', '▁ትዕግስቲ', '▁ዶ', '▁ኣቢልኪ', '▁ሳላ', 'ሳ', '▁ዓመት', '▁ብሰብ', 'ዓ', '▁ኢልኩም', '▁ስልጣን', '▁ሒዝኩም', '▁ሰላሳ', '▁ዓመት', '▁ገዚእ', 'ኩምና', '▁ይኣክለኩም', '▁ይኣክል', '▁ተወሳኪ', '▁እየ', '▁ዘረባ', '▁ኣይን', 'ፈቱ', 'ን', '▁ድያ', '▁ዝበለት', '▁እወ', '▁ሓቅ', 'ኪ', '▁ሓቅነት', '▁ዘይብልኩም', '▁ከመይ', '▁ክትዛረቡ', '▁እስኩም', '▁ትፈትው', 'ዎ', '▁ምእሳር', 'ን', '▁ምቅታል', 'ን', '▁ካብ', '▁ዓዲ', '▁ንዝ', 'ቃወመ', 'ኩም', '▁አ', 'ሲ', 'ርካ', '▁ክሰርሕ', '▁ክነ', 'ግድ', '▁ዝደለየ', '▁ቦታ', '▁ምዕጻው', '▁ባዕልኩም', '▁ነጋዶ', '▁ባዕል

In [0]:
# Maximum number of tokens kept per comment (a larger value keeps more of long
# comments, at the price of more memory)
MAX_LEN = 180
# Map each list of sub-word pieces to the matching list of integer vocabulary ids
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_texts))

In [0]:
# Bring every id sequence to exactly MAX_LEN: longer ones are cut at the end,
# shorter ones are padded at the end
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Attention mask: 1.0 wherever the id is greater than 0, 0.0 elsewhere
# NOTE(review): a real token whose id is 0 is masked out as well; confirm that is intended
attention_masks = [[float(token_id > 0) for token_id in row] for row in input_ids]

In [0]:
# Hold out 20% of the examples for validation.
# Ids, labels and masks go through a single call, so all three are cut by the same seeded split.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(input_ids, labels, attention_masks,
                                                   random_state=2018, test_size=0.2)



In [0]:
# Turn every split into a torch tensor, pairing the train and validation versions

train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)


In [0]:
# Fine-tuning hyperparameters:
#   LEARNING_RATE - step size handed to the optimizer
#   batch_size    - examples per batch in the dataloaders
#   epochs        - number of passes over the training split
LEARNING_RATE, batch_size, epochs = 2e-5, 32, 3

In [0]:
# Bundle (ids, mask, label) triples into datasets and batch them

train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order, validation batches in their stored order
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)

In [14]:
# Load the pre-trained English XLNet checkpoint with a 2-label classification head
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)
# Move the model to the same `device` that the batches and the replacement
# embeddings are sent to (instead of unconditionally calling .cuda())
model.to(device)


XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e

In [0]:
# Split the model parameters into two groups: weights that receive weight decay,
# and biases / layer-norm weights that do not.
# 'gamma' and 'beta' are kept for older checkpoints; 'layer_norm.weight' matches the
# LayerNorm modules of this model (they are named `layer_norm`).
no_decay = ['bias', 'gamma', 'beta', 'layer_norm.weight']
param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [
    # The per-group key must be 'weight_decay': that is the option AdamW reads.
    # Any other key (e.g. 'weight_decay_rate') is silently ignored, leaving decay at its default.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# Creates an optimizer with a learning rate of LEARNING_RATE
optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)

In [0]:
# Calculate F1-Score
def get_f1score(preds, labels):
    """Return the F1-score of the predicted classes against the true labels.

    Args:
        preds: 2-D array-like of per-class scores (one row per example); the
            predicted class of a row is the index of its largest score.
        labels: array-like of true integer class labels, one per example.

    Returns:
        The value computed by ``f1_score(y_true, y_pred)``.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    # np.asarray lets plain lists be passed as well as numpy arrays
    labels_flat = np.asarray(labels).flatten()
    # f1_score expects the ground truth first and the predictions second
    return f1_score(labels_flat, pred_flat)


In [0]:
# Load the Word2Vec token embeddings trained for the target language (the txt file can be
# created with Word2Vec_token_embeddings_for_xlnet.ipynb) and install them as the
# input embedding layer of the XLNet model
i_embeddings = np.loadtxt('path/to/learned/initial_embeddings.txt', dtype=float)
# Fail early with a clear message if the matrix does not fit the model's hidden size
if i_embeddings.ndim != 2 or i_embeddings.shape[1] != model.config.d_model:
    raise ValueError(
        "Expected an embedding matrix of shape (vocab_size, %d), got %s"
        % (model.config.d_model, i_embeddings.shape,))
i_embeddings = torch.nn.Embedding.from_pretrained(torch.FloatTensor(i_embeddings)).to(device)
model.set_input_embeddings(i_embeddings)
# Keep the config in step with the new embedding matrix, so that a checkpoint written by
# save_pretrained() records the vocabulary size of the weights it actually contains
model.config.vocab_size = i_embeddings.num_embeddings

In [0]:
# Keep the word-embedding weights fixed during fine-tuning; every other parameter stays trainable
for name, param in model.named_parameters():
    param.requires_grad = not name.startswith('transformer.word_embedding')

In [0]:
# Main training loop: fine-tune for `epochs` passes over the training set and report
# the F1-score on the whole validation set after every epoch.
#   total_train_loss - loss of every training step
#   train_loss_set   - loss of every training step, plus the running sum / 1000 that is
#                      appended every 1000 steps
#   NOTE(review): train_loss_set mixes per-step values with those 1000-step averages;
#   this is kept unchanged in case later cells rely on it.
train_loss_set, total_train_loss = [], []

for _ in trange(epochs, desc="Epoch"):

    # ---- Train ----
    model.train()

    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    avg_loss = 0

    for step, batch in enumerate(train_dataloader):

        # Batch tensors use b_* names so the notebook-level `input_ids` array is not overwritten
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        optimizer.zero_grad()

        # With labels supplied, the first output is the loss
        outputs = model(input_ids=b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        # Read the scalar once and reuse it below
        step_loss = loss.item()
        avg_loss += step_loss
        total_train_loss.append(step_loss)

        if step % 1000 == 0 and step != 0:
            train_loss_set.append(avg_loss/1000)
            avg_loss = 0

        train_loss_set.append(step_loss)
        loss.backward()

        optimizer.step()

        tr_loss += step_loss
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    if nb_tr_steps:
        print("\n Train loss: ", tr_loss/nb_tr_steps)

    # ---- Validate ----
    model.eval()

    eval_logits, eval_labels = [], []
    nb_eval_steps = 0

    for batch in validation_dataloader:

        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            # Without labels, the first output holds the logits
            output = model(input_ids=b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)

        eval_logits.append(output[0].detach().cpu().numpy())
        eval_labels.append(b_labels.to('cpu').numpy())
        nb_eval_steps += 1

    # Score all validation predictions at once: the mean of per-batch F1 values is not
    # the F1 of the validation set. The name `eval_accuracy` is kept for later cells.
    if nb_eval_steps:
        eval_accuracy = get_f1score(np.concatenate(eval_logits), np.concatenate(eval_labels))
    else:
        # Empty validation loader: nothing to score
        eval_accuracy = float('nan')

    print("\n Validation F1-score: ", eval_accuracy)


**Note:** if you don't want to store the fine-tuned model before evaluating it on the test set, copy the content of test.ipynb into this notebook and evaluate the model without saving it.

In [0]:
# Write the fine-tuned model to the directory below
model_save = '/path/'
model.save_pretrained(save_directory=model_save)