In [1]:
#If you want to use Google Colab GPU
import tensorflow as tf
import torch
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('GPU found at: ', device_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

print("Device connected: ", torch.cuda.get_device_name(0))


GPU found at:  /device:GPU:0
Device connected:  Tesla P100-PCIE-16GB


In [2]:
#Install transformers if you dont have it installed already
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/12/b5/ac41e3e95205ebf53439e4dd087c58e9fd371fd8e3724f2b9b4cdb8282e5/transformers-2.10.0-py3-none-any.whl (660kB)
[K     |████████████████████████████████| 665kB 454kB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 5.5MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 6.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     

In [3]:
#Import necessary libs
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import XLNetModel,XLNetConfig, XLNetTokenizer, XLNetForSequenceClassification
from transformers import AdamW
import random
import nltk
from gensim.models import Word2Vec,word2vec
from tqdm import tqdm, trange
import pandas as pd
import io
import json
from sklearn.metrics import confusion_matrix, precision_score,recall_score,f1_score
import numpy as np
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
% matplotlib inline

Using TensorFlow backend.


In [4]:
# Upload the train file from your local drive
#from google.colab import files
#uploaded = files.upload()
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


**Note that:** if you don't want to store the fine-tuned model before evualating in 
test set please copy the content below in train.ipynb and evaulate the model with out saving it

In [0]:
#Load model from directory  
model_file = "/path/to/model/"
config_file = "/path/to/config.json"
config = XLNetConfig.from_pretrained(config_file, num_labels=2, finetuning_task="finsen")
model = XLNetForSequenceClassification.from_pretrained(model_file,config = config)
model.to(device)

In [0]:
batch_size = 32  
MAX_LEN = 180

In [0]:
##---------------      TEST THE MODEL USING NEW UNSEEN DATASET         -------------

df = pd.read_csv("path/to/test.csv", delimiter = ";", header=None)
df = shuffle(df)

sentences = df[1].values
df[0] = (df[0]).astype(int)


sentences = [sentence + " [SEP] [CLS]" for sentence in sentences]
labels = df[0].values

tokenizer = XLNetTokenizer.from_pretrained('path/to/target/language/tokenizer.model')
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]


input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = []

for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

Calling XLNetTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [0]:
# Evaulate
model.eval()

predictions , true_labels = [], []
 
for batch in prediction_dataloader:

  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch

  with torch.no_grad():
    
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]

  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  predictions.append(logits)
  true_labels.append(label_ids)


In [0]:
# Get the predictions
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]

In [0]:
print("Precision score: ", precision_score(flat_true_labels, flat_predictions))
print("Recall score: ", recall_score(flat_true_labels, flat_predictions))
print("F1-Score score: ", f1_score(flat_true_labels, flat_predictions))

Precision score:  0.7856821251688428
Recall score:  0.8725
F1-Score score:  0.8268182895048567
