<a href="https://colab.research.google.com/github/aravind-sundaresan/huggingface-examples/blob/master/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install transformers

In [None]:
# Mount Google Drive into the Colab runtime; the dataset is read from it below.
from google.colab import drive

drive.mount(mountpoint='/drive')

In [None]:
from pathlib import Path

def read_imdb_split(split_dir, max_per_label=4001):
    """Load review texts and sentiment labels from one IMDB split directory.

    The directory must contain a ``pos`` and a ``neg`` sub-directory, each
    holding one review per file.

    Args:
        split_dir: Location of the split (``str`` or ``Path``).
        max_per_label: Maximum number of files to read from each
            sub-directory, or ``None`` to read all of them. The default of
            4001 reproduces the cap that used to be hard-coded
            (``count > 4000``).

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. ``labels`` holds 1
        for every review read from ``pos`` and 0 for every review read from
        ``neg``; all ``pos`` entries come before the ``neg`` entries.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        # Compare by value: `is` tests object identity, which only happens to
        # work for interned string literals and triggers a SyntaxWarning on
        # Python 3.8+.
        label = 0 if label_dir == "neg" else 1
        loaded = 0
        for text_file in (split_dir/label_dir).iterdir():
            if max_per_label is not None and loaded >= max_per_label:
                break
            # Explicit encoding so the result does not depend on the
            # platform's default locale.
            texts.append(text_file.read_text(encoding="utf-8"))
            labels.append(label)
            loaded += 1
        # One summary line per label instead of one output line per file.
        print(f"{label_dir}: loaded {loaded} files")

    return texts, labels

# Read the training split of the IMDB reviews from the mounted Drive folder.
imdb_train_dir = '/drive/My Drive/Deep Learning/Sentiment Analysis/train'
train_texts, train_labels = read_imdb_split(imdb_train_dir)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the loaded reviews for validation. A fixed random_state makes
# the split reproducible across kernel restarts, and stratifying on the labels
# keeps the pos/neg ratio the same in both subsets.
# NOTE: this rebinds train_texts / train_labels, so re-running this cell on its
# own splits the already-reduced training set again -- re-run the loading cell
# first.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels
)

In [None]:
# Load the fast DistilBERT tokenizer used to tokenize the documents

from transformers import DistilBertTokenizerFast

tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_checkpoint)

In [None]:
# Tokenize both splits with identical settings:
#   truncation=True cuts sequences to at most the model's maximum input length;
#   padding=True pads the sequences of a call to a common length.
encode_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)

In [None]:
# Wrap the tokenized inputs and their labels in tf.data.Dataset objects

import tensorflow as tf


def _as_tf_dataset(encodings, labels):
    """Pair tokenizer output with its labels as a sliced ``tf.data.Dataset``."""
    features = dict(encodings)
    return tf.data.Dataset.from_tensor_slices((features, labels))


train_dataset = _as_tf_dataset(train_encodings, train_labels)
val_dataset = _as_tf_dataset(val_encodings, val_labels)

In [None]:
# Configure fine-tuning with the Trainer class from the Transformers library

from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

# Collected in one mapping so every tunable value is visible at a glance.
trainer_settings = {
    'output_dir': '',
    'num_train_epochs': 1,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 64,
    'weight_decay': 0.01,
    'logging_dir': '',
    'logging_steps': 10,
}
training_arguments = TFTrainingArguments(**trainer_settings)

In [None]:
# Instantiate the model inside the strategy scope taken from the training
# arguments. num_labels is passed explicitly (binary pos/neg labels) so this
# cell matches the native-TensorFlow cell further down instead of relying on
# the checkpoint's default.
with training_arguments.strategy.scope():
  model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

trainer = TFTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

In [None]:
# Evaluate on the eval_dataset that was handed to the trainer and show the result.
eval_results = trainer.evaluate()
eval_results

In [None]:
# Fine-tuning the pretrained model using native TensorFlow (Keras compile/fit)

from transformers import TFDistilBertForSequenceClassification

# Note: this rebinds `model` to a freshly loaded checkpoint, replacing the one
# that was trained through TFTrainer above.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Only the training data is shuffled. Validation metrics do not depend on
# example order, so shuffling the validation set only costs buffer fill time.
train_batches = train_dataset.shuffle(1000).batch(16)
val_batches = val_dataset.batch(16)

model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
model.fit(train_batches, epochs=1, validation_data=val_batches)