<a href="https://colab.research.google.com/github/aravind-sundaresan/huggingface-examples/blob/master/tweet_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install transformers

In [None]:
# Mount Google Drive inside the Colab runtime at /drive.
# NOTE(review): the CSV loaded later uses a relative path ("data/tweets.csv"),
# not a path under /drive — confirm whether this mount is actually required.
from google.colab import drive
drive.mount("/drive")

# Data handling and train/validation splitting.
import pandas as pd
from sklearn.model_selection import train_test_split
# TensorFlow plus the DistilBERT tokenizer and sequence-classification model.
import tensorflow as tf
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification

## Loading the input data
The input dataset consists of tweets pertaining to disasters.

In [None]:
# Read the tweet dataset from a CSV file into a DataFrame.
tweet_df = pd.read_csv("data/tweets.csv")

# Preview the first few rows.
tweet_df.head()

Distribution of the labels in the `target` column:

In [None]:
# Count how many rows carry each distinct value of the "target" column.
tweet_df["target"].value_counts()

## Generating Train and Validation sets

In [None]:
# Hold out 20% of the tweets for validation.
# random_state fixes the shuffle so the split is identical after a kernel
# restart; stratify keeps the label proportions the same in both subsets.
x_train, x_val, y_train, y_val = train_test_split(
    tweet_df["text"].values,
    tweet_df["target"].values,
    test_size=0.2,
    random_state=42,
    stratify=tweet_df["target"].values,
)

## Tokenizing the tweets

In [None]:
# Load the fast DistilBERT tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
# truncation=True cuts each sequence to at most the model's maximum input
# length; padding=True pads the sequences of a call to a common length.
# The training texts are encoded first, then the validation texts.
train_encodings, val_encodings = (
    tokenizer(list(texts), truncation=True, padding=True)
    for texts in (x_train, x_val)
)

## Creating a Dataset object using the encodings and labels

In [None]:
def _to_tf_dataset(encodings, labels):
    """Slice tokenizer encodings and their labels into a tf.data.Dataset of (features, label) pairs."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


train_dataset = _to_tf_dataset(train_encodings, y_train)
val_dataset = _to_tf_dataset(val_encodings, y_val)

## Fine-tuning the pretrained model using native TensorFlow
The classifier is built by fine-tuning a pretrained DistilBERT model.

In [None]:
# Load DistilBERT with a 2-class sequence-classification head.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Use an explicit Keras loss rather than `model.compute_loss`: on recent
# TensorFlow/transformers releases that attribute resolves to Keras' own
# `compute_loss`, which has a different signature. The model emits raw logits,
# hence from_logits=True.
# NOTE(review): assumes `target` holds integer class ids (0/1) — confirm.
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

In [None]:
# Training hyper-parameters, named so they are easy to find and tune.
BATCH_SIZE = 16
SHUFFLE_BUFFER = 1000
EPOCHS = 5

# Only the training stream is shuffled. Validation metrics are aggregated over
# the whole validation set, so shuffling it would only add overhead.
# The returned History object is kept so the per-epoch metrics can be inspected.
history = model.fit(
    train_dataset.shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE),
    epochs=EPOCHS,
    validation_data=val_dataset.batch(BATCH_SIZE),
)