In [1]:
import os

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from transformers import BertTokenizer,AutoTokenizer,TFAutoModelForSequenceClassification,TFBertForSequenceClassification
from transformers import pipeline

In [2]:
import tensorflow

### Sample encoding of text (uses the `tokenizer` created in cell In [4] further down — run that cell first)

In [16]:
# Encode a sentence into vocabulary ids; the output starts with 101 and ends with 102,
# i.e. encode() wraps the sentence in the [CLS] / [SEP] special tokens.
sample_text = "Part of essential essense of life is to stay alive"
encoded_text = tokenizer.encode(sample_text)
encoded_text

[101, 2112, 1997, 6827, 29032, 3366, 1997, 2166, 2003, 2000, 2994, 4142, 102]

In [17]:
# Same experiment with a shorter sentence; the ids shared with the previous sentence
# (e.g. 29032, 3366 for the two pieces of "essense") are unchanged.
sample_text = "The essense of life is to stay alive"
encoded_text = tokenizer.encode(sample_text)
encoded_text

[101, 1996, 29032, 3366, 1997, 2166, 2003, 2000, 2994, 4142, 102]

In [18]:
# Show which concrete tokenizer class AutoTokenizer resolved to.
type(tokenizer)

transformers.models.bert.tokenization_bert_fast.BertTokenizerFast

In [19]:
# Map the ids back to vocabulary tokens to see the [CLS]/[SEP] markers and the
# '##' word-piece continuation ("essense" -> 'essen', '##se').
tokenizer.convert_ids_to_tokens(encoded_text)

['[CLS]',
 'the',
 'essen',
 '##se',
 'of',
 'life',
 'is',
 'to',
 'stay',
 'alive',
 '[SEP]']

In [20]:
# Calling the tokenizer directly returns input_ids, token_type_ids and attention_mask together.
tokenizer(sample_text)

{'input_ids': [101, 1996, 29032, 3366, 1997, 2166, 2003, 2000, 2994, 4142, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

### Creating a model using BERT

In [23]:
#### The model was downloaded using git as follows
#### git lfs install
#### git clone https://huggingface.co/bert-base-uncased

In [3]:
# Prefer the local git-lfs clone of bert-base-uncased. When that directory does not
# exist on this machine, fall back to the Hugging Face Hub model id so the notebook
# still runs instead of failing on a machine-specific absolute path.
LOCAL_CHECKPOINT='E:\\python_projects\\transformers\\bert-base-uncased'
checkpoint=LOCAL_CHECKPOINT if os.path.isdir(LOCAL_CHECKPOINT) else 'bert-base-uncased'

In [4]:
# Load the tokenizer stored with the checkpoint; AutoTokenizer picks the concrete tokenizer class.
tokenizer=AutoTokenizer.from_pretrained(checkpoint)

In [5]:
# Display the tokenizer repr (vocabulary size, special tokens, padding/truncation sides).
tokenizer

PreTrainedTokenizerFast(name_or_path='E:\python_projects\transformers\bert-base-uncased', vocab_size=30522, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [6]:
# Load BERT with a sequence-classification head. Per the warning printed below, the
# 'classifier' layer is newly initialised, so its logits are not meaningful until fine-tuned.
model=TFBertForSequenceClassification.from_pretrained(checkpoint)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at E:\python_projects\transformers\bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Sentence used for the forward-pass examples in the following cells.
input_text='This is a great way to learn the importance of giving'

In [8]:
# Step 1 of the manual route: split the sentence into tokens (converted to ids in the next cell).
tokens=tokenizer.tokenize(input_text)

In [9]:
# Step 2 of the manual route: look up the vocabulary id of every token.
# Unlike tokenizer(input_text), this route adds no [CLS] (101) / [SEP] (102) ids,
# so the sequence is two ids shorter than the tokenizer's own input_ids.
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids

[2023, 2003, 1037, 2307, 2126, 2000, 4553, 1996, 5197, 1997, 3228]

In [10]:
# Wrap the id list in an outer list so the tensor gets a leading batch axis:
# the result has shape (1, sequence_length).
ids = tf.convert_to_tensor([input_ids])
ids

<tf.Tensor: shape=(1, 11), dtype=int32, numpy=array([[2023, 2003, 1037, 2307, 2126, 2000, 4553, 1996, 5197, 1997, 3228]])>

In [11]:
# Forward pass on the bare ids; no attention_mask or token_type_ids are supplied here.
output=model(ids)

In [12]:
# Classification logits for the single input sequence (one row, two class columns).
output.logits

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.0118375 , 0.18537356]], dtype=float32)>

In [13]:
# With return_tensors='tf' the tokenizer returns a dict-like object of TensorFlow tensors:
# input_ids, token_type_ids and attention_mask, already batched and with the
# [CLS] (101) / [SEP] (102) ids added around the sentence.
input_tokens = tokenizer(input_text, return_tensors='tf')
input_tokens

{'input_ids': <tf.Tensor: shape=(1, 13), dtype=int32, numpy=
array([[ 101, 2023, 2003, 1037, 2307, 2126, 2000, 4553, 1996, 5197, 1997,
        3228,  102]])>, 'token_type_ids': <tf.Tensor: shape=(1, 13), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(1, 13), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])>}

In [14]:
# Unpack the tokenizer output so that input_ids, token_type_ids and attention_mask
# are passed to the model as keyword arguments.
model(**input_tokens).logits

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.28042442, -0.41061094]], dtype=float32)>

In [15]:
# Alternative: pass the tokenizer fields explicitly. token_type_ids is left out here,
# and the logits shown below match the **kwargs call above.
model(
    input_tokens['input_ids'],
    attention_mask=input_tokens['attention_mask'],
).logits

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.28042442, -0.41061094]], dtype=float32)>

## Using DistilBERT

In [255]:
from transformers import DistilBertTokenizerFast,TFDistilBertForSequenceClassification

In [256]:
# Load the fast DistilBERT tokenizer for the 'distilbert-base-uncased' checkpoint.
distil_tokenizer=DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [257]:
# Presumably the maximum tokenized sequence length (in tokens) for DistilBERT inputs — TODO confirm intended use.
MAX_INPUT_LENGTH=512

In [268]:
def tokenize_text(
    text,
    truncation=True,
    padding=True,
    return_tensors='tf',
    is_split_into_words=True,
):
    """Tokenize ``text`` with the module-level ``distil_tokenizer``.

    Every keyword argument is forwarded unchanged to the tokenizer call.

    Args:
        text: Input handed to the tokenizer as its first positional argument.
        truncation: Forwarded as ``truncation``.
        padding: Forwarded as ``padding``.
        return_tensors: Forwarded as ``return_tensors`` (default ``'tf'``).
        is_split_into_words: Forwarded as ``is_split_into_words``. The default is
            ``True``; callers that pass raw sentences set it to ``False`` explicitly.

    Returns:
        Whatever ``distil_tokenizer`` returns for these arguments.
    """
    tokenizer_options = {
        'truncation': truncation,
        'padding': padding,
        'return_tensors': return_tensors,
        'is_split_into_words': is_split_into_words,
    }
    return distil_tokenizer(text, **tokenizer_options)


In [269]:
# Sentence for the DistilBERT tokenization check; this rebinds sample_text from the BERT section.
sample_text='to err is humane ,to forgive divine'

In [294]:
# Encode one raw sentence, hence is_split_into_words=False.
# NOTE(review): 'enoded_text' looks like a typo for 'encoded_text'; the name is kept
# because the following cells read it.
enoded_text=tokenize_text(sample_text,is_split_into_words=False)

In [295]:
# Inspect the token ids produced for the sentence.
enoded_text['input_ids']

<tf.Tensor: shape=(1, 11), dtype=int32, numpy=
array([[  101,  2000,  9413,  2099,  2003, 23369,  1010,  2000,  9641,
         7746,   102]])>

In [296]:
# Inspect the attention mask; it is all ones in the output below, so no position is padded.
enoded_text['attention_mask']

<tf.Tensor: shape=(1, 11), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])>

### Read from input file

In [193]:
# Column names are supplied via names=; only 'sentiment' and 'tweet' are actually loaded.
# NOTE(review): input_file_sentiment is not defined in any visible cell — it must be
# set to the CSV path before this cell runs.
sentiment_colls = ['sentiment', 'tweet_id', 'tweet_date', 'query', 'username', 'tweet']
tweet_df = pd.read_csv(
    input_file_sentiment,
    names=sentiment_colls,
    usecols=['sentiment', 'tweet'],
    encoding='cp1252',
)

In [194]:
# Display the loaded frame (the repr below is truncated to the first and last rows).
tweet_df

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [288]:
# Tokenize a batch of raw tweets. Each list element is a complete sentence, not a
# pre-split word, so is_split_into_words must be False. With the helper's default of
# True the tokenizer treats the whole list as ONE sequence of words, which would
# merge several tweets into a single row as soon as the slice grows beyond one.
df_tokens=tokenize_text(tweet_df['tweet'][:1].tolist(),is_split_into_words=False)

In [289]:
# Check the container type returned by the tokenizer helper.
type(df_tokens)

transformers.tokenization_utils_base.BatchEncoding

In [290]:
# Show the encoded batch: input_ids and attention_mask tensors.
df_tokens


{'input_ids': <tf.Tensor: shape=(1, 48), dtype=int32, numpy=
array([[  101,  1030,  6942, 13064,  8299,  1024,  1013,  1013,  1056,
         9148, 25856,  2594,  1012,  4012,  1013,  1016,  2100,  2487,
         2480,  2140,  1011, 22091,  2860,  2860,  1010,  2008,  1005,
         1055,  1037, 26352,  5017,  1012,  2017,  2323,  2050,  2288,
         2585, 12385,  1997,  2353,  2154,  2000,  2079,  2009,  1012,
         1025,  1040,   102]])>, 'attention_mask': <tf.Tensor: shape=(1, 48), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1]])>}

In [293]:
# Sanity check on a plain string; is_split_into_words=False because the input is one raw sentence.
tokenize_text("this is a test",is_split_into_words=False)

{'input_ids': <tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[ 101, 2023, 2003, 1037, 3231,  102]])>, 'attention_mask': <tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1]])>}

In [132]:
# Load DistilBERT with a sequence-classification head. This rebinds `model`, which
# held the BERT classifier in the earlier section. Per the warning printed below,
# the classification layers are newly initialised and need training.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased'
)

HBox(children=(IntProgress(value=0, description='Downloading tf_model.h5', max=363423424, style=ProgressStyle(…




Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [133]:
# Print the layer / parameter summary of the DistilBERT classifier.
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0
_________________________________________________________________


In [134]:
# Compile the classifier for fine-tuning.
# - Loss: the head emits two raw logits per example (classifier Dense has 1538 = 768*2 + 2
#   parameters), so use sparse categorical cross-entropy on logits. The previous string
#   "binary_cross_entropy" is not a Keras loss identifier (the valid spelling is
#   "binary_crossentropy"), and a binary loss does not match a 2-logit output anyway.
# - Labels must be integer class ids 0 and 1. The sentiment column displayed earlier
#   holds 0 and 4, so remap 4 -> 1 before calling fit().
# - Optimizer: a small learning rate is used because the default Adam rate (1e-3)
#   is generally too large for fine-tuning a pretrained transformer.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=5e-5),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

In [None]:
# Split tweets and labels into train / test sets. train_test_split() needs at least one
# array; calling it with no arguments raises a ValueError.
# The sentiment column shown above uses 0 and 4; map 4 -> 1 so the labels are the class
# ids 0/1 expected by a two-class head.
# NOTE(review): assumes only 0 and 4 occur, as in the preview — confirm on the full file.
sentiment_labels = tweet_df['sentiment'].replace({4: 1})
X_train, X_test, y_train, y_test = train_test_split(
    tweet_df['tweet'],
    sentiment_labels,
    test_size=0.2,              # hold out 20% for evaluation
    random_state=42,            # fixed seed so the split is reproducible
    stratify=sentiment_labels,  # keep the class balance identical in both splits
)