In [None]:
# !pip install transformers

In [None]:
# Basics
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import datetime

# Tensorflow
import tensorflow as tf
import tensorflow_datasets as tfds

# BERT
from transformers import BertTokenizer, TFBertForSequenceClassification

# Model Selection and Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
df = pd.read_csv('Tweets.csv')
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

# Check the airline_sentiment Column Values

In [None]:
df['airline_sentiment'].unique()

array(['neutral', 'positive', 'negative'], dtype=object)

In [None]:
df['airline_sentiment'].value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

# Convert Sentiment Labels to Integer Codes

In [None]:
# Replace the string labels with integer category codes. Categories are ordered
# alphabetically, so negative -> 0, neutral -> 1, positive -> 2.
df['airline_sentiment'] = pd.Categorical(df['airline_sentiment']).codes
df['airline_sentiment'].value_counts()

0    9178
1    3099
2    2363
Name: airline_sentiment, dtype: int64

# Split the Whole Dataset into Train and Test Data

In [None]:
# Features are the raw tweet text; targets are the sentiment column.
X = df['text']
y = df['airline_sentiment']

# Hold out 25% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    stratify = y,
    random_state = 42,
)

In [None]:
# Pair each text with its label and renumber the rows 0..n-1.
# reset_index(drop = True) discards the old (shuffled) index directly instead of
# first materialising it as an 'index' column and then dropping that column.
train_data = pd.DataFrame({'Text': X_train, 'Sentiment': y_train}).reset_index(drop = True)
test_data = pd.DataFrame({'Text': X_test, 'Sentiment': y_test}).reset_index(drop = True)

In [None]:
# Preview the first rows of the training frame (only the last expression of a
# cell is displayed, so a bare `train_data` line before it had no effect).
train_data.head()

Unnamed: 0,Text,Sentiment
0,@USAirways trying to check in online for a fli...,0
1,@united Another awful experience and Victoria ...,0
2,@USAirways no problem...just funny have a nice...,1
3,@AmericanAir many have missed connections alr...,0
4,@JetBlue I usually do-but I didn't make the Fl...,1


In [None]:
test_data.head()

Unnamed: 0,Text,Sentiment
0,@USAirways did you know that suicide is the se...,1
1,@USAirways flt last nght Cancelled Flighted-me...,0
2,LMAO “@JetBlue: Our fleet's on fleek. http://t...,2
3,@united mine is GJQX6J husband is A587CW (Can'...,1
4,"@VirginAmerica has getaway deals through May, ...",1


In [None]:
train_data.to_csv('airline_train_data.csv', index = False)
test_data.to_csv('airline_test_data.csv', index = False)

# Notes

`['']` is the default for a string column

`[0]` is the default for a numerical (int64) column

In [None]:
# One default per CSV column, in file order: Text (string) then Sentiment (int64).
train_defaults = [
    tf.constant([''], dtype = tf.string),
    tf.constant([0], dtype = tf.int64),
]
test_defaults = [
    tf.constant([''], dtype = tf.string),
    tf.constant([0], dtype = tf.int64),
]

# header = True tells the reader that each file starts with a header line.
ds_train = tf.data.experimental.CsvDataset(
    ['airline_train_data.csv'],
    record_defaults = train_defaults,
    header = True,
)
ds_test = tf.data.experimental.CsvDataset(
    ['airline_test_data.csv'],
    record_defaults = test_defaults,
    header = True,
)

Sentiment codes and their labels (the code is the index into the `sentiments` list below):
- 0: Negative
- 1: Neutral
- 2: Positive

In [None]:
sentiments = ['Negative', 'Neutral', 'Positive']

In [None]:
def view_ds_data (ds_data, number_of_rows):
    """Print the first ``number_of_rows`` records of a dataset, one per line.

    Args:
        ds_data: Object exposing ``take(n)`` that returns an iterable of records.
        number_of_rows: Number of records to print.

    Each printed line is the zero-based position followed by the whole record.
    """
    # enumerate() yields (position, record); the record is the complete element
    # of the dataset, not a single field of it.
    for row_number, record in enumerate(ds_data.take(number_of_rows)):
        print(row_number, record)

In [None]:
view_ds_data(ds_train, 10)

0 (<tf.Tensor: shape=(), dtype=string, numpy=b'@USAirways trying to check in online for a flight tomorrow.  Tried computer, tablet, and phone.  got error message on all devices?'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
1 (<tf.Tensor: shape=(), dtype=string, numpy=b'@united Another awful experience and Victoria at the check in desk in STL could not have been more rude and condescending.'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
2 (<tf.Tensor: shape=(), dtype=string, numpy=b'@USAirways no problem...just funny have a nice day'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
3 (<tf.Tensor: shape=(), dtype=string, numpy=b'@AmericanAir  many have missed connections already b/c of delayed flight which will finally board soon'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
4 (<tf.Tensor: shape=(), dtype=string, numpy=b"@JetBlue I usually do-but I didn't make the Flight Booking Problems this time-that'll teach me! Yea I have that going for me at least haha">, <tf.Tensor: shape=(), d

In [None]:
view_ds_data(ds_test, 10)

0 (<tf.Tensor: shape=(), dtype=string, numpy=b'@USAirways did you know that suicide is the second leading cause of death among teens 10-24'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
1 (<tf.Tensor: shape=(), dtype=string, numpy=b"@USAirways flt last nght Cancelled Flighted-mech.probs, flt this am Cancelled Flighted-snow, finally on a flt home. 1st cls empty &amp; they won't let me sit there">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
2 (<tf.Tensor: shape=(), dtype=string, numpy=b"LMAO \xe2\x80\x9c@JetBlue: Our fleet's on fleek. http://t.co/aIyC9WV5oq\xe2\x80\x9d">, <tf.Tensor: shape=(), dtype=int64, numpy=2>)
3 (<tf.Tensor: shape=(), dtype=string, numpy=b"@united mine is GJQX6J husband is A587CW (Can't DM for some reason.)">, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
4 (<tf.Tensor: shape=(), dtype=string, numpy=b'@VirginAmerica has getaway deals through May, from $59 one-way. Lots of cool cities http://t.co/QDlJHslOI5 #CheapFlights #FareCompare'>, <tf.Tensor: shape=(), dtype

# Loading the Tokenizer

In [None]:
# Load the pretrained tokenizer used by every encoding step below.
# NOTE(review): the load-time warning reports that this checkpoint's tokenizer
# class is 'AlbertTokenizerFast', not 'BertTokenizer' -- confirm the resulting
# tokenization is what is intended.
# NOTE(review): the checkpoint name suggests an Indonesian-language model, while
# the tweets loaded above are English -- verify this is the intended checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-lite-base-p2')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.


In [None]:
bert_tokenizer

PreTrainedTokenizer(name_or_path='indobenchmark/indobert-lite-base-p2', vocab_size=29999, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [None]:
test_sentence = 'Saya mau pergi ke Mars!'

In [None]:
# Encode one sample sentence, padded/truncated to exactly 25 token ids.
# padding = 'max_length' replaces the deprecated pad_to_max_length = True flag.
bert_input = bert_tokenizer.encode_plus(test_sentence, max_length = 25, add_special_tokens = True,
                                        padding = 'max_length', return_attention_mask = True, truncation = True)

bert_input



{'input_ids': [2, 209, 422, 1821, 43, 11499, 29935, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

# Preprocessing Functions

In [None]:
# Returns the tokenizer encoding (input_ids, token_type_ids, attention_mask) for one sentence
def get_features (sentence, max_length = 100):
    """Tokenize one sentence into fixed-length model inputs.

    Args:
        sentence: Text to encode.
        max_length: Length every encoding is padded or truncated to
            (defaults to 100, the value previously hard-coded here).

    Returns:
        The encoding produced by the module-level ``bert_tokenizer``, holding
        the 'input_ids', 'token_type_ids' and 'attention_mask' entries.
    """
    # return_attention_mask was previously misspelled ('return_attetion_mask'),
    # so the tokenizer rejected it with a warning on every single call.
    # padding = 'max_length' replaces the deprecated pad_to_max_length = True.
    features = bert_tokenizer.encode_plus(sentence, max_length = max_length, add_special_tokens = True,
                                          padding = 'max_length', return_attention_mask = True,
                                          truncation = True)
    return features
  
def get_features_dictionary (input_ids, attention_masks, token_type_ids, labels):
    """Pack one example's encoded fields into a ``(features, labels)`` pair.

    Args:
        input_ids: Value stored under the 'input_ids' key.
        attention_masks: Value stored under the 'attention_mask' key.
        token_type_ids: Value stored under the 'token_type_ids' key.
        labels: Passed through unchanged as the second element of the result.

    Returns:
        A tuple ``(features_dict, labels)``.
    """
    model_inputs = dict(
        input_ids = input_ids,
        attention_mask = attention_masks,
        token_type_ids = token_type_ids,
    )
    return model_inputs, labels

def encode_sentence (data):
    """Tokenize every (text, sentiment) record and build a feature dataset.

    Args:
        data: Dataset of ``(text, sentiment)`` records; after ``tfds.as_numpy``
            each text must be a bytes object that can be decoded to ``str``.

    Returns:
        A ``tf.data.Dataset`` whose elements are the ``(features_dict, label)``
        pairs produced by ``get_features_dictionary``.
    """
    input_ids_list, attention_mask_list, token_type_ids_list, labels_list = [], [], [], []

    for text, sentiment in tfds.as_numpy(data):
        # The text arrives as bytes; decode it to str before tokenizing.
        bert_input = get_features(text.decode())
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        labels_list.append(sentiment)

    # Slice the four parallel lists row-wise, then regroup each row into the
    # (features_dict, label) structure. The original referenced an undefined
    # name `labels` here and never returned the dataset it built.
    encoded = tf.data.Dataset.from_tensor_slices(
        (input_ids_list, attention_mask_list, token_type_ids_list, labels_list))
    return encoded.map(get_features_dictionary)

# Shuffle and Batch

In [None]:
batch_size = 50 # number of training samples processed before each weight update
shuffle_buffer_size = 1000 # buffer size passed to Dataset.shuffle (was misspelled 'suffle_buffer_size')

# Batching Example

In [None]:
# Toy dataset of the integers 0..29, used only to illustrate batching.
ex = tf.data.Dataset.range(30)
print(list(ex.as_numpy_iterator()))
# batch(4) groups consecutive elements four at a time; 30 is not a multiple of
# 4, so the final batch holds only the two leftover elements.
ex = ex.batch(4)
print(list(ex.as_numpy_iterator()))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
[array([0, 1, 2, 3]), array([4, 5, 6, 7]), array([ 8,  9, 10, 11]), array([12, 13, 14, 15]), array([16, 17, 18, 19]), array([20, 21, 22, 23]), array([24, 25, 26, 27]), array([28, 29])]


In [None]:
encoded_ds_train = encode_sentence(ds_train).shuffle(shuffle_buffer_size).batch(batch_size)

Keyword arguments {'return_attetion_mask': True} not recognized.
Keyword arguments {'return_attetion_mask': True} not recognized.
Keyword arguments {'return_attetion_mask': True} not recognized.
Keyword arguments {'return_attetion_mask': True} not recognized.
Keyword arguments {'return_attetion_mask': True} not recognized.
Keyword arguments {'return_attetion_mask': True} not recognized.
Keyword arguments {'return_attetion_mask': True} not recognized.
Keyword arguments {'return_attetion_mask': True} not recognized.
Keyword arguments {'return_attetion_mask': True} not recognized.
Keyword arguments {'return_attetion_mask': True} not recognized.
Keyword arguments {'return_attetion_mask': True} not recognized.
Keyword arguments {'return_attetion_mask': True} not recognized.
Keyword arguments {'return_attetion_mask': True} not recognized.
Keyword arguments {'return_attetion_mask': True} not recognized.
Keyword arguments {'return_attetion_mask': True} not recognized.
Keyword arguments {'retur

KeyboardInterrupt: ignored

In [None]:
encoded_ds_test = encode_sentence(ds_test).batch(batch_size)