#BERT

In [1]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install sklearn
!pip install tensorflow
!pip install transformers
!pip install nltk

In [6]:
pip install tensorflow-text

### Import the required libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import nltk
import warnings
warnings.simplefilter("ignore")

### Read and Inspect the Data

In [3]:
import os

# Location of the tweet-emotions CSV. Set the TWEET_EMOTIONS_CSV environment
# variable to point at your own copy; the fallback is the original author's
# local path, kept so that existing setups continue to work unchanged.
DATA_PATH = os.environ.get(
    'TWEET_EMOTIONS_CSV',
    '/Users/izabellamartirosyan/Desktop/tweet_emotions.csv',
)
ed = pd.read_csv(DATA_PATH)


In [None]:
ed.head()

In [None]:
ed.shape

In [None]:
ed.info()

There are 40000 rows and 3 columns in the dataset. For text classification, we are interested in the content and sentiment columns.

### Exploratory Data Analysis

From inspection, we see that there are no missing values in the dataset. Let's check for duplicate rows.

In [None]:
ed.duplicated().sum()

There are no duplicates, and outliers are not a concern because both columns of interest are text columns.

In [None]:
ed1 = ed.copy()  # work on a separate copy so the original dataframe `ed` is left as loaded

In [None]:
#Check for the unique values in the sentiment column
ed1.sentiment.unique()

In [None]:
ed1.sentiment.nunique()

In [None]:
ed1.sentiment.value_counts()

In [None]:
# Horizontal bar chart of the number of tweets per sentiment, ordered from the
# most to the least frequent label.
sentiment_order = ed1.sentiment.value_counts().index
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=ed1, y='sentiment', order=sentiment_order, ax=ax)

Let's group hate and anger together under the label hate/anger, and empty and boredom together under the label empty/boredom.

In [None]:
def labels(sentiment):
    """Collapse closely related sentiments into one shared label.

    'hate' and 'anger' map to 'hate/anger'; 'empty' and 'boredom' map to
    'empty/boredom'. Every other value is returned unchanged.
    """
    if sentiment in ('hate', 'anger'):
        return 'hate/anger'
    if sentiment in ('empty', 'boredom'):
        return 'empty/boredom'
    return sentiment

# Map each sentiment through `labels`. Series.map works on the single column
# directly, avoiding the much slower row-by-row DataFrame.apply(axis=1).
ed1['sentiment'] = ed1['sentiment'].map(labels)

In [None]:
ed1.sentiment.value_counts()

In [None]:
#check random tweet from content series
ed1.content[99]

In [None]:
ed1.sentiment[99]

In [None]:
from nltk.tokenize import TweetTokenizer
# Lower-case every tweet, then split it into tokens with NLTK's TweetTokenizer.
# The resulting token lists are stored in a new `tokenize_content` column.
tt = TweetTokenizer()
ed1['tokenize_content'] = ed1.content.str.lower().apply(tt.tokenize)

In [None]:
ed1.tokenize_content[99]

In [None]:
# Fetch the NLTK corpora needed by the following cells: WordNet (plus its
# multilingual companion, required by some NLTK releases) for lemmatization and
# the stop-word lists for filtering. Without them a fresh environment fails
# with a LookupError. Already-installed packages are not downloaded again.
for _resource in ('wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(_resource, quiet=True)

In [None]:
from nltk.stem import WordNetLemmatizer 
def lemmatize_text(text, lemmatizer=None):
    """Lemmatize every token in ``text``.

    Parameters
    ----------
    text : iterable of str
        The tokens to lemmatize (e.g. one tokenized tweet).
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Lemmatizer to use. When omitted, a single ``WordNetLemmatizer`` is
        created on the first call and reused afterwards, rather than building
        a new instance for every row.

    Returns
    -------
    list
        The lemmatized tokens, in their original order.
    """
    if lemmatizer is None:
        # Cache the default lemmatizer on the function object itself.
        lemmatizer = getattr(lemmatize_text, '_shared_lemmatizer', None)
        if lemmatizer is None:
            lemmatizer = WordNetLemmatizer()
            lemmatize_text._shared_lemmatizer = lemmatizer
    return [lemmatizer.lemmatize(w) for w in text]
    
ed1['tokenize_lemmatized_content'] = ed1['tokenize_content'].apply(lemmatize_text)

In [None]:
ed1.tokenize_lemmatized_content[99]

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Looking a token up in a list scans the whole list; a set makes each lookup
# constant-time on average. `stop` itself is kept as the original list.
stop_set = set(stop)
# NOTE(review): the filter runs on the *lemmatized* tokens — confirm that the
# lemmatizer leaves stop words in a form that still matches this list.
ed1['text'] = ed1.tokenize_lemmatized_content.apply(
    lambda tokens: [token for token in tokens if token not in stop_set]
)

In [None]:
ed1.text[99]

In [None]:
ed1.head()

In [None]:
ed1.info()

In [None]:
ed1.text

In [None]:
# Turn each token list back into one space-separated string for the BERT
# tokenizer. Joining the tokens directly avoids the artifacts of going through
# str(list): stray double quotes around tokens that contain an apostrophe,
# doubled backslashes, and empty gaps where comma tokens used to be.
# Note: apostrophes and comma tokens are therefore preserved in the text.
# The isinstance guard makes the cell safe to run more than once.
ed1['text'] = ed1['text'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

In [None]:
ed2 = ed1[['text','sentiment']].copy()

In [None]:
ed2.head()

In [None]:
ed2.shape

### Split into Training and Test dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified 65/35 split so that both partitions keep the same sentiment mix.
# A fixed seed makes the split, and every result derived from it, reproducible.
# Stratify on the same frame that is being split (ed2), not on ed1.
SPLIT_SEED = 42
X_train, X_test, y_train_unenc, y_test_unenc = train_test_split(
    ed2.text,
    ed2.sentiment,
    test_size=0.35,
    stratify=ed2.sentiment,
    random_state=SPLIT_SEED,
)

In [None]:
print(X_train.shape,y_train_unenc.shape,X_test.shape,y_test_unenc.shape)

In [None]:
X_train.head()

In [None]:
y_train_unenc.head()

In [None]:
y_train_unenc.value_counts()

In [None]:
y_test_unenc.value_counts()

In [None]:
# Share of the most frequent training class in each partition. Computed from
# the data instead of hard-coded counts (5615 / 3023) copied from one run —
# presumably those were the largest class's counts; verify.
largest_class = y_train_unenc.value_counts().idxmax()
print((y_train_unenc == largest_class).mean(), (y_test_unenc == largest_class).mean())

In [None]:
# Share of the least frequent training class in each partition. Computed from
# the data instead of hard-coded counts (493 / 266) copied from one run —
# presumably those were the smallest class's counts; verify.
smallest_class = y_train_unenc.value_counts().idxmin()
print((y_train_unenc == smallest_class).mean(), (y_test_unenc == smallest_class).mean())

##### Preprocessing the y_train_unenc and y_test_unenc

There are 11 types of emotion present in the sentiment column. To feed them into a deep neural network, we first convert them into integer labels and then encode those labels as one-hot vectors.

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [None]:
# Learn the sentiment -> integer mapping from the training labels only, then
# reuse that same mapping for the test labels. Re-fitting on the test set could
# assign different integers whenever its label set differs from the training set.
y_train_enc = label_encoder.fit_transform(y_train_unenc)
y_test_enc = label_encoder.transform(y_test_unenc)

In [None]:
print(np.unique(y_train_enc))

In [None]:
y_train_enc

The label encoder has assigned an integer value to each sentiment.

In [None]:
# Convert the integer labels into one-hot vectors.
from tensorflow.keras.utils import to_categorical
# Fix the vector width explicitly so that train and test always share one
# shape, even if a class happens to be missing from one of the partitions.
num_classes = y_train_unenc.nunique()
y_train = to_categorical(y_train_enc, num_classes=num_classes)
y_test = to_categorical(y_test_enc, num_classes=num_classes)

In [None]:
y_train

In [None]:
y_test

##### Load the BERT tokenizer and the BERT encoder

In [None]:
from transformers import AutoTokenizer,TFBertModel
# Load the pretrained tokenizer and the TensorFlow BERT model from the same
# 'bert-base-cased' checkpoint.
# NOTE(review): the tweets were lower-cased before tokenization, yet this is
# the *cased* checkpoint — confirm whether an uncased checkpoint was intended.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert = TFBertModel.from_pretrained('bert-base-cased')

In [None]:
max_len = 70

In [None]:
# Tokenize the training tweets with the bert-base-cased tokenizer (takes some time).
# padding='max_length' pads every sequence to exactly `max_len` tokens, which
# is what the model's Input(shape=(max_len,)) layers expect. padding=True would
# only pad to the longest tweet in this particular call, so the width could be
# shorter than `max_len` and differ between the train and test encodings.
x_train = tokenizer(
    text=X_train.tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True)

In [None]:
# Tokenize the test tweets. padding='max_length' pads every sequence to exactly
# `max_len` tokens so the tensors match the model's fixed-width inputs;
# padding=True would only pad to the longest tweet in this particular call.
x_test = tokenizer(
    text=X_test.tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True)

In [None]:
# Pull the token ids and the attention mask out of the encoded training batch.
# NOTE(review): `input_ids` is rebound to a Keras Input in the "Bert layers"
# cell, and `attention_mask` is not read again in the visible notebook — these
# two assignments look unused.
input_ids = x_train['input_ids']
attention_mask = x_train['attention_mask']

### Model Building

In [None]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy

In [None]:
# BERT encoder followed by a small dense classification head.
# The output width is taken from the one-hot labels instead of a hard-coded 11,
# so the head always matches the targets it is trained on.
num_classes = y_train.shape[1]

input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# Element 0 of the BERT output is used as the per-token embedding sequence.
embeddings = bert(input_ids, attention_mask=input_mask)[0]
# NOTE(review): the max-pool spans all `max_len` positions, padding included —
# confirm that this is intended.
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(64, activation='relu')(out)
y = Dense(num_classes, activation='softmax')(out)
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# Fine-tune BERT together with the head. Referring to the layer object directly
# is safer than the positional model.layers[2], which silently targets another
# layer if the architecture above is ever rearranged.
bert.trainable = True

In [None]:
model.summary()

In [None]:
# NOTE(review): `decay` is not accepted by every Keras optimizer implementation
# — confirm against the installed TensorFlow version.
optimizer = Adam(
    learning_rate=5e-05, # this learning rate is for bert model , taken from huggingface website 
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)
# Set loss and metrics.
# The model's last layer already applies softmax, so the loss receives
# probabilities, not logits: from_logits must be False. With True, softmax is
# effectively applied twice, which distorts the loss and its gradients.
loss = CategoricalCrossentropy(from_logits=False)
# NOTE(review): this is plain categorical accuracy reported under the name
# "balanced_accuracy"; the name is kept so existing history keys do not change.
metric = CategoricalAccuracy("balanced_accuracy")
# Compile the model (metrics passed as an explicit list; the original relied on
# a stray trailing comma that turned `metric` into a 1-tuple).
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric])

In [None]:
# Train for a single epoch, validating on the tokenized test split.
feature_keys = ('input_ids', 'attention_mask')
train_inputs = {key: x_train[key] for key in feature_keys}
val_inputs = {key: x_test[key] for key in feature_keys}
train_history = model.fit(
    x=train_inputs,
    y=y_train,
    validation_data=(val_inputs, y_test),
    epochs=1,
    batch_size=128,
)

In [None]:
# Evaluate on the *tokenized* test inputs. X_test holds the raw tweet strings,
# which a model built on input_ids / attention_mask tensors cannot consume.
model.evaluate(
    {'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']},
    y_test,
)

In [None]:
# Predict class probabilities from the *tokenized* test inputs. X_test holds
# the raw tweet strings, which the model cannot consume.
y_predicted = model.predict(
    {'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']}
)

In [None]:
y_predict=np.argmax(y_predicted, axis = 1)

In [None]:
from sklearn.metrics import classification_report
# Report per-class metrics under the sentiment names rather than bare integer
# codes. `labels` pins the set and order of rows so that it always lines up
# with `target_names`, even if a class never appears in the predictions.
class_ids = np.arange(len(label_encoder.classes_))
print(classification_report(
    y_test_enc,
    y_predict,
    labels=class_ids,
    target_names=label_encoder.classes_,
))

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test_enc, y_predict)

In [None]:
# Annotated heatmap of the confusion matrix, with integer cell counts.
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')