In [1]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, GRU, Dense, Dropout, LayerNormalization, MultiHeadAttention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Layer

import numpy as np

2023-11-21 14:55:39.412421: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class Attention(Layer):
    """Self-attention sub-layer: multi-head attention, residual add, layer norm.

    Args:
        d_model: Passed to ``MultiHeadAttention`` as ``key_dim`` (the per-head
            query/key size).
        num_heads: Number of attention heads. Defaults to 8, the value that
            was previously hard-coded.
        **kwargs: Forwarded to ``Layer`` (e.g. ``name``, ``dtype``).
    """

    def __init__(self, d_model, num_heads=8, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.num_heads = num_heads
        self.dense_attention = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.dense_layernorm = LayerNormalization(epsilon=1e-6)

    def call(self, inputs, training=None):
        """Apply self-attention to ``x`` and normalise the residual sum.

        Args:
            inputs: A pair ``(x, attention_mask)``. ``x`` is used as both
                query and value; ``attention_mask`` is handed unchanged to
                ``MultiHeadAttention``.
            training: Forwarded to ``MultiHeadAttention`` so that its internal
                dropout (if configured) follows the training/inference mode.

        Returns:
            ``LayerNormalization(attention(x, x) + x)``.
        """
        x, attention_mask = inputs
        attention_output = self.dense_attention(
            x, x, attention_mask=attention_mask, training=training
        )
        # Residual connection around the attention, then normalisation.
        return self.dense_layernorm(attention_output + x)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({"d_model": self.d_model, "num_heads": self.num_heads})
        return config

In [3]:
class BigBird(Model):
    """Stack of attention + feed-forward blocks with a per-token output layer.

    NOTE(review): the attention sub-layers are the ``Attention`` layers defined
    in this notebook (Keras ``MultiHeadAttention``), so despite the class name
    no sparse attention pattern is implemented here.

    Args:
        vocab_size: Size of the token embedding table and of the output
            projection.
        d_model: Width of the embeddings and of the residual stream.
        n_heads: Accepted for interface compatibility. NOTE(review): it is not
            forwarded, because ``Attention`` is constructed from ``d_model``
            alone in this class.
        num_layers: Number of attention + feed-forward blocks.
        dff: Hidden width of each feed-forward block.
        rate: Dropout rate.
        max_position: Number of rows in the learned position-embedding table;
            input sequences must not be longer than this. Defaults to 5000,
            the value that was previously hard-coded.
    """

    def __init__(self, vocab_size, d_model, n_heads, num_layers, dff, rate=0.1,
                 max_position=5000):
        super().__init__()
        self.embedding = Embedding(vocab_size, d_model)
        self.position_embedding = Embedding(max_position, d_model)
        self.attention_layers = [Attention(d_model) for _ in range(num_layers)]
        # Feed-forward block: expand to dff with ReLU, then project back to
        # d_model so the result can be added to the block input.
        self.ffn_layers = [Dense(dff, activation='relu') for _ in range(num_layers)]
        self.ffn_projection_layers = [Dense(d_model) for _ in range(num_layers)]
        self.layernorm_layers = [LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]
        self.fc = Dense(vocab_size)
        self.dropout = Dropout(rate)

    def call(self, inputs, training=False):
        """Run the encoder.

        Args:
            inputs: A pair ``(token_ids, attention_mask)``. ``token_ids`` is
                indexed as ``(batch, seq_len)``; ``attention_mask`` is passed
                unchanged to every ``Attention`` layer.
            training: Enables dropout when true.

        Returns:
            The output of ``Dense(vocab_size)`` applied to every position,
            i.e. un-activated scores of shape ``(batch, seq_len, vocab_size)``.
        """
        x, attention_mask = inputs
        seq_len = tf.shape(x)[1]
        # Shape (1, seq_len) so the position embeddings broadcast over the batch.
        position_ids = tf.expand_dims(tf.range(seq_len, dtype=tf.int32), axis=0)

        x = self.embedding(x) + self.position_embedding(position_ids)
        x = self.dropout(x, training=training)

        blocks = zip(
            self.attention_layers,
            self.ffn_layers,
            self.ffn_projection_layers,
            self.layernorm_layers,
        )
        for attention, ffn_hidden, ffn_projection, layernorm in blocks:
            attn_out = attention([x, attention_mask], training=training)
            ffn_out = ffn_projection(ffn_hidden(attn_out))
            # The residual adds the block *input* (attn_out) to the block
            # output. Adding the FFN output to itself would only double it,
            # drop the skip connection and leave the stream at width dff.
            x = layernorm(attn_out + ffn_out)
            x = self.dropout(x, training=training)

        return self.fc(x)

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# from google.colab import files


# uploaded = files.upload()

In [8]:
import io

# Load the speech dataset from 'dataset.csv' (path relative to the working
# directory) and display the resulting DataFrame.
data = pd.read_csv('dataset.csv')
data

Unnamed: 0,speech,psychological_state
0,From the moment that the French defenses at Se...,dominance
1,We observe today not a victory of party but a ...,hope
2,"Your Majesties, Your Highnesses, Distinguished...",love
3,I am honored to be with you today at your comm...,love
4,"Honorable UN Secretary General Mr Ban Ki-moon,...",hope
5,It is with a profound sense of humility that I...,sentimentality
6,"My message is that we'll be watching you.\n\n""...",dispair
7,"Hello everybody. You know, Michelle and I have...",love
8,"Your Majesties, Your Royal Highness, Excellenc...",optimism
9,"Five score years ago, a great American, in who...",shame


In [9]:
# Select the 'speech' column (the raw speech texts) and preview it.
# NOTE(review): data_f is not referenced again in the visible cells.
data_f = data["speech"]
data_f

0    From the moment that the French defenses at Se...
1    We observe today not a victory of party but a ...
2    Your Majesties, Your Highnesses, Distinguished...
3    I am honored to be with you today at your comm...
4    Honorable UN Secretary General Mr Ban Ki-moon,...
5    It is with a profound sense of humility that I...
6    My message is that we'll be watching you.\n\n"...
7    Hello everybody. You know, Michelle and I have...
8    Your Majesties, Your Royal Highness, Excellenc...
9    Five score years ago, a great American, in who...
Name: speech, dtype: object

In [12]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
# The targets at this point are still the raw 'psychological_state' strings.
X_train, X_test, y_train, y_test = train_test_split(data['speech'], data['psychological_state'], test_size=0.2, random_state=42)

In [13]:
# Inspect the training speeches returned by train_test_split.
X_train

5    It is with a profound sense of humility that I...
0    From the moment that the French defenses at Se...
7    Hello everybody. You know, Michelle and I have...
2    Your Majesties, Your Highnesses, Distinguished...
9    Five score years ago, a great American, in who...
4    Honorable UN Secretary General Mr Ban Ki-moon,...
3    I am honored to be with you today at your comm...
6    My message is that we'll be watching you.\n\n"...
Name: speech, dtype: object

In [14]:
# Inspect the held-out speeches returned by train_test_split.
X_test

8    Your Majesties, Your Royal Highness, Excellenc...
1    We observe today not a victory of party but a ...
Name: speech, dtype: object

In [12]:
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')

In [13]:
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# import string

# stop_words=set(stopwords.words('english'))
# def preprocess_speech(text):
#     text=text.lower()
#     text=''.join([word for word in text if word not in string.punctuation])
#     tokens=word_tokenize(text)
#     tokens=[word for word in tokens if word not in stop_words]
#     return ' '.join(tokens)

In [16]:
# Build the vocabulary from the training speeches only (X_train), so that
# words seen only in the held-out speeches map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Turn both splits into lists of integer token ids with the fitted tokenizer.
train_seq, val_seq = map(tokenizer.texts_to_sequences, (X_train, X_test))

In [17]:
# Bring every sequence to exactly 4000 token ids; shorter ones get zeros
# appended at the end (padding='post').
train_padded, val_padded = (
    pad_sequences(sequences, maxlen=4000, padding='post')
    for sequences in (train_seq, val_seq)
)

In [18]:
# Create attention masks
def create_mask(x):
    """Build an attention mask for Keras ``MultiHeadAttention`` from padded ids.

    Keras' ``attention_mask`` uses 1 for "may attend" and 0 for "masked out",
    so real tokens must be marked with 1 and padding (id 0) with 0.

    Args:
        x: 2-D array or tensor of token ids, indexed ``(batch, seq_len)``,
            in which id 0 is padding.

    Returns:
        A float32 tensor of shape ``(batch, 1, seq_len)`` holding 1.0 at real
        tokens and 0.0 at padding; the middle axis broadcasts over query
        positions.
    """
    mask = tf.cast(tf.math.not_equal(x, 0), tf.float32)
    return mask[:, tf.newaxis]

train_mask = create_mask(train_padded)
val_mask = create_mask(val_padded)

In [23]:
# Inspect the padded training token ids (trailing zeros are padding).
train_padded

array([[  17,   11,   23, ...,    0,    0,    0],
       [  30,    2,  119, ...,    0,    0,    0],
       [1545,  782,   18, ...,    0,    0,    0],
       ...,
       [ 554,  306,  865, ...,    0,    0,    0],
       [  10,  106, 2160, ...,    0,    0,    0],
       [  32,  924,   11, ...,    0,    0,    0]], dtype=int32)

In [24]:
# Inspect the padded held-out token ids (trailing zeros are padding).
val_padded

array([[  66, 1633,   66, ...,    0,    0,    0],
       [   7, 1476,   59, ...,    0,    0,    0]], dtype=int32)

In [31]:
from sklearn.preprocessing import LabelEncoder

# Target column: one psychological-state label (a string) per speech.
original_labels = data['psychological_state']

# Map every label string to an integer class id. The encoder is fitted on the
# full column before splitting, so train and test share one label -> id mapping.
label_encoder = LabelEncoder()
numerical_labels = label_encoder.fit_transform(original_labels)

# Show the string labels next to their integer encoding.
print("Original labels:", original_labels)
print("Numerical labels:", numerical_labels)


Original labels: 0         dominance
1              hope
2              love
3              love
4              hope
5    sentimentality
6           dispair
7              love
8          optimism
9             shame
Name: psychological_state, dtype: object
Numerical labels: [1 2 3 3 2 5 0 3 4 6]


In [32]:
# Redo the train/test split with the integer-encoded labels as targets.
# test_size and random_state are the same as in the earlier split on raw labels.
X_train, X_test, y_train, y_test = train_test_split(data['speech'], numerical_labels, test_size=0.2, random_state=42)

In [33]:
# Print the four split objects one after another, each starting on its own line.
print(X_train, X_test, y_train, y_test, sep="\n")

5    It is with a profound sense of humility that I...
0    From the moment that the French defenses at Se...
7    Hello everybody. You know, Michelle and I have...
2    Your Majesties, Your Highnesses, Distinguished...
9    Five score years ago, a great American, in who...
4    Honorable UN Secretary General Mr Ban Ki-moon,...
3    I am honored to be with you today at your comm...
6    My message is that we'll be watching you.\n\n"...
Name: speech, dtype: object
8    Your Majesties, Your Royal Highness, Excellenc...
1    We observe today not a victory of party but a ...
Name: speech, dtype: object
[5 1 3 3 6 2 3 0]
[4 2]


In [25]:
# Model hyperparameters.
vocab_size = 5000  # same value as the Tokenizer's num_words
d_model = 128      # embedding width
n_heads = 8
num_layers = 4     # number of attention + feed-forward blocks
dff = 512          # feed-forward hidden width
rate = 0.1         # dropout rate

# Pass every hyperparameter by keyword so the call site documents itself.
bigbird_model = BigBird(
    vocab_size=vocab_size,
    d_model=d_model,
    n_heads=n_heads,
    num_layers=num_layers,
    dff=dff,
    rate=rate,
)


In [26]:
# BigBird ends in a Dense layer without an activation, so it emits raw logits.
# The loss must therefore be told from_logits=True; the string alias
# 'sparse_categorical_crossentropy' assumes probabilities.
# NOTE(review): the model returns one score vector per token, while the targets
# used for training hold one label per speech — TODO reconcile the output shape
# with the labels.
bigbird_model.compile(
    optimizer=Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)


In [35]:
# Train on the padded token ids plus their attention masks and validate on the
# held-out split. Targets are y_train / y_test from train_test_split.
# NOTE(review): the recorded run of this cell stopped with ResourceExhaustedError
# in the first epoch. Inputs are padded to 4000 tokens and no batch_size is
# passed — TODO: shorten the sequences and/or pass a smaller batch_size.
bigbird_model.fit(
    x=[train_padded, train_mask],
    y=y_train,
    epochs=10,  # Adjust the number of epochs as needed
    validation_data=([val_padded, val_mask], y_test)
)


Epoch 1/10


ResourceExhaustedError: ignored

In [None]:
# Run inference on the held-out inputs; the mask is rebuilt from val_padded
# with create_mask.
predictions = bigbird_model.predict([val_padded, create_mask(val_padded)])


In [None]:
# Score the model on the held-out split. y_test holds the held-out labels
# produced by train_test_split — the same targets that are passed as
# validation_data when fitting.
eval_results = bigbird_model.evaluate([val_padded, create_mask(val_padded)], y_test)
print("Evaluation results:", eval_results)


In [None]:
# Use BigBird model to generate word embeddings
# word_embeddings = bigbird.predict([train_padded, train_mask])