In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string

In [None]:
df=pd.read_csv('one_output_dataset.csv')
df

In [46]:
dict={'anger':1.0, 'joy':2.0, 'trust':3.0, 'anticipation':4.0, 'sadness':5.0, 'fear':6.0}
df=df.replace(dict)
df['emotion'] = df['emotion'].apply(pd.to_numeric)
df

Unnamed: 0,speech,emotion
0,moment french defens sedan meus broken end sec...,1.0
1,observ today victori parti celebr freedomsymbo...,2.0
2,majesti high distinguish guest comrad friend t...,3.0
3,honor today commenc one finest univers world n...,4.0
4,honor un secretari gener mr ban kimoon respect...,5.0
5,profound sens humil accept honor chosen bestow...,5.0
6,messag well watch wrong shouldnt back school s...,6.0
7,hello everybodi know michel realli milk goodby...,2.0
8,majesti royal high excel distinguish member no...,4.0
9,five score year ago great american whose symbo...,3.0


# New Section

In [47]:
#preprocess transcript

# download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# initialize a PorterStemmer
stemmer = PorterStemmer()

def preprocess_text(text):
    # lowercase the text
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # tokenize the text
    words = word_tokenize(text)
    # remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stopwords.words('english')]
    # join the words back into a string
    text = ' '.join(words)

    return text

df['speech'] = df['speech'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
!pip install sentencepiece



In [49]:
tokenizer=AlbertTokenizer.from_pretrained('albert-large-v2')

model=TFAlbertForSequenceClassification.from_pretrained('albert-large-v2',num_labels=6)

All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

In [51]:
#split in train and test in text and labels
train_texts = train['speech'].tolist()
train_labels = train['emotion'].tolist()

test_texts = test['speech'].tolist()
test_labels = test['emotion'].tolist()

In [52]:
#tokenize the texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

In [53]:
#convert features and labels to tensors for both train and test
train_features = {key: tf.convert_to_tensor(val) for key, val in train_encodings.items()}
train_labels = tf.convert_to_tensor(train_labels)

test_features = {key: tf.convert_to_tensor(val) for key, val in test_encodings.items()}
test_labels = tf.convert_to_tensor(test_labels)

In [54]:
#prepare the training and testing dataset
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
train_dataset = train_dataset.shuffle(10000).batch(1)

test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))
test_dataset = test_dataset.batch(1)

In [55]:
#define model metrics
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ['accuracy']

In [56]:
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [57]:
model.summary()

Model: "tf_albert_for_sequence_classification_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 albert (TFAlbertMainLayer)  multiple                  17683968  
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  6150      
                                                                 
Total params: 17690118 (67.48 MB)
Trainable params: 17690118 (67.48 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [58]:
train_labels

<tf.Tensor: shape=(8,), dtype=float32, numpy=array([5., 2., 4., 1., 3., 5., 4., 2.], dtype=float32)>

In [59]:
#fit model to train dataset
model.fit(train_dataset, epochs=3, validation_data=test_dataset)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x796f49a924a0>

In [60]:
model.save('first_model.keras')



In [61]:
df['speech'][0]

'moment french defen sedan meu broken end second week may rapid retreat amien south could save british french armi enter belgium appeal belgian king strateg fact immedi realiz french high command hope would abl close gap armi north order moreov retir kind would involv almost certainli destruct fine belgian armi 20 divi abandon whole belgium therefor forc scope german penetr realiz new french generalissimo gener weygand assum command place gener gamelin effort made french british armi belgium keep hold right hand belgian give right hand newli creat french armi advanc across somm great strength grasp howev german erupt swept like sharp scyth around right rear armi north eight nine armor divi four hundr armor vehicl differ kind care assort complementari divi small selfcontain unit cut commun us main french armi sever commun food ammunit ran first amien afterward abbevil shore way coast boulogn calai almost dunkirk behind armor mechan onslaught came number german divi lorri behind plod com

In [62]:
new_feature = tokenizer(df['speech'][0], truncation=True, padding=True, return_tensors='tf')

In [63]:
predictions = model(new_feature)

In [64]:
predictions

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 6), dtype=float32, numpy=
array([[ 0.02588189,  0.24483255,  0.11712265,  0.04261102, -0.2818564 ,
         0.56155694]], dtype=float32)>, hidden_states=None, attentions=None)

In [None]:
model.output_shape

AttributeError: ignored