In [2]:
#! pip install transformers

In [12]:
import pandas as pd 
import numpy as np 
from tqdm.auto import tqdm 
import tensorflow as tf 
from transformers import BertTokenizer 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
import re
import nltk
from sklearn.preprocessing import LabelEncoder

In [5]:
data=pd.read_csv('tweet_emotions.csv')
data

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [7]:
data['sentiment'].nunique()

13

In [9]:
nltk.download('stopwords')
STOPWORDS = stopwords.words("english")
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def clean(text):
    text = text.lower() 
    text=re.sub(r'@\w+',"", text)
    text = re.sub("[^\w\s]"," ",text) # Remove punctuations 
    text = " ".join(w for w in text.split() if w not in STOPWORDS)
    text = " ".join(lemmatizer.lemmatize(w) for w in text.split())
    return text
data['text'] = data['content'].apply(lambda x : clean(x)) 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SiYu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SiYu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
data

Unnamed: 0,tweet_id,sentiment,content,text
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...,know listenin bad habit earlier started freaki...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headache ughhhh waitin call
2,1956967696,sadness,Funeral ceremony...gloomy friday...,funeral ceremony gloomy friday
3,1956967789,enthusiasm,wants to hang out with friends SOON!,want hang friend soon
4,1956968416,neutral,@dannycastillo We want to trade with someone w...,want trade someone houston ticket one
...,...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor,
39996,1753919001,love,Happy Mothers Day All my love,happy mother day love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...,happy mother day mommy woman man long momma so...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,wassup beautiful follow peep new hit single ww...


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
 3   text       40000 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.2+ MB


In [15]:
Le = LabelEncoder()
y =  Le.fit_transform(np.array(data['sentiment']))
data['Y_Encoder']=y

In [19]:
data=data[['text','Y_Encoder']]

In [22]:
data

Unnamed: 0,text,Y_Encoder
0,know listenin bad habit earlier started freaki...,2
1,layin n bed headache ughhhh waitin call,10
2,funeral ceremony gloomy friday,10
3,want hang friend soon,3
4,want trade someone houston ticket one,8
...,...,...
39995,,8
39996,happy mother day love,7
39997,happy mother day mommy woman man long momma so...,7
39998,wassup beautiful follow peep new hit single ww...,5


In [26]:
data['Y_Encoder'].value_counts()

8     8638
12    8459
5     5209
10    5165
7     3842
11    2187
4     1776
9     1526
6     1323
2      827
3      759
1      179
0      110
Name: Y_Encoder, dtype: int64

In [24]:
#Oversample
X=data['text'].array
y=data['Y_Encoder']

from imblearn.over_sampling import SMOTEN
sampler = SMOTEN(random_state=0)
X_res, y_res = sampler.fit_resample(X.reshape(-1, 1), y)

In [33]:
X_res.flatten()

array(['know listenin bad habit earlier started freakin part',
       'layin n bed headache ughhhh waitin call',
       'funeral ceremony gloomy friday', ...,
       'darn craving wedding cake craving hard satisfy anything else',
       'darn craving wedding cake craving hard satisfy anything else',
       'darn craving wedding cake craving hard satisfy anything else'],
      dtype=object)

In [39]:
data=pd.DataFrame({"text":X_res.flatten(),'label':y_res})
data

Unnamed: 0,text,label
0,know listenin bad habit earlier started freaki...,2
1,layin n bed headache ughhhh waitin call,10
2,funeral ceremony gloomy friday,10
3,want hang friend soon,3
4,want trade someone houston ticket one,8
...,...,...
112289,darn craving wedding cake craving hard satisfy...,12
112290,darn craving wedding cake craving hard satisfy...,12
112291,darn craving wedding cake craving hard satisfy...,12
112292,darn craving wedding cake craving hard satisfy...,12


In [27]:
tokenizer= BertTokenizer.from_pretrained('bert-base-cased')

Downloading: 100%|██████████| 213k/213k [00:00<00:00, 278kB/s]  
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 14.3kB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 287kB/s]


In [85]:
token = tokenizer.encode_plus(
    data['text'].iloc[0],
    max_length=256,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors='tf'
)

In [86]:
X_input_ids=np.zeros((len(data),256))
X_attn_masks=np.zeros((len(data),256))

In [87]:
X_input_ids.shape

(112294, 256)

In [88]:
def generate_training_data(df,ids,masks,tokenizer):
    for i , text in tqdm(enumerate(df['text'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i,:]=tokenized_text.input_ids
        masks[i,:]=tokenized_text.attention_mask
    return ids,masks

In [89]:
X_input_ids,X_attn_masks=generate_training_data(data,X_input_ids,X_attn_masks,tokenizer)

112294it [00:51, 2196.21it/s]


In [90]:
X_input_ids

array([[ 101., 1221., 5113., ...,    0.,    0.,    0.],
       [ 101., 3191., 1394., ...,    0.,    0.,    0.],
       [ 101., 6594., 4502., ...,    0.,    0.,    0.],
       ...,
       [ 101., 5358., 4558., ...,    0.,    0.,    0.],
       [ 101., 5358., 4558., ...,    0.,    0.,    0.],
       [ 101., 5358., 4558., ...,    0.,    0.,    0.]])

In [91]:
labels = np.zeros((len(data),13))
labels.shape

(112294, 13)

In [92]:
labels[np.arange(len(data)),data["label"].values]=1
labels

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [93]:
dataset=tf.data.Dataset.from_tensor_slices((X_input_ids,X_attn_masks,labels))

In [94]:
def SentimentDatasetMapFunction(input_ids,attn_masks,labels):
    return {
        "input_ids":input_ids,
        "attention_masks":attn_masks,
    },labels

dataset=dataset.map(SentimentDatasetMapFunction)

In [95]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_masks': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(13,), dtype=tf.float64, name=None))>

In [96]:
dataset=dataset.shuffle(10000).batch(20,drop_remainder=True)

In [97]:
p=0.8 #training 0.8
train_size=int((len(data)//16)*p)
train_size

5614

In [98]:
train_dataset=dataset.take(train_size)
val_dataset=dataset.skip(train_size)

In [99]:
from transformers import TFBertModel 
bert_model=TFBertModel.from_pretrained('bert-base-cased')

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [100]:
input_ids=tf.keras.layers.Input(shape=(256,),name='input_ids',dtype='int32')
attention_masks=tf.keras.layers.Input(shape=(256,),name='attention_masks',dtype='int32')

bert_embds=bert_model.bert(input_ids,attention_masks)[1]
intermediate_layer=tf.keras.layers.Dense(512,activation='relu',name='intermediate_layer')(bert_embds)
output_layer=tf.keras.layers.Dense(13,activation='softmax',name='output')(intermediate_layer)


model=tf.keras.Model(inputs=[input_ids,attention_masks],outputs=output_layer)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_masks (InputLayer)   [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_masks[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                         

In [102]:
optim=tf.keras.optimizers.Adam(learning_rate=1e-5,decay=1e-6)
loss_fun=tf.keras.losses.CategoricalCrossentropy()
acc=tf.keras.metrics.CategoricalAccuracy('accuracy')

In [103]:
model.compile(optimizer=optim,loss=loss_fun,metrics=[acc])

In [104]:
hist = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2
  76/5614 [..............................] - ETA: 28:23:39 - loss: 2.0842 - accuracy: 0.3026