In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Load datasets
train_df = pd.read_csv('/Users/shreya/Desktop/capstone/Emotion_text/training.csv')
test_df = pd.read_csv('/Users/shreya/Desktop/capstone/Emotion_text/test.csv')
validation_df = pd.read_csv('/Users/shreya/Desktop/capstone/Emotion_text/validation.csv')

In [3]:
# Tokenize text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['text'])

In [4]:
# Convert text data to sequences
X_train = tokenizer.texts_to_sequences(train_df['text'])
X_test = tokenizer.texts_to_sequences(test_df['text'])
X_val = tokenizer.texts_to_sequences(validation_df['text'])

In [5]:
# Pad sequences to ensure uniform length
maxlen = 100
X_train = pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = pad_sequences(X_test, maxlen=maxlen, padding='post')
X_val = pad_sequences(X_val, maxlen=maxlen, padding='post')

In [6]:
# Define labels
y_train = train_df['label']
y_test = test_df['label']
y_val = validation_df['label']

In [7]:
from tensorflow.keras.layers import Bidirectional, GlobalMaxPooling1D

# Define model architecture
embedding_dim = 100
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, embedding_dim, input_length=maxlen))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))

# Compile model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [8]:
# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [9]:
print(train_df['label'].unique())

[0 3 2 5 4 1]


In [10]:
# Filter out rows with invalid label values
train_df = train_df[train_df['label'].isin([0, 1, 2, 3, 4])]

In [11]:
print(train_df['label'].unique())


[0 3 2 4 1]


In [12]:
print(test_df['label'].unique())

[0 1 4 3 2 5]


In [13]:
# Filter out rows with invalid label values
test_df = test_df[test_df['label'].isin([0, 1, 2, 3, 4])]

In [14]:
# Convert text data to sequences
X_train = tokenizer.texts_to_sequences(train_df['text'])
X_test = tokenizer.texts_to_sequences(test_df['text'])
X_val = tokenizer.texts_to_sequences(validation_df['text'])

# Pad sequences to ensure uniform length
maxlen = 100
X_train = pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = pad_sequences(X_test, maxlen=maxlen, padding='post')
X_val = pad_sequences(X_val, maxlen=maxlen, padding='post')

# Define labels
y_train = train_df['label']
y_test = test_df['label']
y_val = validation_df['label']

In [15]:
# Convert text data to sequences
X_test = tokenizer.texts_to_sequences(test_df['text'])
X_test = pad_sequences(X_test, maxlen=maxlen, padding='post')

# Define labels
y_test = test_df['label']


In [16]:
print(validation_df['label'].unique())

[0 2 3 1 4 5]


In [17]:
# Filter out rows with invalid label values
validation_df = validation_df[validation_df['label'].isin([0, 1, 2, 3, 4])]

In [18]:
# Convert text data to sequences
X_val = tokenizer.texts_to_sequences(validation_df['text'])
X_val = pad_sequences(X_val, maxlen=maxlen, padding='post')

# Define labels for validation dataset
y_val = validation_df['label']

In [19]:
# Train model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [20]:
# Evaluate model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.9415718913078308


In [21]:
# Save the model to an HDF5 file
model.save('modelET.h5')

  saving_api.save_model(
