In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report


In [57]:
test_df = pd.read_csv('../data/test_nolabel.csv')
train_df = pd.read_csv('../data/train.csv')
val_df = pd.read_csv('../data/val.csv')

In [58]:
X_train = train_df['text'].values
y_train = train_df['label'].values
X_val = val_df['text'].values
y_val = val_df['label'].values

In [59]:
print(np.unique(y_train)) 
print(np.unique(y_val)) 

[1 2 3 4 5]
[1 2 3 4 5]


In [60]:
y_train_shifted = y_train - 1
y_val_shifted = y_val - 1

In [61]:
max_words = 10000  # maximum number of words to consider
max_len = 100      # maximum length of the sequences (in words)

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

In [62]:
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)

In [63]:
num_classes = len(np.unique(y_train_shifted))  # Number of unique classes after shifting (which will be 5)
y_train_cat = to_categorical(y_train_shifted, num_classes=num_classes)
y_val_cat = to_categorical(y_val_shifted, num_classes=num_classes)

In [64]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# If you prefer Bi-LSTM, use the following layer instead:
# model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(num_classes, activation='softmax'))



In [65]:
model.compile(loss='categorical_crossentropy', 
              optimizer=Adam(learning_rate=0.001), 
              metrics=['accuracy'])

In [66]:
history = model.fit(X_train_pad, y_train_cat, epochs=5, batch_size=64, validation_data=(X_val_pad, y_val_cat))

Epoch 1/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 108ms/step - accuracy: 0.2573 - loss: 1.5742 - val_accuracy: 0.3510 - val_loss: 1.4512
Epoch 2/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 103ms/step - accuracy: 0.4500 - loss: 1.3038 - val_accuracy: 0.4049 - val_loss: 1.3690
Epoch 3/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 104ms/step - accuracy: 0.6163 - loss: 0.9438 - val_accuracy: 0.3853 - val_loss: 1.4739
Epoch 4/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 102ms/step - accuracy: 0.7756 - loss: 0.6340 - val_accuracy: 0.3695 - val_loss: 1.8348
Epoch 5/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 102ms/step - accuracy: 0.8552 - loss: 0.4246 - val_accuracy: 0.3594 - val_loss: 2.0647


In [67]:
X_test = test_df['text'].values
X_id = test_df['id'].values
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

In [68]:
y_pred = model.predict(X_test_pad)

[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step


In [69]:
y_pred_labels = np.argmax(y_pred, axis=1)

In [70]:
y_pred_labels_shifted = y_pred_labels + 1

In [71]:
predictions_df = pd.DataFrame({
    'id':X_id,
    'label': y_pred_labels_shifted
})

In [72]:
predictions_df.to_csv('predictions.csv', index=False)

In [73]:
if 'label' in test_df.columns:
    y_test = test_df['label'].values
    print("Accuracy on Test Data: ", accuracy_score(y_test, y_pred_labels_shifted))
    print("Classification Report: \n", classification_report(y_test, y_pred_labels_shifted))

In [74]:
print("Predicted Sentiment Labels: ", y_pred_labels_shifted)

Predicted Sentiment Labels:  [4 4 3 ... 4 4 4]
