<a href="https://colab.research.google.com/github/YashDhiman02/NLP_Assignment2_Embeddings/blob/main/21317_yashdhiman_nlpassignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# Below are 4 programs, first three of which are separated by ''' ''' comments.
# First Commented out code learns embeddings from the training data
# Second Commented out code runs FFNN model
# Third Commented out code tuns LSTM model
# Fourth code runs the RNN model(it gives best F1 score).

'''
#Training embeddings

import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_url = 'https://raw.githubusercontent.com/debajyotimaz/nlp_assignment/main/train_split.csv'
data = pd.read_csv(train_url)

sentences = data['text'].astype(str).tolist()

tokenized_sentences = [gensim.utils.simple_preprocess(sentence) for sentence in sentences]

embedding_size = 100  # Set less than 100 as required
word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=embedding_size, window=5, min_count=1, workers=4)

word2vec_model.save("custom_word2vec.model")

word2vec_model.wv.save_word2vec_format("custom_word2vec_embeddings.txt", binary=False)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

max_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

vocab_size = len(tokenizer.word_index) + 1  # +1 because of reserved index 0
embedding_matrix = np.zeros((vocab_size, embedding_size))

for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

print("Word2Vec model trained and embeddings saved.")
'''


'''
#FFNN

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, Dropout
from tensorflow.keras.optimizers import Adam, AdamW, SGD
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from gensim.models import Word2Vec

embedding_url = 'https://raw.githubusercontent.com/YashDhiman02/NLP_Assignment2_Embeddings/main/custom_word2vec.model'
word2vec_model = Word2Vec.load(embedding_url)

train_url = 'https://raw.githubusercontent.com/debajyotimaz/nlp_assignment/main/train_split.csv'
train_data = pd.read_csv(train_url)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 80
MAX_NB_WORDS = 20000
EMBEDDING_DIM = word2vec_model.vector_size

# Tokenize the text data
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train_data['text'])
X = tokenizer.texts_to_sequences(train_data['text'])

# Pad the sequences to a fixed length (128 tokens)
X_padded = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

# Extract labels (emotion columns: Joy, Fear, Anger, Sadness, Surprise)
y = train_data[['Joy', 'Fear', 'Anger', 'Sadness', 'Surprise']].values

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_padded, y, test_size=0.2, random_state=42)

VOCAB_SIZE = min(MAX_NB_WORDS, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

for word, i in tokenizer.word_index.items():
    if i < VOCAB_SIZE and word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

model = Sequential()

model.add(Embedding(input_dim=VOCAB_SIZE,
                    output_dim=EMBEDDING_DIM,
                    weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))  # Set trainable=False to freeze the Word2Vec embeddings

model.add(Flatten())

model.add(Dense(50, activation='relu'))  # First hidden layer
model.add(Dense(50, activation='relu'))  # Second hidden layer

model.add(Dense(5, activation='sigmoid'))  # Sigmoid for multi-label classification

learning_rate = 0.001
optimizer = AdamW(learning_rate=learning_rate)  # Choose the optimizer (AdamW used here)

model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=60, batch_size=20, validation_data=(X_val, y_val), verbose=0)

val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=20,verbose=0)
#print(f'Validation Loss: {val_loss}')
#print(f'Validation Accuracy: {val_accuracy}')

test_url = 'https://raw.githubusercontent.com/debajyotimaz/nlp_assignment/main/test_split.csv'
test_data = pd.read_csv(test_url)

X_test = tokenizer.texts_to_sequences(test_data['text'])
X_test_padded = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

y_test = test_data[['Joy', 'Fear', 'Anger', 'Sadness', 'Surprise']].values

y_pred = model.predict(X_test_padded)

y_pred_bin = (y_pred > 0.5).astype(int)

f1 = f1_score(y_test, y_pred_bin, average='macro')

print(f'Average Macro F1 Score: {f1}')

'''



'''
#LSTM

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam, AdamW, SGD
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

train_url = 'https://raw.githubusercontent.com/debajyotimaz/nlp_assignment/main/train_split.csv'
train_data = pd.read_csv(train_url)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 128  # Maximum sequence length (tokens)
MAX_NB_WORDS = 10000  # Maximum number of words in the vocabulary (you can adjust)
EMBEDDING_DIM = 100  # Size of embedding vectors (could also be trained)
VOCAB_SIZE = MAX_NB_WORDS

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train_data['text'])
X = tokenizer.texts_to_sequences(train_data['text'])

X_padded = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

y = train_data[['Joy', 'Fear', 'Anger', 'Sadness', 'Surprise']].values

X_train, X_val, y_train, y_val = train_test_split(X_padded, y, test_size=0.2, random_state=42)

embedding_url = 'https://raw.githubusercontent.com/YashDhiman02/NLP_Assignment2_Embeddings/main/custom_word2vec.model'
word2vec_model = Word2Vec.load(embedding_url)

embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
word_index = tokenizer.word_index

for word, i in word_index.items():
    if i < VOCAB_SIZE and word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

model = Sequential()

model.add(Embedding(input_dim=VOCAB_SIZE,
                    output_dim=EMBEDDING_DIM,
                    weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))

model.add(LSTM(64, activation='tanh', return_sequences=True))
model.add(LSTM(64, activation='tanh'))
model.add(Dense(64, activation='relu'))

model.add(Dense(5, activation='sigmoid'))  # Sigmoid for multi-label classification

learning_rate = 0.001
optimizer = AdamW(learning_rate=learning_rate)

model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=40, batch_size=20, validation_data=(X_val, y_val),verbose=0)


val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=20)
#print(f'Validation Loss: {val_loss}')
#print(f'Validation Accuracy: {val_accuracy}')

test_url = 'https://raw.githubusercontent.com/debajyotimaz/nlp_assignment/main/test_split.csv'
test_data = pd.read_csv(test_url)

X_test = tokenizer.texts_to_sequences(test_data['text'])
X_test_padded = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

y_test = test_data[['Joy', 'Fear', 'Anger', 'Sadness', 'Surprise']].values

y_pred = model.predict(X_test_padded)

y_pred_bin = (y_pred > 0.5).astype(int)

f1 = f1_score(y_test, y_pred_bin, average='macro')
print(f'Average Macro F1 Score: {f1}')
'''

#RNN

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.optimizers import Adam, AdamW, SGD
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from gensim.models import Word2Vec

import random
import os
np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)
os.environ['TF_DETERMINISTIC_OPS'] = '1'

embedding_url = 'https://raw.githubusercontent.com/YashDhiman02/NLP_Assignment2_Embeddings/main/custom_word2vec.model'
word2vec_model = Word2Vec.load(embedding_url)

train_url = 'https://raw.githubusercontent.com/debajyotimaz/nlp_assignment/main/train_split.csv'
train_data = pd.read_csv(train_url)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 124
MAX_NB_WORDS = 10000
EMBEDDING_DIM = word2vec_model.vector_size

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train_data['text'])
X = tokenizer.texts_to_sequences(train_data['text'])

X_padded = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

y = train_data[['Joy', 'Fear', 'Anger', 'Sadness', 'Surprise']].values

X_train, X_val, y_train, y_val = train_test_split(X_padded, y, test_size=0.2, random_state=42)

VOCAB_SIZE = min(MAX_NB_WORDS, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

for word, i in tokenizer.word_index.items():
    if i < VOCAB_SIZE and word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

model = Sequential()

model.add(Embedding(input_dim=VOCAB_SIZE,
                    output_dim=EMBEDDING_DIM,
                    weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))

model.add(SimpleRNN(64, return_sequences=True, activation='relu'))
model.add(SimpleRNN(64, activation='relu'))

model.add(Dense(5, activation='sigmoid'))

learning_rate = 0.001
optimizer = AdamW(learning_rate=learning_rate)

model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=40, batch_size=20, validation_data=(X_val, y_val), verbose=0)


val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=20)
#print(f'Validation Loss: {val_loss}')
#print(f'Validation Accuracy: {val_accuracy}')

test_url = 'https://raw.githubusercontent.com/debajyotimaz/nlp_assignment/main/test_split.csv'
test_data = pd.read_csv(test_url)

X_test = tokenizer.texts_to_sequences(test_data['text'])
X_test_padded = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

y_test = test_data[['Joy', 'Fear', 'Anger', 'Sadness', 'Surprise']].values
y_pred = model.predict(X_test_padded)

y_pred_bin = (y_pred > 0.5).astype(int)

f1 = f1_score(y_test, y_pred_bin, average='macro')
print(f'Average Macro F1 Score: {f1}')



[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.3840 - loss: 0.8535
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step
Average Macro F1 Score: 0.2892302258191323
