<a href="https://colab.research.google.com/github/anjanadevi05/NLP_Assignment_2/blob/main/nlp_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Importing Dataset***

In [None]:
import pandas as pd

# Load the data
df = pd.read_csv("/content/ISEAR.csv", header=None)
df = df.drop(columns=df.columns[2])

# Rename columns
df.columns = ['emotion', 'text']

df.head()


Unnamed: 0,emotion,text
0,joy,On days when I feel close to my partner and ot...
1,fear,Every time I imagine that someone I love or I ...
2,anger,When I had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...


# ***Preprocessing***
1. Lowercasing

2. Removing punctuation

3. Removing numbers

4. Removing stopwords

5. Tokenization

6. Lemmatization

7. Stemming

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary nltk resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase
    text = text.lower()

    # Remove punctuation and digits
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return ' '.join(tokens)

# Apply preprocessing
df['Preprocessed_text'] = df['text'].apply(preprocess_text)

df[['emotion', 'Preprocessed_text']].head()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,emotion,Preprocessed_text
0,joy,day feel close partner friend feel peace also ...
1,fear,every time imagine someone love could contact ...
2,anger,obviously unjustly treated possibility elucida...
3,sadness,think short time live relate period life think...
4,disgust,gathering found involuntarily sitting next two...


# ***Feature Engineering***
1. Tokenization + Integer Encoding
2. Padding Sequences
3. Label Encoding
4. Pre-trained Embeddings (like GloVe)

In [None]:
!pip install tensorflow.keras

Collecting tensorflow.keras
  Downloading tensorflow_keras-0.1-py3-none-any.whl.metadata (63 bytes)
Downloading tensorflow_keras-0.1-py3-none-any.whl (5.2 kB)
Installing collected packages: tensorflow.keras
Successfully installed tensorflow.keras-0.1


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Initialize tokenizer
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df['Preprocessed_text'])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df['Preprocessed_text'])

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# Define max sequence length
max_length = max([len(seq) for seq in sequences])

# Pad sequences
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Encode emotions to integers
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['emotion'])

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2025-04-18 03:45:03--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-04-18 03:45:03--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-04-18 03:45:04--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
import numpy as np

# Load GloVe vectors
embedding_index = {}
with open("glove.6B.100d.txt", encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

print(f"Loaded {len(embedding_index)} word vectors from GloVe.")

# Create embedding matrix
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


Loaded 400000 word vectors from GloVe.


# **BILSTM**

In [None]:
!pip install tensorflow.keras

In [None]:
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

In [None]:
#split the data
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, encoded_labels, test_size=0.2, random_state=42)

In [None]:
model_bilstm = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=max_length,
              trainable=False),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
])

model_bilstm.compile(loss='sparse_categorical_crossentropy',
                     optimizer='adam',
                     metrics=['accuracy'])

In [None]:
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history_bilstm = model_bilstm.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/20
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.2095 - loss: 1.9434 - val_accuracy: 0.3508 - val_loss: 1.6722
Epoch 2/20
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.3862 - loss: 1.6232 - val_accuracy: 0.4530 - val_loss: 1.4953
Epoch 3/20
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.4667 - loss: 1.4463 - val_accuracy: 0.4830 - val_loss: 1.4133
Epoch 4/20
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.4831 - loss: 1.3957 - val_accuracy: 0.5145 - val_loss: 1.3599
Epoch 5/20
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.5127 - loss: 1.3050 - val_accuracy: 0.5129 - val_loss: 1.3185
Epoch 6/20
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.5473 - loss: 1.2611 - val_accuracy: 0.5303 - val_loss: 1.3053
Epoch 7/20
[1m76/76[0m [32m━━━━

In [None]:
loss, accuracy = model_bilstm.evaluate(X_test, y_test, verbose=1)
print(f'Test Accuracy (BiLSTM): {accuracy * 100:.2f}%')

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5509 - loss: 1.2618
Test Accuracy (BiLSTM): 54.12%


# ***Logistic Regression***

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [None]:
tfidf = TfidfVectorizer(max_features=8000)
X_tfidf = tfidf.fit_transform(df['Preprocessed_text'])

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['emotion'])

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42)


In [None]:
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

In [None]:
y_pred = lr_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print(classification_report(
    y_test, y_pred,
    labels=range(len(le.classes_)),
    target_names=le.classes_))

Accuracy: 58.18%
              precision    recall  f1-score   support

       anger       0.50      0.46      0.48       227
     disgust       0.50      0.60      0.55       204
        fear       0.62      0.69      0.65       200
       guilt       0.56      0.48      0.51       209
        guit       0.00      0.00      0.00         0
         joy       0.69      0.78      0.73       233
     sadness       0.62      0.60      0.61       205
       shame       0.57      0.46      0.51       226

    accuracy                           0.58      1504
   macro avg       0.51      0.51      0.51      1504
weighted avg       0.58      0.58      0.58      1504



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
