<a href="https://colab.research.google.com/github/nidjosep/student-feedback-analysis/blob/master/models/Model_2_Actionable_Insights_from_Student_Feedback_SA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import json
import re
import ssl
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Embedding,
    LSTM,
    GRU,
    Dense,
    Dropout,
    Bidirectional,
)
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from google.colab import drive


# Fall back to an unverified HTTPS context when this interpreter provides one,
# so the NLTK corpus download does not fail on certificate errors.
# NOTE(review): this replaces ssl's default HTTPS context factory for the
# whole process, so certificate checks are skipped for any code relying on
# that default, not only for NLTK. Confirm the workaround is still required.
_create_unverified_https_context = getattr(ssl, '_create_unverified_context', None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the NLTK stop-word corpus used by the text-cleaning helpers
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
# Expose Google Drive to this runtime; the dataset lives under MyDrive
drive.mount('/content/drive', force_remount=True)

# Raw emotion dataset; later cells read its 'Review' and 'Label' columns
df = pd.read_csv(
    '/content/drive/MyDrive/Teaching/TME_6015/Project/Emotion_final_combo.csv'
)

# English stop words, held in a set for the membership tests done while cleaning
stop_words = set(stopwords.words('english'))

Mounted at /content/drive


In [29]:
# Load the list of stopwords
# This rebuilds the same set the data-loading cell already bound to
# `stop_words`; the cleaning helper below reads this module-level name.
stop_words = set(stopwords.words('english'))

# Function to clean and preprocess text
def clean_text(text):
    """Reduce one review to a space-separated string of content words.

    The text is stripped of non-ASCII characters, then of everything that is
    not an ASCII letter or whitespace, and finally of English stop words
    (looked up in the module-level ``stop_words`` set). The negation ``not``
    is always kept. Letter case of the surviving words is left unchanged;
    only the stop-word comparison is case-insensitive.

    Args:
        text: Raw review text. Any non-``str`` value (for example the float
            ``NaN`` pandas produces for an empty CSV field, or ``None``) is
            treated as an empty review instead of raising.

    Returns:
        str: The cleaned text, or ``''`` when nothing survives the cleaning.
    """
    # Missing values are not iterable strings; map them to an empty review so
    # the cleaned column stays aligned row-for-row with the labels.
    if not isinstance(text, str):
        return ''

    # Words that must survive stop-word removal because they flip the meaning
    whitelist = {'not'}

    # Remove non-English (non-ASCII) characters
    ascii_only = ''.join(c for c in text if ord(c) < 128)

    # Remove digits, punctuation and other symbols; letters and whitespace stay
    letters_only = re.sub(r'[^a-zA-Z\s]', '', ascii_only)

    # Remove stop words, keeping whitelisted ones
    kept_words = [
        word
        for word in letters_only.split()
        if word.lower() in whitelist or word.lower() not in stop_words
    ]
    return ' '.join(kept_words)

# Collapse the dataset's fine-grained emotion labels into coarser feedback
# categories. Labels without an entry here are kept unchanged.
# NOTE(review): the training output lists an 'angry' class, so the dataset
# appears to contain the label 'angry', which the 'anger' entry does not
# cover. Confirm whether 'angry' should map to 'frustrated' as well.
label_mapping = {
    "love": "pleased",
    "surprise": "happy",
    "fear": "frustrated",
    "anger": "frustrated",
    "sadness": "sad",
    "disgust": "sad",
}


def update_label(label):
    """Return the coarse feedback category for a raw emotion label.

    Used to build the 'Updated_Label' column.

    Args:
        label: Raw label value from the dataset.

    Returns:
        The mapped category when ``label`` is a key of ``label_mapping``;
        otherwise ``label`` itself, unchanged.
    """
    return label_mapping.get(label, label)


# Clean every review before tokenisation
texts = df['Review'].apply(clean_text)

# Collapse the raw emotion labels and keep the result on the frame
labels = df['Label'].apply(update_label)
df['Updated_Label'] = labels

# Encode the string labels as integer class ids
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Class name -> integer id, printed for reference. It is stored under its own
# name because update_label() looks up the module-level `label_mapping` on
# every call; rebinding that name here would silently change what
# update_label() returns afterwards.
class_to_index = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping: ", class_to_index)

# The output layer gets exactly one unit per class seen by the encoder. A
# hard-coded, larger size leaves an output that never appears as a target,
# and if argmax ever selected it, label_encoder.inverse_transform would fail.
num_classes = len(label_encoder.classes_)

# Hyperparameters
vocab_size = 10000  # Number of unique words in the vocabulary
embedding_dim = 100  # Number of dimensions for each word vector
max_length = 100    # Max length of each input sequence
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"   # Token for out-of-vocabulary words
random_seed = 42    # Shared by the data split and TensorFlow's random ops

# Tokenize the text and pad/truncate every sequence to max_length
# NOTE(review): the vocabulary is fitted on all texts, including the ones
# that end up in the validation split.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Split the data into training and validation sets (80 / 20)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    padded, encoded_labels, test_size=0.2, random_state=random_seed
)

# Seed TensorFlow so weight initialisation and dropout are repeatable
# between runs (GPU kernels may still introduce small differences).
tf.random.set_seed(random_seed)

# Embedding -> two stacked bidirectional GRUs -> regularised dense head
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(GRU(64, return_sequences=True)),
    Bidirectional(GRU(32)),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Adam with an explicitly chosen learning rate
optimizer = Adam(learning_rate=0.0005)

# Labels are integer ids, hence the sparse variant of the cross-entropy loss
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Stop once val_loss has not improved for 4 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)

history = model.fit(
    train_texts,
    train_labels,
    epochs=20,
    batch_size=64,
    validation_data=(val_texts, val_labels),
    callbacks=[early_stopping],
)

Label Mapping:  {'angry': 0, 'frustrated': 1, 'happy': 2, 'pleased': 3, 'sad': 4}
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


In [30]:
def predict_sentiment(text):
    """Predict the feedback category for one piece of text.

    The text is encoded with the fitted module-level ``tokenizer``, padded
    with the same settings used for the training sequences, and passed to
    the trained ``model``.

    Args:
        text: The text to classify. It is wrapped in a one-element list
            before tokenisation, so pass a single string here.
            NOTE(review): a list argument is presumably treated by the
            Keras tokenizer as already-tokenised input; confirm before
            passing anything other than ``str``.

    Returns:
        The class name obtained by mapping the arg-max of the model output
        back through ``label_encoder``.
    """
    # Encode the single text and bring it to the fixed input length
    encoded = tokenizer.texts_to_sequences([text])
    model_input = pad_sequences(
        encoded,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )

    # Highest-scoring class id -> original label name
    class_scores = model.predict(model_input)
    best_class = np.argmax(class_scores)
    return label_encoder.inverse_transform([best_class])[0]

# Example usage
sentence = "Valuing the constructive feedback on my work"  # Replace with your sentence
li =[sentence]
predicted_sentiment = predict_sentiment(remove_stop_words_from_list(li))
print(f"The predicted sentiment is: {predicted_sentiment}")

The predicted sentiment is: happy


In [31]:
# predict with test dataset
def remove_stop_words_from_list(texts):
    """Strip symbols and English stop words from each text in ``texts``.

    For every text, all characters other than ASCII letters and whitespace
    are removed, then words found in the module-level ``stop_words`` set are
    dropped (compared case-insensitively), except the whitelisted ``not``.

    Args:
        texts: Iterable of strings to process.

    Returns:
        list[str]: One cleaned, single-space-joined string per input text,
        in the same order.
    """
    # Words that are kept even though they are stop words
    whitelist = ['not']

    def _strip(text):
        # Keep letters and whitespace only, then filter word by word
        letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
        survivors = []
        for word in letters_only.split():
            lowered = word.lower()
            if not (lowered in stop_words and lowered not in whitelist):
                survivors.append(word)
        return ' '.join(survivors)

    return [_strip(text) for text in texts]

def read_json(file_path):
    """Load and return the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, as returned by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Pin the encoding: without it open() falls back to the platform locale,
    # which can mis-decode non-ASCII text in a UTF-8 JSON file.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Load the test set; the values of the JSON object are the messages to classify
test_data_dict = read_json('/content/drive/MyDrive/Teaching/TME_6015/Project/test_dataset_all_models.json')  # Make sure this path is correct

# Clean the messages before they are tokenised
test_messages = remove_stop_words_from_list(list(test_data_dict.values()))

# Predict one label per message, keyed by its 1-based position as a string
predicted_sentiments = {}
for position, message in enumerate(test_messages, start=1):
    predicted_sentiments[str(position)] = predict_sentiment(message)

# Preview the first six cleaned messages with their predictions
for message_num, message in enumerate(test_messages[:6], start=1):
    predicted_feeling = predicted_sentiments[str(message_num)]
    print(f"Message {message_num}: {message}")  # Print the message
    print(f"Predicted Feeling: {predicted_feeling}\n")

# Write the dictionary to a JSON file
with open('/content/drive/MyDrive/Teaching/TME_6015/Project/emotions.json', 'w') as json_file:
    json.dump(predicted_sentiments, json_file, indent=4)

print("Sentiments written to emotions.json")

Message 1: Excited apply theoretical knowledge gained awesome lectures
Predicted Feeling: happy

Message 2: lecture fine lab schedule inconvenient inflexible
Predicted Feeling: happy

Message 3: Excited practical applications taught class
Predicted Feeling: happy

Message 4: Struggling disorganized course structure
Predicted Feeling: frustrated

Message 5: Valuing constructive feedback work
Predicted Feeling: pleased

Message 6: Pleased collaborative environment lab
Predicted Feeling: happy

Sentiments written to emotions.json
