<a href="https://colab.research.google.com/github/robitussin/CCDEPLRL_EXERCISES/blob/main/Exercise6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 6

In [61]:
!pip install scikit-learn



In [62]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils import shuffle

In [63]:
import numpy as np
import pandas as pd

# Review corpus (JSON) served from GitHub; the URL is pinned to a specific commit
# hash so the data cannot change underneath the notebook.
path = (
    "https://github.com/robitussin/CCDEPLRL_EXERCISES/blob/"
    "9b8ac1c5683abecc144f0af47eb7cda0688e12b7/dataset/reviews.json?raw=true"
)

# Download and parse the file into a DataFrame.
dataset = pd.read_json(path)

In [64]:
# Preview the first five rows to sanity-check the loaded columns.
dataset.head(n=5)

Unnamed: 0,review,rating
0,sir okay armygreen shorts nice,5
1,di pareha yong mga size nila may sobrang liit ...,5
2,super worth it ang ganda Sombra grabi order na...,5
3,ganda po salamat,5
4,maayos pagkadeliver maganda den sya,5


## 1. Tokenize the data

In [65]:
# Binary target: a rating above 3 becomes 1, anything else becomes 0
# (move the cut-off if a different split is wanted).
labels = (dataset["rating"].to_numpy() > 3).astype(int)

# Learn the vocabulary from every review; words not seen during fitting
# are mapped to the "<OOV>" token later on.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(dataset["review"])
word_index = tokenizer.word_index

# Quick look at the vocabulary size and its first entries.
print(f"{len(word_index)} words in dictionary")
print(f"First 10 words: {dict(list(word_index.items())[:10])}")



3537 words in dictionary
First 10 words: {'<OOV>': 1, 'na': 2, 'the': 3, 'ko': 4, 'i': 5, 'ang': 6, 'sa': 7, 'yung': 8, 'and': 9, 'ng': 10}


## 2. Sequence the data

In [66]:
# Padding configuration used by the padding step below.
max_length = 200  # cap on tokens per review; kept small to speed up training
padding_type, trunc_type = 'post', 'post'  # pad and truncate at the end of each sequence

# Replace every review with the list of integer ids of its words.
sequences = tokenizer.texts_to_sequences(dataset.review)



## 3. Pad the data

In [67]:
# Bring every sequence to exactly `max_length` tokens.
padded_unshuffled = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type
)

# Re-derive the labels from the raw ratings (same `> 3` rule as the tokenizing
# step) instead of reading the existing `labels` variable: this cell overwrites
# `labels` with a shuffled copy, so re-running it would otherwise shuffle the
# already-shuffled labels against freshly padded (unshuffled) reviews and
# silently misalign them.
labels_unshuffled = np.asarray((dataset.rating.values > 3).astype(int))
assert len(padded_unshuffled) == len(labels_unshuffled), "reviews and labels differ in length"

# Shuffle both arrays with the same permutation; the fixed seed keeps it reproducible.
padded_sequences, labels = shuffle(padded_unshuffled, labels_unshuffled, random_state=42)

## 4. Train a sentiment model

In [None]:
embedding_dim = 64
# Take the input width from the padded data itself so the model can never
# disagree with the padding step.
max_length = padded_sequences.shape[1]
# +1 because tokenizer indices start at 1; index 0 is the padding value.
vocab_size = len(word_index) + 1

# Hold out the last 20% for validation. The arrays were already shuffled
# (with a fixed seed) in the padding step, so a plain slice is a random split.
split_index = int(len(padded_sequences) * 0.8)
train_padded, val_padded = padded_sequences[:split_index], padded_sequences[split_index:]
train_labels, val_labels = labels[:split_index], labels[split_index:]

# Two Conv1D/MaxPooling stages over the embedded tokens, followed by a small
# dense head; L2 penalties and dropout are there to limit overfitting.
# NOTE(review): `input_length` is reported as deprecated by Keras 3 — confirm
# against the installed version before removing it.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),

    tf.keras.layers.Conv1D(128, 5, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    tf.keras.layers.MaxPooling1D(pool_size=2),

    tf.keras.layers.Conv1D(128, 5, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    tf.keras.layers.MaxPooling1D(pool_size=2),

    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.3),

    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    tf.keras.layers.Dropout(0.3),

    # Single sigmoid unit: the output is a score in [0, 1] for the positive class.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-4)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()

# Stop once validation loss has not improved for 5 epochs and roll back to the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    train_padded,
    train_labels,
    epochs=20,
    batch_size=64,     # larger batch size for faster training
    validation_data=(val_padded, val_labels),
    callbacks=[early_stop],
    verbose=1
)




Epoch 1/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 1s/step - accuracy: 0.4610 - loss: 1.0114 - val_accuracy: 0.4776 - val_loss: 0.9251
Epoch 2/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 1s/step - accuracy: 0.5261 - loss: 0.9061 - val_accuracy: 0.4776 - val_loss: 0.8625
Epoch 3/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 908ms/step - accuracy: 0.4753 - loss: 0.8521 - val_accuracy: 0.4776 - val_loss: 0.8226
Epoch 4/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 976ms/step - accuracy: 0.4878 - loss: 0.8146 - val_accuracy: 0.4776 - val_loss: 0.7937
Epoch 5/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 1s/step - accuracy: 0.4912 - loss: 0.7871 - val_accuracy: 0.4776 - val_loss: 0.7721
Epoch 6/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 889ms/step - accuracy: 0.5056 - loss: 0.7658 - val_accuracy: 0.4776 - val_loss: 0.7550
Epoch 7/20


In [None]:
import matplotlib.pyplot as plt

# One figure per metric; each row is
# (train key, validation key, train label, validation label, title, y-axis label).
curves = (
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy',
     'Model Accuracy Over Epochs', 'Accuracy'),
    ('loss', 'val_loss', 'Train Loss', 'Validation Loss',
     'Model Loss Over Epochs', 'Loss'),
)

for train_key, val_key, train_label, val_label, title, y_label in curves:
    fig, ax = plt.subplots(figsize=(8, 5))
    # Training curve in blue, validation curve in orange.
    ax.plot(history.history[train_key], label=train_label, color='blue')
    ax.plot(history.history[val_key], label=val_label, color='orange')
    ax.set_title(title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()

## Get files for visualizing the network

In [None]:


import io

# The Embedding layer is the first layer of the Sequential model; its single
# weight matrix holds one row per token id.
embedding_layer = model.layers[0]
weights = embedding_layer.get_weights()[0]

# Write one line per vocabulary word: the word itself to meta.tsv and its
# tab-separated vector to vecs.tsv (same order in both files). The `with`
# block guarantees both files are closed even if the loop raises.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  for word, i in tokenizer.word_index.items():
    vec = weights[i]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")

# ## Download files
# Only offered when running inside Google Colab; elsewhere the files simply
# stay in the working directory.
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

## 5. Predict sentiment with new reviews

In [None]:
# Predict sentiment for new Tagalog reviews
new_reviews = [
    "Ang Pangit ng service, hindi na ako uulit, bobo nyo tangina",  # expected: negative
    "Pakyu, tagal dumating ng order ko, hindi na ako bibili dito", # expected: negative
    "Sakto lang"                                              # expected: neutral
]

# Preprocess the new reviews exactly like the training data: same tokenizer,
# same length, same padding/truncation side.
new_sequences = tokenizer.texts_to_sequences(new_reviews)
new_padded = pad_sequences(new_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Predict the sentiment; each row of `predictions` holds the single sigmoid score.
predictions = model.predict(new_padded)

# Scores above 0.6 are reported as positive, below 0.40 as negative, and
# anything in between as neutral.
for review, prediction in zip(new_reviews, predictions):
    sentiment_score = prediction[0]
    if sentiment_score > 0.6:
        sentiment = "Positive"
    elif sentiment_score < 0.40:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    print(f"Review: '{review}' -> Sentiment: {sentiment} (Score: {sentiment_score:.4f})")
