<a href="https://colab.research.google.com/github/albertocj1/CCDEPLRL_EXERCISES_COM221/blob/main/Exercise6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 6

In [4]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
import numpy as np
import pandas as pd

path = "https://github.com/robitussin/CCDEPLRL_EXERCISES/blob/9b8ac1c5683abecc144f0af47eb7cda0688e12b7/dataset/reviews.json?raw=true"

dataset = pd.read_json(path)

In [6]:
dataset.head()

Unnamed: 0,review,rating
0,sir okay armygreen shorts nice,5
1,di pareha yong mga size nila may sobrang liit ...,5
2,super worth it ang ganda Sombra grabi order na...,5
3,ganda po salamat,5
4,maayos pagkadeliver maganda den sya,5


In [7]:
dataset['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
1,299
5,249
4,249
3,123
2,81


In [8]:
dataset.rename(columns={'rating': 'sentiment'}, inplace=True)

def convert_rating_to_sentiment(rating):
  if rating >= 3:
    return 1  # Positive
  else:
    return 0  # Negative

dataset['sentiment'] = dataset['sentiment'].apply(convert_rating_to_sentiment)

dataset.head()
dataset.info()
dataset['sentiment'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     1001 non-null   object
 1   sentiment  1001 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,621
0,380


In [9]:
dataset.head()

Unnamed: 0,review,sentiment
0,sir okay armygreen shorts nice,1
1,di pareha yong mga size nila may sobrang liit ...,1
2,super worth it ang ganda Sombra grabi order na...,1
3,ganda po salamat,1
4,maayos pagkadeliver maganda den sya,1


## 1. Tokenize the data

In [10]:
vocab_size = 5000
oov_token = ""

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(dataset['review'])

## 2. Sequence the data

In [11]:
sequences = tokenizer.texts_to_sequences(dataset['review'])

In [12]:
max_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

## 3. Pad the data

In [13]:
max_length = 50
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')


## 4. Train a sentiment model

In [14]:
X = padded_sequences
y = np.array(dataset['sentiment'])

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 32, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=80, validation_data=(X_val, y_val))





Epoch 1/80
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.5829 - loss: 0.6844 - val_accuracy: 0.6219 - val_loss: 0.6680
Epoch 2/80
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6155 - loss: 0.6626 - val_accuracy: 0.6219 - val_loss: 0.6642
Epoch 3/80
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6386 - loss: 0.6518 - val_accuracy: 0.6169 - val_loss: 0.6550
Epoch 4/80
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6523 - loss: 0.6351 - val_accuracy: 0.6318 - val_loss: 0.6378
Epoch 5/80
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6808 - loss: 0.6139 - val_accuracy: 0.6368 - val_loss: 0.6173
Epoch 6/80
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7202 - loss: 0.5800 - val_accuracy: 0.6667 - val_loss: 0.5859
Epoch 7/80
[1m25/25[0m [32m━━━━━━━━━

## Get files for visualing the network

In [15]:

# First get the weights of the embedding layer
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(5000, 32)


## 5. Predict sentiment with new reviews

In [16]:
new_reviews = [
    "ang panget ng order ko",
    "ang ganda ng nabili kong buhok",
    "sakto sakin yung shirt na XXXL",
]

new_sequences = tokenizer.texts_to_sequences(new_reviews)
new_padded = pad_sequences(new_sequences, maxlen=max_length, padding='post', truncating='post')

predictions = model.predict(new_padded)

for i, review in enumerate(new_reviews):
    sentiment = "Positive" if predictions[i] >= 0.5 else "Negative"
    print(f"Review: {review}\nSentiment: {sentiment} (Confidence: {predictions[i][0]:.2f})\n")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 246ms/step
Review: ang panget ng order ko
Sentiment: Negative (Confidence: 0.00)

Review: ang ganda ng nabili kong buhok
Sentiment: Positive (Confidence: 0.91)

Review: sakto sakin yung shirt na XXXL
Sentiment: Positive (Confidence: 0.97)

