**<h1>MISC</h1>**

In [None]:
pip install vaderSentiment

In [None]:
pip install afinn

In [None]:
pip install textblob

In [None]:
pip install scikit-learn

**<h1>IMPORTS</h1>**

In [1]:
import random
import emoji
import pandas as pd
import numpy as np
import tensorflow as tf

from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from afinn import Afinn
from nltk.corpus import wordnet

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

from tensorflow.keras.utils import pad_sequences
from tensorflow.keras import layers, models
from tensorflow.keras.models import load_model

**<h1>DATASETS</h1>**

In [None]:
df = pd.read_csv('/kaggle/input/datasets/choonkhonng/malaysia-restaurant-review-datasets/Malaysia Restaurant Review Datasets/data_cleaned/TripAdvisor_data_cleaned.csv')
df.head(5)

In [None]:
df2.columns

In [None]:
df_new = pd.read_csv('/kaggle/input/datasets/choonkhonng/malaysia-restaurant-review-datasets/Malaysia Restaurant Review Datasets/data_cleaned/TripAdvisor_data_cleaned.csv',
                      usecols=['Title', 'Review', 'Restaurant'])
df_new.head(5)

In [None]:
df_clean = df_new.dropna()
print(len(df_clean))

In [None]:
regex = r'[^a-zA-Z0-9\s]'

df_clean['Review'] = df_clean['Review'].str.replace(regex, '', regex=True)

In [None]:
df_clean['Review'] = df_clean['Review'].apply(lambda text: ''.join(c for c in text if c not in emoji.EMOJI_DATA))

In [None]:
df_clean['Review'] = df_clean['Review'].str.strip()
df_clean['Review'] = df_clean['Review'].str.lower()

In [None]:
df_clean['Caption'][2]

In [None]:
df['Caption'][2]

**<h3>Labeling</h3>**

In [2]:
# TextBlob
def textblob_sentiment(text):
    blob = TextBlob(str(text))
    polarity = blob.sentiment.polarity
    if polarity >= 0:
        return 'positive'
    else:
        return 'negative'

In [3]:
# Vader
def vader_sentiment(text):
    scores = SentimentIntensityAnalyzer().polarity_scores(str(text))
    compound = scores['compound']
    if compound >= 0:
        return 'positive'
    else:
        return 'negative'

In [4]:
# AFINN
def afinn_sentiment(text):
    scores = Afinn().score(str(text))
    if scores >= 0:
        return 'positive'
    else:
        return 'negative'

In [None]:
df_clean['TextBlob_Label'] = df_clean['Review'].apply(textblob_sentiment)
df_clean['Vader_Label'] = df_clean['Review'].apply(vader_sentiment)
df_clean['Afinn_Label'] = df_clean['Review'].apply(afinn_sentiment)

In [None]:
print(df_clean['TextBlob_Label'].value_counts())
print(df_clean['Vader_Label'].value_counts())
print(df_clean['Afinn_Label'].value_counts())

In [None]:
df_clean.to_csv('/kaggle/working/sentiment_results.csv', index=False)

**<h3>Synonym Augmentation</h3>**

In [2]:
df = pd.read_csv('./sentiment_results.csv')
df.head(2)

Unnamed: 0,Title,Review,Restaurant,TextBlob_Label,Vader_Label,Afinn_Label
0,Chambers Bar and Grill,david thanga mikail and chef steven gave brill...,Chambers Grill,positive,positive,positive
1,Always a pleasure. Place to celebrate.,we visited for family celebration and as usual...,Chambers Grill,positive,positive,positive


In [5]:
def synonym_replacement(sentence, n=1):
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set(words))
    random.shuffle(random_word_list)
    
    num_replaced = 0
    for word in random_word_list:
        synonyms = wordnet.synsets(word)
        if synonyms:
            synonym_words = set()
            for syn in synonyms:
                for lemma in syn.lemmas():
                    synonym_words.add(lemma.name())
            synonym_words.discard(word)
            if synonym_words:
                synonym = random.choice(list(synonym_words))
                new_words = [synonym if w == word else w for w in new_words]
                num_replaced += 1
            if num_replaced >= n:
                break
    return " ".join(new_words)

In [6]:
def balance_sentiment(df, sentiment_col, text_col):
    counts = df[sentiment_col].value_counts()
    min_class = counts.idxmin()
    max_class = counts.idxmax()
    diff = counts[max_class] - counts[min_class]

    minority_df = df[df[sentiment_col] == min_class]

    augmented_rows = minority_df.sample(diff, replace=True).copy()
    augmented_rows[text_col] = augmented_rows[text_col].apply(lambda x: synonym_replacement(x, n=2))

    balanced_df = pd.concat([df, augmented_rows], ignore_index=True)
    return balanced_df

In [5]:
balanced_textblob = balance_sentiment(df, 'TextBlob_Label', 'Review')
balanced_vader = balance_sentiment(df, 'Vader_Label', 'Review')
balanced_afinn = balance_sentiment(df, 'Afinn_Label', 'Review')

In [6]:
print(balanced_textblob['TextBlob_Label'].value_counts())
print(balanced_vader['Vader_Label'].value_counts())
print(balanced_afinn['Afinn_Label'].value_counts())

TextBlob_Label
positive    129887
negative    129887
Name: count, dtype: int64
Vader_Label
positive    130348
negative    130348
Name: count, dtype: int64
Afinn_Label
positive    132981
negative    132981
Name: count, dtype: int64


**<h1>TOKENIZATION</h1>**

In [7]:
vectorizer = CountVectorizer(max_features=5000)

In [8]:
# Bag of Words
tb_bow = balanced_textblob['Review'].astype(str).tolist()
vader_bow = balanced_vader['Review'].astype(str).tolist()
afinn_bow = balanced_afinn['Review'].astype(str).tolist()

In [9]:
X_tb_bow = vectorizer.fit_transform(tb_bow).toarray()
X_vader_bow = vectorizer.fit_transform(vader_bow).toarray()
X_afinn_bow = vectorizer.fit_transform(afinn_bow).toarray()

In [10]:
balanced_textblob['TextBlob_Label'] = balanced_textblob['TextBlob_Label'].replace({'positive': 1, 'negative': 0})
balanced_vader['Vader_Label'] = balanced_vader['Vader_Label'].replace({'positive': 1, 'negative': 0})
balanced_afinn['Afinn_Label'] = balanced_afinn['Afinn_Label'].replace({'positive': 1, 'negative': 0})

  balanced_textblob['TextBlob_Label'] = balanced_textblob['TextBlob_Label'].replace({'positive': 1, 'negative': 0})
  balanced_vader['Vader_Label'] = balanced_vader['Vader_Label'].replace({'positive': 1, 'negative': 0})
  balanced_afinn['Afinn_Label'] = balanced_afinn['Afinn_Label'].replace({'positive': 1, 'negative': 0})


In [11]:
y_tb_bow = LabelEncoder().fit_transform(balanced_textblob['TextBlob_Label'])
y_vader_bow = LabelEncoder().fit_transform(balanced_vader['Vader_Label'])
y_afinn_bow = LabelEncoder().fit_transform(balanced_afinn['Afinn_Label'])

In [12]:
X_tb_bow_seq = pad_sequences(X_tb_bow, maxlen=150, padding='post', truncating='post', value=0)
X_vader_bow_seq = pad_sequences(X_vader_bow, maxlen=150, padding='post', truncating='post', value=0)
X_afinn_bow_seq = pad_sequences(X_afinn_bow, maxlen=150, padding='post', truncating='post', value=0)

**<h1>TRAINING</h1>**

**<h3>CNN VADER</h3>**

In [13]:
model = models.Sequential([
    layers.Embedding(input_dim=5000, output_dim=100, input_length=150),
    layers.Conv1D(1024, 3, padding='valid', activation='relu', strides=1),
    layers.GlobalMaxPooling1D(),
    
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    
    layers.Dense(2048, activation='relu'),
    
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    
    layers.Dense(1, activation='sigmoid') 
])

In [14]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 100)          500000    
                                                                 
 conv1d (Conv1D)             (None, 148, 1024)         308224    
                                                                 
 global_max_pooling1d (Globa  (None, 1024)             0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 batch_normalization (BatchN  (None, 1024)             4096      
 ormalization)                                                   
                                                                 
 dropout_1 (Dropout)         (None, 1024)              0

In [15]:
model.fit(X_vader_bow_seq, y_vader_bow, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1e3d08a6bc0>

In [17]:
model.save('./demo/cnn_vader_bow_50.keras')

In [18]:
loaded_model = load_model('./demo/cnn_vader_bow_50.keras') 

**<h3>CNN AFINN</h3>**

In [13]:
model_afinn = models.Sequential([
    layers.Embedding(input_dim=5000, output_dim=100, input_length=150),
    layers.Conv1D(1024, 3, padding='valid', activation='relu', strides=1),
    layers.GlobalMaxPooling1D(),
    
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    
    layers.Dense(2048, activation='relu'),
    
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    
    layers.Dense(1, activation='sigmoid') 
])

In [14]:
model_afinn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_afinn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 100)          500000    
                                                                 
 conv1d (Conv1D)             (None, 148, 1024)         308224    
                                                                 
 global_max_pooling1d (Globa  (None, 1024)             0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 batch_normalization (BatchN  (None, 1024)             4096      
 ormalization)                                                   
                                                                 
 dropout_1 (Dropout)         (None, 1024)              0

In [15]:
model_afinn.fit(X_afinn_bow_seq, y_afinn_bow, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x18d9c366a10>

In [16]:
model_afinn.save('./demo/cnn_afinn_bow_50.keras')

In [None]:
loaded_model = load_model('./demo/cnn_afinn_bow_50.keras') 

**<h3>CNN TEXTBLOB</h3>**

In [13]:
model_tb = models.Sequential([
    layers.Embedding(input_dim=5000, output_dim=100, input_length=150),
    layers.Conv1D(1024, 3, padding='valid', activation='relu', strides=1),
    layers.GlobalMaxPooling1D(),
    
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    
    layers.Dense(2048, activation='relu'),
    
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    
    layers.Dense(1, activation='sigmoid') 
])

In [14]:
model_tb.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_tb.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 100)          500000    
                                                                 
 conv1d (Conv1D)             (None, 148, 1024)         308224    
                                                                 
 global_max_pooling1d (Globa  (None, 1024)             0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 batch_normalization (BatchN  (None, 1024)             4096      
 ormalization)                                                   
                                                                 
 dropout_1 (Dropout)         (None, 1024)              0

In [15]:
model_tb.fit(X_tb_bow_seq, y_tb_bow, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x210b7f937c0>

In [16]:
model_tb.save('./demo/cnn_tb_bow_50.keras')

**<h1>MODEL EVALUATION</h1>**

In [8]:
df_test = pd.read_csv('./TRIPADVISOR_FOOD_SENTIMENTS_SARAWAK.csv', encoding='latin1', usecols=['Restaurant_Name', 'Review_Title', 'Caption'])
df_test.head(5)

Unnamed: 0,Restaurant_Name,Review_Title,Caption
0,Balkanico,Lovely atmosphere and great pizza,Lovely atmosphere and great pizza We had a pep...
1,Balkanico,"Great food, relaxed atmosphere",The food was delicious and the service good Ju...
2,Balkanico,Great thin crust pizza,This isnÃ¢ÂÂt really a restaurant more of a ...
3,Balkanico,Nice pizza and Tuak,The cafe is opposite a Chinese temle it is act...
4,Balkanico,Excellent,I had the beef pepperoni pizza (25RM) with fri...


In [9]:
df_test_clean = df_test.dropna()
print(len(df_test_clean))

1444


In [10]:
regex = r'[^a-zA-Z0-9\s]'

df_test_clean['Caption'] = df_test_clean['Caption'].str.replace(regex, '', regex=True)

In [11]:
df_test_clean['Caption'] = df_test_clean['Caption'].apply(lambda text: ''.join(c for c in text if c not in emoji.EMOJI_DATA))

In [12]:
df_test_clean['Caption'] = df_test_clean['Caption'].str.strip()
df_test_clean['Caption'] = df_test_clean['Caption'].str.lower()

In [13]:
df_test_clean['Vader_Label'] = df_test_clean['Caption'].apply(vader_sentiment)
df_test_clean['Afinn_Label'] = df_test_clean['Caption'].apply(afinn_sentiment)
df_test_clean['TextBlob_Label'] = df_test_clean['Caption'].apply(textblob_sentiment)

In [14]:
print(df_test_clean['TextBlob_Label'].value_counts())
print(df_test_clean['Vader_Label'].value_counts())
print(df_test_clean['Afinn_Label'].value_counts())

TextBlob_Label
positive    1346
negative      98
Name: count, dtype: int64
Vader_Label
positive    1350
negative      94
Name: count, dtype: int64
Afinn_Label
positive    1377
negative      67
Name: count, dtype: int64


In [15]:
balanced_test_textblob = balance_sentiment(df_test_clean, 'TextBlob_Label', 'Caption')
balanced_test_vader = balance_sentiment(df_test_clean, 'Vader_Label', 'Caption')
balanced_test_afinn = balance_sentiment(df_test_clean, 'Afinn_Label', 'Caption')

In [16]:
print(balanced_test_textblob['TextBlob_Label'].value_counts())
print(balanced_test_vader['Vader_Label'].value_counts())
print(balanced_test_afinn['Afinn_Label'].value_counts())

TextBlob_Label
positive    1346
negative    1346
Name: count, dtype: int64
Vader_Label
positive    1350
negative    1350
Name: count, dtype: int64
Afinn_Label
positive    1377
negative    1377
Name: count, dtype: int64


In [17]:
# Bag of Words
test_tb_bow = balanced_test_textblob['Caption'].astype(str).tolist()
test_vader_bow = balanced_test_vader['Caption'].astype(str).tolist()
test_afinn_bow = balanced_test_afinn['Caption'].astype(str).tolist()

In [18]:
X_test_tb_bow = vectorizer.fit_transform(test_tb_bow).toarray()
X_test_vader_bow = vectorizer.fit_transform(test_vader_bow).toarray()
X_test_afinn_bow = vectorizer.fit_transform(test_afinn_bow).toarray()

In [19]:
balanced_test_textblob['TextBlob_Label'] = balanced_test_textblob['TextBlob_Label'].replace({'positive': 1, 'negative': 0})
balanced_test_vader['Vader_Label'] = balanced_test_vader['Vader_Label'].replace({'positive': 1, 'negative': 0})
balanced_test_afinn['Afinn_Label'] = balanced_test_afinn['Afinn_Label'].replace({'positive': 1, 'negative': 0})

  balanced_test_textblob['TextBlob_Label'] = balanced_test_textblob['TextBlob_Label'].replace({'positive': 1, 'negative': 0})
  balanced_test_vader['Vader_Label'] = balanced_test_vader['Vader_Label'].replace({'positive': 1, 'negative': 0})
  balanced_test_afinn['Afinn_Label'] = balanced_test_afinn['Afinn_Label'].replace({'positive': 1, 'negative': 0})


In [20]:
y_test_tb_bow = LabelEncoder().fit_transform(balanced_test_textblob['TextBlob_Label'])
y_test_vader_bow = LabelEncoder().fit_transform(balanced_test_vader['Vader_Label'])
y_test_afinn_bow = LabelEncoder().fit_transform(balanced_test_afinn['Afinn_Label'])

In [21]:
X_test_tb_bow_seq = pad_sequences(X_test_tb_bow, maxlen=150, padding='post', truncating='post', value=0)
X_test_vader_bow_seq = pad_sequences(X_test_vader_bow, maxlen=150, padding='post', truncating='post', value=0)
X_test_afinn_bow_seq = pad_sequences(X_test_afinn_bow, maxlen=150, padding='post', truncating='post', value=0)

**<h3>CNN VADER EVALUATION</h3>**

In [22]:
loaded_model_vader = load_model('./demo/cnn_vader_bow_50.keras') 

In [23]:
y_pred_proba_vader = loaded_model_vader.predict(X_test_vader_bow_seq)
y_pred_vader = (y_pred_proba_vader > 0.5).astype(int).flatten()



In [24]:
f1 = f1_score(y_test_vader_bow, y_pred_vader, average='binary')
accuracy = accuracy_score(y_test_vader_bow, y_pred_vader)
precision =precision_score(y_test_vader_bow, y_pred_vader)
recall = recall_score(y_test_vader_bow, y_pred_vader)

loaded_model_vader = {
    'f1_score': f1,
    'accuracy': accuracy,
    'predictions': y_pred_vader,
    'precision': y_pred_vader,
    'recall': y_pred_vader,
    'true_labels': y_test_vader_bow
}

print('VADER Results:')
print('F1:', f1)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)

VADER Results:
F1: 0.6678235002478929
Accuracy: 0.5037037037037037
Precision: 0.5018628912071535
Recall: 0.9977777777777778


**<h3>CNN AFINN EVALUATION</h3>**

In [25]:
loaded_model_afinn = load_model('./demo/cnn_afinn_bow_50.keras') 

In [26]:
y_pred_proba_afinn = loaded_model_afinn.predict(X_test_afinn_bow_seq)
y_pred_afinn = (y_pred_proba_afinn > 0.5).astype(int).flatten()



In [27]:
f1 = f1_score(y_test_afinn_bow, y_pred_afinn, average='binary')
accuracy = accuracy_score(y_test_afinn_bow, y_pred_afinn)
precision =precision_score(y_test_afinn_bow, y_pred_afinn)
recall = recall_score(y_test_afinn_bow, y_pred_afinn)

loaded_model_vader = {
    'f1_score': f1,
    'accuracy': accuracy,
    'predictions': y_pred_afinn,
    'precision': y_pred_afinn,
    'recall': y_pred_afinn,
    'true_labels': y_test_afinn_bow
}

print('AFINN Results:')
print('F1:', f1)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)

AFINN Results:
F1: 0.665374363944754
Accuracy: 0.49854756717501814
Precision: 0.49927272727272726
Recall: 0.9970951343500363


**<h3>CNN TEXTBLOB EVALUATION</h3>**

In [28]:
loaded_model_tb = load_model('./demo/cnn_tb_bow_50.keras') 

In [29]:
y_pred_proba_tb = loaded_model_tb.predict(X_test_tb_bow_seq)
y_pred_tb = (y_pred_proba_tb > 0.5).astype(int).flatten()



In [30]:
f1 = f1_score(y_test_tb_bow, y_pred_tb, average='binary')
accuracy = accuracy_score(y_test_tb_bow, y_pred_tb)
precision =precision_score(y_test_tb_bow, y_pred_tb)
recall = recall_score(y_test_tb_bow, y_pred_tb)

loaded_model_vader = {
    'f1_score': f1,
    'accuracy': accuracy,
    'predictions': y_pred_tb,
    'precision': y_pred_tb,
    'recall': y_pred_tb,
    'true_labels': y_test_tb_bow
}

print('TextBlob Results:')
print('F1:', f1)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)

TextBlob Results:
F1: 0.6653445711452652
Accuracy: 0.4985141158989599
Precision: 0.4992559523809524
Recall: 0.9970282317979198
