### Load Dataset

In [None]:
import pandas as pd
import numpy as np
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import re
from sklearn.preprocessing import MinMaxScaler
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [None]:
df_review = pd.read_csv('/content/udemy_reviews.csv')

In [None]:
df_review

Unnamed: 0.1,Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,0,8044d915-fea5-477b-8723-62a2b18d45f2,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,All okay,5,0,9.41.2,2024-11-19 06:59:02,,,9.41.2
1,1,6721bb8a-f159-4797-9af6-08ee8d91cbbd,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Awesome app,5,0,9.42.1,2024-11-19 02:10:06,,,9.42.1
2,2,f699fcaa-0765-4301-9774-c34a3fd5f70e,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,great,5,0,9.42.1,2024-11-18 23:00:06,,,9.42.1
3,3,4387a7ed-e3d3-40da-9fd0-f74298bde24c,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"The app has worked well before, but right now,...",3,0,,2024-11-18 22:56:35,"Hi there,\n\nThanks for reaching out.\n\nPleas...",2024-11-18 23:14:27,
4,4,d02711d6-cd30-4153-9728-f3c2590a854b,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,interesting app easy to learn,5,0,9.42.0,2024-11-18 19:16:20,,,9.42.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6995,6995,edda15ad-94da-4609-8c8c-e74a0c2230b8,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"I'm finding it difficult to create an account,...",1,0,9.7.0,2023-05-16 21:51:10,Hello!\nThanks for sharing this.\nCan you plea...,2023-05-16 22:16:17,9.7.0
6996,6996,05f00450-033e-40ba-bde8-d7757c233ff4,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Please take a proper steps to casting to Andor...,1,0,9.7.0,2023-05-16 20:01:14,Hi Thiyagaraaj. Thanks for reaching out. Pleas...,2023-05-16 20:07:00,9.7.0
6997,6997,d5cca2f7-3d00-47c3-83da-d7c9bbba377d,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Nice course,4,0,,2023-05-16 17:30:51,,,
6998,6998,a02de8ed-7f88-41ea-8019-f22c60159c47,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,enjoying the app,5,0,9.7.0,2023-05-16 16:33:30,,,9.7.0


### Data Labeling

In [None]:
# download vader lexicon
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
# keep only necessary columns
data_cleaned = df_review[['content', 'score']].copy()

# drop empty rows
data_cleaned = data_cleaned.dropna(subset=['content'])

In [None]:
# define vader
sia = SentimentIntensityAnalyzer()

# text labeling
def get_sentiment_label(text):
    sentiment = sia.polarity_scores(text)
    compound = sentiment['compound']
    if compound >= 0.05:
        return 'Positive'
    elif compound <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# apply text labeling to the data
data_cleaned['sentiment'] = data_cleaned['content'].apply(get_sentiment_label)

# save it to csv as labeled data
data_cleaned.to_csv('labeled_sentiment.csv', index=False)

print(data_cleaned)

                                                content  score sentiment
0                                              All okay      5  Positive
1                                           Awesome app      5  Positive
2                                                 great      5  Positive
3     The app has worked well before, but right now,...      3  Negative
4                         interesting app easy to learn      5  Positive
...                                                 ...    ...       ...
6995  I'm finding it difficult to create an account,...      1  Negative
6996  Please take a proper steps to casting to Andor...      1  Positive
6997                                        Nice course      4  Positive
6998                                   enjoying the app      5  Positive
6999               very slow specially when downloading      3   Neutral

[7000 rows x 3 columns]


### Data Preprocessing

In [None]:
# download nltk features
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# data cleaning
def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # remove link
    text = re.sub(r'\W|[\d]', ' ', text) # remove special character and number
    text = text.lower() # lowercasing
    text = re.sub(r'\s+', ' ', text).strip() # remove extra space
    return text

# apply data cleaning
data_cleaned['cleaned_content'] = data_cleaned['content'].apply(clean_text)

# remove stopword
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = text.split()
    # lemmatization and stopword removal
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

data_cleaned['preprocessed_content'] = data_cleaned['cleaned_content'].apply(preprocess_text)

In [None]:
data_cleaned

Unnamed: 0,content,score,sentiment,cleaned_content,preprocessed_content
0,All okay,5,Positive,all okay,okay
1,Awesome app,5,Positive,awesome app,awesome app
2,great,5,Positive,great,great
3,"The app has worked well before, but right now,...",3,Negative,the app has worked well before but right now n...,app worked well right november something gone ...
4,interesting app easy to learn,5,Positive,interesting app easy to learn,interesting app easy learn
...,...,...,...,...,...
6995,"I'm finding it difficult to create an account,...",1,Negative,i m finding it difficult to create an account ...,finding difficult create account kept giving e...
6996,Please take a proper steps to casting to Andor...,1,Positive,please take a proper steps to casting to andor...,please take proper step casting andorid tv wor...
6997,Nice course,4,Positive,nice course,nice course
6998,enjoying the app,5,Positive,enjoying the app,enjoying app


In [None]:
# drop unnecessary columns
data_fix = data_cleaned[['preprocessed_content', 'sentiment']].copy()

In [None]:
# checking frequency of each value
data_fix['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Positive,4437
Negative,1451
Neutral,1112


In [29]:
# text vectorization
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(data_fix['preprocessed_content']).toarray()

# determine label
y = data_fix['sentiment']

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

### Data Modeling

In [30]:
# modeling Machine Learning

# define the models
models = {
    "Logistic Regression": LogisticRegression(class_weight="balanced", max_iter=100),
    "Random Forest": RandomForestClassifier(class_weight="balanced", n_estimators=10, random_state=35),
    "SVM": SVC(kernel='linear', probability=True, random_state=36),
}

# train and evaluation
results = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_report = classification_report(y_train, y_train_pred, target_names=['Negative', 'Neutral', 'Positive'])

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_report = classification_report(y_test, y_test_pred, target_names=['Negative', 'Neutral', 'Positive'])

    results[model_name] = {
        "Train Accuracy": train_accuracy,
        "Test Accuracy": test_accuracy,
        "Train Report": train_report,
        "Test Report": test_report
    }

# results display
for model_name, result in results.items():
    print(f"\nModel: {model_name}")
    print(f"Train Accuracy: {result['Train Accuracy']:.2f}")
    print(f"Test Accuracy: {result['Test Accuracy']:.2f}")
    print(f"Train Classification Report:\n{result['Train Report']}")
    print(f"Test Classification Report:\n{result['Test Report']}")

Training Logistic Regression...
Training Random Forest...
Training SVM...

Model: Logistic Regression
Train Accuracy: 0.87
Test Accuracy: 0.82
Train Classification Report:
              precision    recall  f1-score   support

    Negative       0.74      0.90      0.82      1241
     Neutral       0.72      0.92      0.81       951
    Positive       0.98      0.84      0.90      3758

    accuracy                           0.87      5950
   macro avg       0.81      0.89      0.84      5950
weighted avg       0.89      0.87      0.87      5950

Test Classification Report:
              precision    recall  f1-score   support

    Negative       0.66      0.79      0.72       210
     Neutral       0.61      0.83      0.71       161
    Positive       0.96      0.82      0.89       679

    accuracy                           0.82      1050
   macro avg       0.75      0.81      0.77      1050
weighted avg       0.85      0.82      0.83      1050


Model: Random Forest
Train Accuracy: 

Since the accuracy in validation data is not reaching 85% for each model that we use, then we will try to use LSTM instead

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import Callback
import keras

In [None]:
# set hyperparameters
max_words = 10000
max_len = 100
embedding_dim = 100
lstm_units = 36

texts = data_fix['preprocessed_content']

# tokenization of the text
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# pad the sequences so they all have the same length
x = pad_sequences(sequences, maxlen=max_len)

y = data_fix['sentiment']

# train-test split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)

# encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# one-hot encode the labels for training and testing
y_train = keras.utils.to_categorical(y_train_encoded, num_classes=3)
y_test = keras.utils.to_categorical(y_test_encoded, num_classes=3)

# build model
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(Bidirectional(LSTM(lstm_units)))
model.add(Dropout(0.3))
model.add(Dense(3, activation='softmax'))

# compile model
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])



In [None]:
# define callback early stop
class AccuracyThreshold(Callback):
    def __init__(self, threshold=0.85):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        val_accuracy = logs.get('val_accuracy')
        if val_accuracy >= self.threshold:
            print(f"\nValidation accuracy reached {self.threshold * 100}%, stopping training.")
            self.model.stop_training = True

accuracy_threshold = AccuracyThreshold(threshold=0.85)

# model fit
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test), callbacks=[accuracy_threshold])

# model evaluation
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')

Epoch 1/10
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 151ms/step - accuracy: 0.6496 - loss: 0.8303 - val_accuracy: 0.7943 - val_loss: 0.5046
Epoch 2/10
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 115ms/step - accuracy: 0.8329 - loss: 0.4324 - val_accuracy: 0.8210 - val_loss: 0.4601
Epoch 3/10
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 0.9077 - loss: 0.2836
Validation accuracy reached 85.0%, stopping training.
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 104ms/step - accuracy: 0.9078 - loss: 0.2835 - val_accuracy: 0.8543 - val_loss: 0.4402
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.8670 - loss: 0.3930
Test Loss: 0.4402, Test Accuracy: 0.8543


In [None]:
# save the model
model.save('trained_model.keras')

In [None]:
# preprocessing test data
sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=max_len)

# test the model using test data
x_test_sample = x_test[:5]
y_test_sample = y_test[:5]

# make prediction
predictions = model.predict(x_test_sample)

# convert class probability into labels
predicted_classes = predictions.argmax(axis=1)

# convert class to actual label
predicted_labels = label_encoder.inverse_transform(predicted_classes)

print("Predicted Labels:", predicted_labels)
true_labels = label_encoder.inverse_transform(y_test_sample.argmax(axis=1))
print("True Labels:", true_labels)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 351ms/step
Predicted Labels: ['Positive' 'Positive' 'Positive' 'Negative' 'Positive']
True Labels: ['Positive' 'Neutral' 'Positive' 'Positive' 'Positive']


From 5 total samples, we got 3 of them right