# Data Preparation

In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
import pandas as pd

# Load the raw review dataset from a path relative to the notebook's working directory.
# NOTE(review): the head() output shows an 'Unnamed: 0' column, so the CSV presumably
# carries a saved index column — confirm whether it should be read with index_col=0.
dataset = pd.read_csv('data/gamis1.csv')


In [3]:
# Rating cut-offs used for labelling: ratings at or above the first value count as
# positive, ratings at or below the second count as negative.
positive_threshold, negative_threshold = 4, 2

# Add an empty string column that will receive the sentiment label of each review.
dataset['sentimen'] = ''


In [4]:
# Assign sentiment labels from the rating column using vectorised boolean masks
# instead of a Python-level iterrows() loop (which is slow and relies on unique
# index labels for the .at[] writes).

# Start from the fallback label: ratings strictly between the two thresholds keep
# 'netral', as do missing ratings, because both comparisons below are False for NaN.
dataset['sentimen'] = 'netral'

# Ratings at or below the negative threshold.
dataset.loc[dataset['rating'] <= negative_threshold, 'sentimen'] = 'negatif'

# Applied last so that 'positif' wins if the thresholds ever overlap, which matches
# the priority of the original if/elif chain.
dataset.loc[dataset['rating'] >= positive_threshold, 'sentimen'] = 'positif'

In [5]:
# Save the labelled dataset to a CSV file in the current working directory.
# index=False keeps the DataFrame index out of the file.
dataset.to_csv('labeled_ds.csv', index=False)

In [6]:
# Preview the first rows to sanity-check the new 'sentimen' column.
dataset.head()

Unnamed: 0,review,rating,sentimen
0,makasih sudah sampai paketan nya ...mas kurir ...,5,positif
1,"Barang nya cpt nympe nya,bgs.\nmksh lazada.",5,positif
2,bahannya bagus. baju nya pas di pakai 👍,5,positif
3,alhmdllh brng udh nympe...alhmdllh jg bju ssua...,5,positif
4,bajunya cantik walaupun warnanya aga' beda den...,5,positif


# Text Preprocessing

In [7]:
# Install the Sastrawi package (Indonesian stemmer / stop-word list) used below.
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH, which may not
# be the kernel's environment; consider `%pip install` with a pinned version.
!pip install sastrawi
import re
import nltk
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory



In [8]:
# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases load
# the 'punkt' package; recent releases (3.9+) load 'punkt_tab' instead and raise a
# LookupError if only 'punkt' is present. Download both so the notebook runs on
# either; on an NLTK version that does not know 'punkt_tab' the second call just
# reports the package as missing and returns False.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# Build the Sastrawi resources once and reuse them for every review, instead of
# re-creating the stemmer and the stop-word list on each call.
_SASTRAWI_STEMMER = StemmerFactory().create_stemmer()
# A frozenset gives constant-time membership tests for the stop-word filter.
_SASTRAWI_STOPWORDS = frozenset(StopWordRemoverFactory().get_stop_words())


def normalize_review(review):
    """Clean, tokenize, stop-word-filter and stem a single review.

    Steps:
      1. Replace every character that is not an ASCII letter with a space.
      2. Lower-case the text.
      3. Tokenize with NLTK's ``word_tokenize``.
      4. Drop tokens found in the Sastrawi stop-word list.
      5. Stem the remaining tokens with the Sastrawi stemmer.

    Args:
        review: The raw review. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as missing.

    Returns:
        str: The normalized tokens joined by single spaces. An empty string is
        returned for missing reviews and for reviews with no remaining tokens.
    """
    # Missing values would otherwise be stringified to "None"/"nan" and survive
    # the pipeline as spurious words. NaN is the only float unequal to itself.
    if review is None or (isinstance(review, float) and review != review):
        return ''

    # Keep ASCII letters only, then lower-case.
    letters_only = re.sub('[^a-zA-Z]', ' ', str(review)).lower()

    tokens = word_tokenize(letters_only)

    # Stop words are filtered before stemming, so the check is on the surface form.
    return ' '.join(
        _SASTRAWI_STEMMER.stem(token)
        for token in tokens
        if token not in _SASTRAWI_STOPWORDS
    )

In [None]:
# Replace each raw review with its normalized form by applying normalize_review row by row.
# NOTE(review): this overwrites the 'review' column in place, so re-running the cell
# normalizes text that has already been normalized.
dataset['review'] = dataset['review'].apply(normalize_review)

In [None]:
# Preview the first rows after normalization.
dataset.head()

In [None]:
# Save the preprocessed dataset (normalized reviews plus sentiment labels) to the data/ folder.
# index=False keeps the DataFrame index out of the file.
# NOTE(review): reviews that normalize to an empty string will presumably be read
# back as NaN by pd.read_csv — verify before reloading this file for training.
dataset.to_csv('data/preprocessed.csv', index=False)


# Building Model

## Split dataset

In [None]:
from sklearn.model_selection import train_test_split

# Model input is the normalized review text; the target is the sentiment label.
X, y = dataset['review'], dataset['sentimen']

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from gensim.models import Word2Vec

# Word2Vec expects an iterable of *token lists*. X_train holds one space-joined
# string per review, and iterating a string yields single characters, so passing
# X_train directly would train character embeddings rather than word embeddings.
# Split each normalized review back into its word tokens first.
train_token_lists = [review.split() for review in X_train]

# Train a Word2Vec model on the training corpus only.
embedding_dim = 100  # Dimensionality of the word vectors; also used by the Keras Embedding layer
embedding_model = Word2Vec(
    sentences=train_token_lists,
    vector_size=embedding_dim,
    window=5,
    min_count=1,
    workers=4,
)

# Keyed vectors: word -> embedding lookup for the learned vocabulary.
# NOTE(review): the Keras Embedding layer built later in this notebook is not
# initialised from these vectors — confirm whether that is intended.
word_vectors = embedding_model.wv


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Learn the word -> integer index vocabulary from the training reviews only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Encode both splits as lists of integer word indices using that vocabulary.
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Bring every sequence to a common length, padding at the end ('post').
max_seq_length = 100  # Adjust the sequence length based on your data
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_seq_length, padding='post')
    for sequences in (X_train_sequences, X_test_sequences)
)

# Encode the string sentiment labels as integer class ids.
sentiment_mapping = {'negatif': 0, 'netral': 1, 'positif': 2}
y_train_numerical = list(map(sentiment_mapping.__getitem__, y_train))
y_test_numerical = list(map(sentiment_mapping.__getitem__, y_test))


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Sentiment classifier: trainable embedding -> single LSTM layer -> 3-way softmax.
model = Sequential([
    # Vocabulary size is the tokenizer's word count plus one extra index.
    Embedding(len(tokenizer.word_index) + 1, embedding_dim, input_length=max_seq_length),
    LSTM(64),
    # One output unit per sentiment class (negatif / netral / positif).
    Dense(3, activation='softmax'),
])

# Sparse categorical cross-entropy matches the integer-encoded labels.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Make sure features and labels are numpy arrays before handing them to Keras.
X_train_padded, X_test_padded = np.array(X_train_padded), np.array(X_test_padded)
y_train_numerical, y_test_numerical = np.array(y_train_numerical), np.array(y_test_numerical)

# Training hyper-parameters.
batch_size = 64
epochs = 10

# Fit the model.
# NOTE(review): the test split doubles as the validation set here, so the
# validation curves below are not an independent held-out estimate.
history = model.fit(
    X_train_padded,
    y_train_numerical,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test_padded, y_test_numerical),
)

# Show which metrics were recorded per epoch.
print(history.history.keys())

# One figure per metric: training curve vs. validation curve across epochs.
for train_key, val_key, title, y_label in (
    ('accuracy', 'val_accuracy', 'Training and Validation Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Training and Validation Loss', 'Loss'),
):
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend(['Training', 'Validation'])
    plt.show()


In [None]:
# sentiment_mapping maps label -> class index ('negatif' -> 0, ...). Indexing it
# with a predicted class index raises KeyError, so build the inverse lookup under
# a separate name and leave sentiment_mapping intact for the cells that encode labels.
index_to_sentiment = {index: label for label, index in sentiment_mapping.items()}

# Predict class probabilities for the training data and take the most likely class.
train_predictions = model.predict(X_train_padded)
train_predicted_labels = [index_to_sentiment[int(np.argmax(pred))] for pred in train_predictions]

# Convert the integer-encoded true labels back to sentiment names.
y_train_labels = [index_to_sentiment[int(label)] for label in y_train_numerical]

# Accuracy = fraction of training reviews whose predicted label matches the true label.
train_accuracy = np.mean(np.array(train_predicted_labels) == np.array(y_train_labels))
print("Training Accuracy:", train_accuracy)


In [None]:
# Save the trained model to the current working directory.
# NOTE(review): the ".h5" suffix presumably selects Keras' legacy HDF5 format —
# confirm which format downstream loaders expect. The fitted tokenizer and
# sentiment_mapping are not saved in this cell, yet both are needed to run the
# model on new text.
model.save("sentiment_analysis_model.h5")
