# **Import Library**

In [3]:
from nltk.corpus import stopwords # Daftar kata-kata berhenti dalam teks
from nltk.stem import PorterStemmer # Stemmer yang digunakan adalah Porter Stemmer.
from nltk.tokenize import word_tokenize # Tokenisasi teks
from sklearn.ensemble import RandomForestClassifier # Algoritma Random Forest untuk klasifikasi
from sklearn.feature_extraction.text import TfidfVectorizer # Konversi teks menjadi vektor TF-IDF
from sklearn.linear_model import LogisticRegression # Algoritma Logistic Regression untuk klasifikasi
from sklearn.metrics import accuracy_score # Metrik akurasi
from sklearn.model_selection import train_test_split # Membagi data menjadi set pelatihan dan pengujian
from sklearn.naive_bayes import BernoulliNB # Algoritma Naive Bayes untuk klasifikasi
from sklearn.tree import DecisionTreeClassifier # Algoritma Decision Tree untuk klasifikasi
from textblob import TextBlob # Pustaka untuk pemrosesan bahasa alami
from wordcloud import WordCloud # Membuat visualisasi berbentuk awan kata (word cloud) dari teks
import matplotlib.pyplot as plt # Matplotlib untuk visualisasi data
import nltk # Import pustaka NLTK (Natural Language Toolkit).
import numpy as np # NumPy untuk komputasi numerik
import pandas as pd # Pandas untuk manipulasi dan analisis data
import re # Modul untuk bekerja dengan ekspresi reguler
import seaborn as sns # Seaborn untuk visualisasi data statistik, mengatur gaya visualisasi
import string # Berisi konstanta string, seperti tanda baca
nltk.download('punkt')  # Mengunduh dataset yang diperlukan untuk tokenisasi teks.
nltk.download('stopwords')  # Mengunduh dataset yang berisi daftar kata-kata berhenti (stop words) dalam berbagai bahasa.
np.random.seed(0) # Mengatur seed untuk reproduktibilitas
pd.options.mode.chained_assignment = None # Menonaktifkan peringatan chaining

[nltk_data] Downloading package punkt to /home/bima/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/bima/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Loading Dataset**

In [4]:
balanced_df = pd.read_csv('balanced_reviews.csv') # Membaca data dari file CSV

## Preprocessing

In [5]:
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # remove mentions
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # remove hashtag
    text = re.sub(r'RT[\s]', '', text) # remove RT
    text = re.sub(r"http\S+", '', text) # remove link
    text = re.sub(r'[0-9]+', '', text) # remove numbers
    text = re.sub(r'[^\w\s]', '', text) # remove numbers
    text = text.replace('\n', ' ') # replace new line into space
    text = text.translate(str.maketrans('', '', string.punctuation)) # remove all punctuations
    text = text.strip(' ') # remove characters space from both left and right text
    return text

def casefoldingText(text): # Converting all the characters in a text into lower case
    text = text.lower()
    return text

def tokenizingText(text): # Tokenizing or splitting a string, text into a list of tokens
    text = word_tokenize(text)
    return text

def filteringText(text): # Remove stopwors in a text
    listStopwords = set(stopwords.words('english'))
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

def stemmingText(text): # Reducing a word to its word stem that affixes to suffixes and prefixes or to the roots of words
    # Membuat objek stemmer
    stemmer = PorterStemmer()

    # Menerapkan stemming pada setiap kata dalam daftar
    stemmed_words = [stemmer.stem(word) for word in text]

    return stemmed_words

def toSentence(list_words): # Convert list of words into sentence
    sentence = ' '.join(word for word in list_words)
    return sentence

# **Skema 1: Logistic Regression + CountVectorizer**

## Feature Extraction (CountVectorizer)

In [6]:
balanced_df.head(1)

Unnamed: 0,content,text_clean,text_casefoldingText,text_tokenizingText,text_stopword,text_stemmingText,text_akhir,polarity,label
0,Game is very fun and helps you learn about pla...,Game is very fun and helps you learn about pla...,game is very fun and helps you learn about pla...,"['game', 'is', 'very', 'fun', 'and', 'helps', ...","['game', 'fun', 'helps', 'learn', 'playing', '...","['game', 'fun', 'help', 'learn', 'play', 'game...",game fun help learn play game real rig opinion...,0.088889,pos


In [7]:
# Create a mapping from label strings to integers
label_mapping = {'neg': 0, 'neutral': 1, 'pos': 2}
balanced_df['label'] = balanced_df['label'].map(label_mapping)

In [8]:
balanced_df.value_counts('label')

label
0    8000
1    8000
2    8000
Name: count, dtype: int64

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
balanced_df['text_akhir'] = balanced_df['text_akhir'].fillna('')

In [12]:
# Membuat objek CountVectorizer
count_vectorizer = CountVectorizer()

# Mengubah teks menjadi vektor
X = count_vectorizer.fit_transform(balanced_df['text_akhir'])

In [13]:
# Konversi hasil ekstraksi fitur menjadi dataframe
features_df = pd.DataFrame(X.toarray(), columns=count_vectorizer.get_feature_names_out())

# Menampilkan hasil ekstraksi fitur
features_df

Unnamed: 0,aa,aaa,aaaaa,aaaaaaaand,aaaaaargh,aaaaand,aaaahaaaa,aaaand,aaaargh,aaand,...,𝚜𝚞𝚙𝚎𝚛𝚌𝚘𝚗𝚎𝚌𝚝𝚎𝚍,𝚝𝚑𝚊𝚝𝚜,𝚝𝚑𝚎,𝚝𝚑𝚒𝚜,𝚝𝚘,𝚠𝚊𝚜,𝚠𝚑𝚒𝚕𝚎,𝚠𝚒𝚕𝚕,𝚢𝚘𝚞,𝚢𝚘𝚞𝚛
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Data Splitting (7:3)

In [14]:
# Bagi data menjadi data latih dan data uji (70% data latih, 30% data uji)
X_train, X_test, y_train, y_test = train_test_split(X, balanced_df['label'], test_size=0.3, random_state=0)

# Menampilkan dimensi data latih dan data uji
print("Dimensi data latih:", X_train.shape, y_train.shape)
print("Dimensi data uji:", X_test.shape, y_test.shape)

Dimensi data latih: (16800, 16407) (16800,)
Dimensi data uji: (7200, 16407) (7200,)


## Model Training

In [15]:
%%time
# Membuat objek model Logistic Regression
logistic_regression = LogisticRegression()

# Melatih model Logistic Regression pada data pelatihan
logistic_regression.fit(X_train.toarray(), y_train)

# Prediksi sentimen pada data pelatihan dan data uji
y_pred_train_lr = logistic_regression.predict(X_train.toarray())
y_pred_test_lr = logistic_regression.predict(X_test.toarray())

# Evaluasi akurasi model Logistic Regression pada data pelatihan
accuracy_train_lr = accuracy_score(y_pred_train_lr, y_train)

# Evaluasi akurasi model Logistic Regression pada data uji
accuracy_test_lr = accuracy_score(y_pred_test_lr, y_test)

# Menampilkan akurasi
print('Logistic Regression - accuracy_train:', accuracy_train_lr)
print('Logistic Regression - accuracy_test:', accuracy_test_lr)

Logistic Regression - accuracy_train: 0.9844642857142857
Logistic Regression - accuracy_test: 0.9098611111111111
CPU times: user 3min 12s, sys: 28.7 s, total: 3min 41s
Wall time: 1min 7s


## Review Prediction

In [20]:
def review_predict(text):
  # Melakukan preprocessing pada kalimat baru
  kalimat_baru_cleaned = cleaningText(text)
  kalimat_baru_casefolded = casefoldingText(kalimat_baru_cleaned)
  kalimat_baru_tokenized = tokenizingText(kalimat_baru_casefolded)
  kalimat_baru_filtered = filteringText(kalimat_baru_tokenized)
  kalimat_baru_stemmed = stemmingText(kalimat_baru_filtered)
  kalimat_baru_final = toSentence(kalimat_baru_stemmed)

  # Menggunakan objek cv yang sudah di-fit dari pelatihan sebelumnya
  X_kalimat_baru = count_vectorizer.transform([kalimat_baru_final])

  # Memperoleh prediksi sentimen kalimat baru
  prediksi_sentimen_logistic_regression = logistic_regression.predict(X_kalimat_baru)

  # Menampilkan hasil prediksi
  print(prediksi_sentimen_logistic_regression)
  if prediksi_sentimen_logistic_regression == 0:
    print("Negative")
  elif prediksi_sentimen_logistic_regression == 1:
    print("Neutral")
  else:
    print("Positive")

In [21]:
kalimat_baru = '''One of the worst games I have ever played. It's like it's not even trying to hide the fact you play against AI. How I know this is that they always go for solids, never stripes, and when aiming have quick and spontaneous movements. Even when stripes has an easier ball to hit they still go for solids and 9 times out of 10 they don't have a profile picture. Another thing is after doing a spin in puts up the premium spin, I don't want or care about the premium spin so stop bringing it up.'''
review_predict(kalimat_baru)

[0]
Negative
